author     Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>  2008-02-17 21:51:42 -0500
committer  Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>  2008-02-17 21:51:42 -0500
commit     c58310bf4933986513020fa90b4190c7492995ae (patch)
tree       143f2c7578d02ebef5db8fc57ae69e951ae0e2ee /kernel
parent     269cdfaf769f5cd831284cc831790c7c5038040f (diff)
parent     1309d4e68497184d2fd87e892ddf14076c2bda98 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/Makefile | 18
-rw-r--r--  kernel/audit.c | 12
-rw-r--r--  kernel/audit_tree.c | 28
-rw-r--r--  kernel/auditfilter.c | 15
-rw-r--r--  kernel/auditsc.c | 28
-rw-r--r--  kernel/capability.c | 113
-rw-r--r--  kernel/cgroup.c | 318
-rw-r--r--  kernel/compat.c | 43
-rw-r--r--  kernel/cpu.c | 2
-rw-r--r--  kernel/cpuset.c | 413
-rw-r--r--  kernel/exit.c | 361
-rw-r--r--  kernel/fork.c | 65
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/futex_compat.c | 2
-rw-r--r--  kernel/hrtimer.c | 112
-rw-r--r--  kernel/irq/chip.c | 46
-rw-r--r--  kernel/irq/handle.c | 4
-rw-r--r--  kernel/itimer.c | 2
-rw-r--r--  kernel/kallsyms.c | 11
-rw-r--r--  kernel/kexec.c | 18
-rw-r--r--  kernel/kmod.c | 5
-rw-r--r--  kernel/kprobes.c | 9
-rw-r--r--  kernel/latency.c | 280
-rw-r--r--  kernel/marker.c | 677
-rw-r--r--  kernel/module.c | 38
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex.c | 29
-rw-r--r--  kernel/notifier.c | 1
-rw-r--r--  kernel/nsproxy.c | 1
-rw-r--r--  kernel/panic.c | 5
-rw-r--r--  kernel/params.c | 22
-rw-r--r--  kernel/pid.c | 209
-rw-r--r--  kernel/pid_namespace.c | 197
-rw-r--r--  kernel/pm_qos_params.c | 425
-rw-r--r--  kernel/posix-cpu-timers.c | 8
-rw-r--r--  kernel/posix-timers.c | 36
-rw-r--r--  kernel/power/Kconfig | 9
-rw-r--r--  kernel/power/disk.c | 4
-rw-r--r--  kernel/power/snapshot.c | 4
-rw-r--r--  kernel/printk.c | 52
-rw-r--r--  kernel/profile.c | 1
-rw-r--r--  kernel/ptrace.c | 22
-rw-r--r--  kernel/rcupdate.c | 5
-rw-r--r--  kernel/relay.c | 24
-rw-r--r--  kernel/res_counter.c | 134
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/rtmutex-debug.c | 12
-rw-r--r--  kernel/rtmutex.c | 5
-rw-r--r--  kernel/rtmutex_common.h | 2
-rw-r--r--  kernel/sched.c | 510
-rw-r--r--  kernel/sched_rt.c | 102
-rw-r--r--  kernel/signal.c | 254
-rw-r--r--  kernel/softirq.c | 8
-rw-r--r--  kernel/srcu.c | 3
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sys.c | 94
-rw-r--r--  kernel/sys_ni.c | 7
-rw-r--r--  kernel/sysctl.c | 131
-rw-r--r--  kernel/sysctl_check.c | 158
-rw-r--r--  kernel/test_kprobes.c | 16
-rw-r--r--  kernel/time.c | 46
-rw-r--r--  kernel/time/clockevents.c | 13
-rw-r--r--  kernel/time/clocksource.c | 20
-rw-r--r--  kernel/time/ntp.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 6
-rw-r--r--  kernel/timeconst.pl | 402
-rw-r--r--  kernel/timer.c | 18
-rw-r--r--  kernel/user.c | 60
-rw-r--r--  kernel/user_namespace.c | 13
-rw-r--r--  kernel/wait.c | 26
-rw-r--r--  kernel/workqueue.c | 12
73 files changed, 3711 insertions, 2034 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index f2ab70073bd4..ab4f1090f437 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -3,3 +3,4 @@
 #
 config_data.h
 config_data.gz
+timeconst.h
diff --git a/kernel/Makefile b/kernel/Makefile
index db9af707ff5b..6c584c55a6e9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,12 +4,12 @@
 
 obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
	    exit.o itimer.o time.o softirq.o resource.o \
-	    sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
+	    sysctl.o capability.o ptrace.o timer.o user.o \
	    signal.o sys.o kmod.o workqueue.o pid.o \
	    rcupdate.o extable.o params.o posix-timers.o \
	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
-	    utsname.o notifier.o ksysfs.o
+	    hrtimer.o rwsem.o nsproxy.o srcu.o \
+	    notifier.o ksysfs.o pm_qos_params.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -42,7 +42,11 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_UTS_NS) += utsname.o
+obj-$(CONFIG_USER_NS) += user_namespace.o
+obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -87,3 +91,11 @@ quiet_cmd_ikconfiggz = IKCFG $@
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
	$(call if_changed,ikconfiggz)
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_timeconst = TIMEC $@
+      cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+targets += timeconst.h
+$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
+	$(call if_changed,timeconst)
diff --git a/kernel/audit.c b/kernel/audit.c
index c8555b180213..2eeea9a14240 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1312,26 +1312,26 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
-		      struct dentry *dentry, struct vfsmount *vfsmnt)
+		      struct path *path)
 {
-	char *p, *path;
+	char *p, *pathname;
 
 	if (prefix)
 		audit_log_format(ab, " %s", prefix);
 
 	/* We will allow 11 spaces for ' (deleted)' to be appended */
-	path = kmalloc(PATH_MAX+11, ab->gfp_mask);
-	if (!path) {
+	pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
+	if (!pathname) {
 		audit_log_format(ab, "<no memory>");
 		return;
 	}
-	p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
+	p = d_path(path, pathname, PATH_MAX+11);
 	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
 		/* FIXME: can we save some information here? */
 		audit_log_format(ab, "<too long>");
 	} else
 		audit_log_untrustedstring(ab, p);
-	kfree(path);
+	kfree(pathname);
 }
 
 /**
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f4fcf58f20f8..9ef5e0aacc3c 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -549,8 +549,8 @@ void audit_trim_trees(void)
 		if (err)
 			goto skip_it;
 
-		root_mnt = collect_mounts(nd.mnt, nd.dentry);
-		path_release(&nd);
+		root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
+		path_put(&nd.path);
 		if (!root_mnt)
 			goto skip_it;
 
@@ -583,17 +583,17 @@ skip_it:
 static int is_under(struct vfsmount *mnt, struct dentry *dentry,
 		    struct nameidata *nd)
 {
-	if (mnt != nd->mnt) {
+	if (mnt != nd->path.mnt) {
 		for (;;) {
 			if (mnt->mnt_parent == mnt)
 				return 0;
-			if (mnt->mnt_parent == nd->mnt)
+			if (mnt->mnt_parent == nd->path.mnt)
 				break;
 			mnt = mnt->mnt_parent;
 		}
 		dentry = mnt->mnt_mountpoint;
 	}
-	return is_subdir(dentry, nd->dentry);
+	return is_subdir(dentry, nd->path.dentry);
 }
 
 int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
@@ -641,8 +641,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
 	err = path_lookup(tree->pathname, 0, &nd);
 	if (err)
 		goto Err;
-	mnt = collect_mounts(nd.mnt, nd.dentry);
-	path_release(&nd);
+	mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
+	path_put(&nd.path);
 	if (!mnt) {
 		err = -ENOMEM;
 		goto Err;
@@ -701,8 +701,8 @@ int audit_tag_tree(char *old, char *new)
 	err = path_lookup(new, 0, &nd);
 	if (err)
 		return err;
-	tagged = collect_mounts(nd.mnt, nd.dentry);
-	path_release(&nd);
+	tagged = collect_mounts(nd.path.mnt, nd.path.dentry);
+	path_put(&nd.path);
 	if (!tagged)
 		return -ENOMEM;
 
@@ -711,9 +711,9 @@ int audit_tag_tree(char *old, char *new)
 		drop_collected_mounts(tagged);
 		return err;
 	}
-	mnt = mntget(nd.mnt);
-	dentry = dget(nd.dentry);
-	path_release(&nd);
+	mnt = mntget(nd.path.mnt);
+	dentry = dget(nd.path.dentry);
+	path_put(&nd.path);
 
 	if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
 		follow_up(&mnt, &dentry);
@@ -744,13 +744,13 @@ int audit_tag_tree(char *old, char *new)
 		spin_lock(&vfsmount_lock);
 		if (!is_under(mnt, dentry, &nd)) {
 			spin_unlock(&vfsmount_lock);
-			path_release(&nd);
+			path_put(&nd.path);
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
 			continue;
 		}
 		spin_unlock(&vfsmount_lock);
-		path_release(&nd);
+		path_put(&nd.path);
 
 		list_for_each_entry(p, &list, mnt_list) {
 			failed = tag_chunk(p->mnt_root->d_inode, tree);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6f19fd477aac..2f2914b7cc30 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -169,8 +169,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp)
 	inotify_init_watch(&parent->wdata);
 	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
 	get_inotify_watch(&parent->wdata);
-	wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
-			       AUDIT_IN_WATCH);
+	wd = inotify_add_watch(audit_ih, &parent->wdata,
+			       ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
 	if (wd < 0) {
 		audit_free_parent(&parent->wdata);
 		return ERR_PTR(wd);
@@ -1161,11 +1161,11 @@ static int audit_get_nd(char *path, struct nameidata **ndp,
 static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
 {
 	if (ndp) {
-		path_release(ndp);
+		path_put(&ndp->path);
 		kfree(ndp);
 	}
 	if (ndw) {
-		path_release(ndw);
+		path_put(&ndw->path);
 		kfree(ndw);
 	}
 }
@@ -1214,8 +1214,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
 
 	/* update watch filter fields */
 	if (ndw) {
-		watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
-		watch->ino = ndw->dentry->d_inode->i_ino;
+		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
+		watch->ino = ndw->path.dentry->d_inode->i_ino;
 	}
 
 	/* The audit_filter_mutex must not be held during inotify calls because
@@ -1225,7 +1225,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
 	 */
 	mutex_unlock(&audit_filter_mutex);
 
-	if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
+	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
+			       &i_watch) < 0) {
 		parent = audit_init_parent(ndp);
 		if (IS_ERR(parent)) {
 			/* caller expects mutex locked */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c06ecf38d7b..ac6d9b23b018 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -208,8 +208,7 @@ struct audit_context {
 	int name_count;
 	struct audit_names names[AUDIT_NAMES];
 	char * filterkey; /* key for rule that triggered record */
-	struct dentry * pwd;
-	struct vfsmount * pwdmnt;
+	struct path pwd;
 	struct audit_context *previous; /* For nested syscalls */
 	struct audit_aux_data *aux;
 	struct audit_aux_data *aux_pids;
@@ -786,12 +785,9 @@ static inline void audit_free_names(struct audit_context *context)
 		__putname(context->names[i].name);
 	}
 	context->name_count = 0;
-	if (context->pwd)
-		dput(context->pwd);
-	if (context->pwdmnt)
-		mntput(context->pwdmnt);
-	context->pwd = NULL;
-	context->pwdmnt = NULL;
+	path_put(&context->pwd);
+	context->pwd.dentry = NULL;
+	context->pwd.mnt = NULL;
 }
 
 static inline void audit_free_aux(struct audit_context *context)
@@ -930,8 +926,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
 		if ((vma->vm_flags & VM_EXECUTABLE) &&
 		    vma->vm_file) {
 			audit_log_d_path(ab, "exe=",
-					 vma->vm_file->f_path.dentry,
-					 vma->vm_file->f_path.mnt);
+					 &vma->vm_file->f_path);
 			break;
 		}
 		vma = vma->vm_next;
@@ -1341,10 +1336,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			  context->target_sid, context->target_comm))
 			call_panic = 1;
 
-	if (context->pwd && context->pwdmnt) {
+	if (context->pwd.dentry && context->pwd.mnt) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
 		if (ab) {
-			audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
+			audit_log_d_path(ab, "cwd=", &context->pwd);
 			audit_log_end(ab);
 		}
 	}
@@ -1367,8 +1362,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		case 0:
 			/* name was specified as a relative path and the
 			 * directory component is the cwd */
-			audit_log_d_path(ab, " name=", context->pwd,
-					 context->pwdmnt);
+			audit_log_d_path(ab, " name=", &context->pwd);
 			break;
 		default:
 			/* log the name's directory component */
@@ -1695,10 +1689,10 @@ void __audit_getname(const char *name)
 	context->names[context->name_count].ino = (unsigned long)-1;
 	context->names[context->name_count].osid = 0;
 	++context->name_count;
-	if (!context->pwd) {
+	if (!context->pwd.dentry) {
 		read_lock(&current->fs->lock);
-		context->pwd = dget(current->fs->pwd);
-		context->pwdmnt = mntget(current->fs->pwdmnt);
+		context->pwd = current->fs->pwd;
+		path_get(&current->fs->pwd);
 		read_unlock(&current->fs->lock);
 	}
 
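The audit hunks above are all part of the same conversion: call sites that used to carry a (dentry, vfsmount) pair now carry a single struct path, and the matching dget()/mntget() and dput()/mntput() pairs collapse into path_get()/path_put(). A condensed sketch of the resulting pattern, pieced together from the hunks above rather than meant to compile on its own:

    /* sketch only: condensed from the audit hunks above (2.6.25-era APIs) */
    struct path cwd;

    read_lock(&current->fs->lock);
    cwd = current->fs->pwd;          /* copy the {mnt, dentry} pair ... */
    path_get(&current->fs->pwd);     /* ... and take one reference on both halves */
    read_unlock(&current->fs->lock);

    audit_log_d_path(ab, "cwd=", &cwd);   /* new signature takes a struct path * */

    path_put(&cwd);                  /* drops the dentry and vfsmount refs together */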
diff --git a/kernel/capability.c b/kernel/capability.c
index efbd9cdce132..39e8193b41ea 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,6 +22,37 @@
 static DEFINE_SPINLOCK(task_capability_lock);
 
 /*
+ * Leveraged for setting/resetting capabilities
+ */
+
+const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
+const kernel_cap_t __cap_full_set = CAP_FULL_SET;
+const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
+
+EXPORT_SYMBOL(__cap_empty_set);
+EXPORT_SYMBOL(__cap_full_set);
+EXPORT_SYMBOL(__cap_init_eff_set);
+
+/*
+ * More recent versions of libcap are available from:
+ *
+ * http://www.kernel.org/pub/linux/libs/security/linux-privs/
+ */
+
+static void warn_legacy_capability_use(void)
+{
+	static int warned;
+	if (!warned) {
+		char name[sizeof(current->comm)];
+
+		printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
+		       " (legacy support in use)\n",
+		       get_task_comm(name, current));
+		warned = 1;
+	}
+}
+
+/*
  * For sys_getproccap() and sys_setproccap(), any of the three
  * capability set pointers may be NULL -- indicating that that set is
  * uninteresting and/or not to be changed.
@@ -42,12 +73,21 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 	pid_t pid;
 	__u32 version;
 	struct task_struct *target;
-	struct __user_cap_data_struct data;
+	unsigned tocopy;
+	kernel_cap_t pE, pI, pP;
 
 	if (get_user(version, &header->version))
 		return -EFAULT;
 
-	if (version != _LINUX_CAPABILITY_VERSION) {
+	switch (version) {
+	case _LINUX_CAPABILITY_VERSION_1:
+		warn_legacy_capability_use();
+		tocopy = _LINUX_CAPABILITY_U32S_1;
+		break;
+	case _LINUX_CAPABILITY_VERSION_2:
+		tocopy = _LINUX_CAPABILITY_U32S_2;
+		break;
+	default:
 		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
 			return -EFAULT;
 		return -EINVAL;
@@ -71,14 +111,47 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 	} else
 		target = current;
 
-	ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
+	ret = security_capget(target, &pE, &pI, &pP);
 
 out:
 	read_unlock(&tasklist_lock);
 	spin_unlock(&task_capability_lock);
 
-	if (!ret && copy_to_user(dataptr, &data, sizeof data))
-		return -EFAULT;
+	if (!ret) {
+		struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+		unsigned i;
+
+		for (i = 0; i < tocopy; i++) {
+			kdata[i].effective = pE.cap[i];
+			kdata[i].permitted = pP.cap[i];
+			kdata[i].inheritable = pI.cap[i];
+		}
+
+		/*
+		 * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S,
+		 * we silently drop the upper capabilities here. This
+		 * has the effect of making older libcap
+		 * implementations implicitly drop upper capability
+		 * bits when they perform a: capget/modify/capset
+		 * sequence.
+		 *
+		 * This behavior is considered fail-safe
+		 * behavior. Upgrading the application to a newer
+		 * version of libcap will enable access to the newer
+		 * capabilities.
+		 *
+		 * An alternative would be to return an error here
+		 * (-ERANGE), but that causes legacy applications to
+		 * unexpectidly fail; the capget/modify/capset aborts
+		 * before modification is attempted and the application
+		 * fails.
+		 */
+
+		if (copy_to_user(dataptr, kdata, tocopy
+				 * sizeof(struct __user_cap_data_struct))) {
+			return -EFAULT;
+		}
+	}
 
 	return ret;
 }
@@ -167,6 +240,8 @@ static inline int cap_set_all(kernel_cap_t *effective,
  */
 asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 {
+	struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
 	__u32 version;
 	struct task_struct *target;
@@ -176,7 +251,15 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (get_user(version, &header->version))
 		return -EFAULT;
 
-	if (version != _LINUX_CAPABILITY_VERSION) {
+	switch (version) {
+	case _LINUX_CAPABILITY_VERSION_1:
+		warn_legacy_capability_use();
+		tocopy = _LINUX_CAPABILITY_U32S_1;
+		break;
+	case _LINUX_CAPABILITY_VERSION_2:
+		tocopy = _LINUX_CAPABILITY_U32S_2;
+		break;
+	default:
 		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
 			return -EFAULT;
 		return -EINVAL;
@@ -188,10 +271,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
 		return -EPERM;
 
-	if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
-	    copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
-	    copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
+	if (copy_from_user(&kdata, data, tocopy
+			   * sizeof(struct __user_cap_data_struct))) {
 		return -EFAULT;
+	}
+
+	for (i = 0; i < tocopy; i++) {
+		effective.cap[i] = kdata[i].effective;
+		permitted.cap[i] = kdata[i].permitted;
+		inheritable.cap[i] = kdata[i].inheritable;
+	}
+	while (i < _LINUX_CAPABILITY_U32S) {
+		effective.cap[i] = 0;
+		permitted.cap[i] = 0;
+		inheritable.cap[i] = 0;
+		i++;
+	}
 
 	spin_lock(&task_capability_lock);
 	read_lock(&tasklist_lock);
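For reference, a minimal userspace sketch (not part of the patch) of what the new 64-bit capability ABI above looks like from the caller's side: a _LINUX_CAPABILITY_VERSION_2 caller exchanges an array of two __user_cap_data_struct words per set, which is what the tocopy loops marshal, while a legacy VERSION_1 caller supplies only one word and so silently loses the upper bits, as the long comment in sys_capget() explains. It assumes <linux/capability.h> and the raw capget(2) syscall; error handling is minimal.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/capability.h>

    int main(void)
    {
            struct __user_cap_header_struct hdr;
            struct __user_cap_data_struct data[2];   /* two 32-bit words per set */

            memset(&hdr, 0, sizeof(hdr));
            hdr.version = _LINUX_CAPABILITY_VERSION_2;   /* 64-bit aware caller */
            hdr.pid = 0;                                 /* current task */

            if (syscall(SYS_capget, &hdr, data) != 0) {
                    /* an older kernel rewrites hdr.version and fails with EINVAL,
                     * mirroring the "default:" branch in sys_capget() above */
                    perror("capget");
                    return 1;
            }
            printf("effective: low=%#x high=%#x\n",
                   data[0].effective, data[1].effective);
            return 0;
    }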
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1a3c23936d43..4766bb65e4d9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -141,7 +141,7 @@ enum {
 	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
 };
 
-inline int cgroup_is_releasable(const struct cgroup *cgrp)
+static int cgroup_is_releasable(const struct cgroup *cgrp)
 {
 	const int bits =
 		(1 << CGRP_RELEASABLE) |
@@ -149,7 +149,7 @@ inline int cgroup_is_releasable(const struct cgroup *cgrp)
 	return (cgrp->flags & bits) == bits;
 }
 
-inline int notify_on_release(const struct cgroup *cgrp)
+static int notify_on_release(const struct cgroup *cgrp)
 {
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
@@ -489,7 +489,7 @@ static struct css_set *find_css_set(
  * Any task can increment and decrement the count field without lock.
  * So in general, code holding cgroup_mutex can't rely on the count
  * field not changing. However, if the count goes to zero, then only
- * attach_task() can increment it again. Because a count of zero
+ * cgroup_attach_task() can increment it again. Because a count of zero
  * means that no tasks are currently attached, therefore there is no
  * way a task attached to that cgroup can fork (the other way to
  * increment the count). So code holding cgroup_mutex can safely
@@ -520,17 +520,17 @@ static struct css_set *find_css_set(
  * The task_lock() exception
  *
  * The need for this exception arises from the action of
- * attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
  * another. It does so using cgroup_mutexe, however there are
  * several performance critical places that need to reference
  * task->cgroup without the expense of grabbing a system global
  * mutex. Therefore except as noted below, when dereferencing or, as
- * in attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 * the task_struct routinely used for such matters.
 *
 * P.S. One more locking exception. RCU is used to guard the
- * update of a tasks cgroup pointer by attach_task()
+ * update of a tasks cgroup pointer by cgroup_attach_task()
 */
 
 /**
@@ -586,11 +586,27 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 	return inode;
 }
 
+/*
+ * Call subsys's pre_destroy handler.
+ * This is called before css refcnt check.
+ */
+
+static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+{
+	struct cgroup_subsys *ss;
+	for_each_subsys(cgrp->root, ss)
+		if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
+			ss->pre_destroy(ss, cgrp);
+	return;
+}
+
+
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cgrp = dentry->d_fsdata;
+		struct cgroup_subsys *ss;
 		BUG_ON(!(cgroup_is_removed(cgrp)));
 		/* It's possible for external users to be holding css
 		 * reference counts on a cgroup; css_put() needs to
@@ -599,6 +615,23 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 * queue the cgroup to be handled by the release
 		 * agent */
 		synchronize_rcu();
+
+		mutex_lock(&cgroup_mutex);
+		/*
+		 * Release the subsystem state objects.
+		 */
+		for_each_subsys(cgrp->root, ss) {
+			if (cgrp->subsys[ss->subsys_id])
+				ss->destroy(ss, cgrp);
+		}
+
+		cgrp->root->number_of_cgroups--;
+		mutex_unlock(&cgroup_mutex);
+
+		/* Drop the active superblock reference that we took when we
+		 * created the cgroup */
+		deactivate_super(cgrp->root->sb);
+
 		kfree(cgrp);
 	}
 	iput(inode);
@@ -1161,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp,
  * Call holding cgroup_mutex. May take task_lock of
  * the task 'pid' during call.
  */
-static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	int retval = 0;
 	struct cgroup_subsys *ss;
@@ -1181,9 +1214,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
 			retval = ss->can_attach(ss, cgrp, tsk);
-			if (retval) {
+			if (retval)
 				return retval;
-			}
 		}
 	}
 
@@ -1192,9 +1224,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 * based on its final set of cgroups
 	 */
 	newcg = find_css_set(cg, cgrp);
-	if (!newcg) {
+	if (!newcg)
 		return -ENOMEM;
-	}
 
 	task_lock(tsk);
 	if (tsk->flags & PF_EXITING) {
@@ -1214,9 +1245,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
-		if (ss->attach) {
+		if (ss->attach)
 			ss->attach(ss, cgrp, oldcgrp, tsk);
-		}
 	}
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
@@ -1239,7 +1269,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
 
 	if (pid) {
 		rcu_read_lock();
-		tsk = find_task_by_pid(pid);
+		tsk = find_task_by_vpid(pid);
 		if (!tsk || tsk->flags & PF_EXITING) {
 			rcu_read_unlock();
 			return -ESRCH;
@@ -1257,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
 		get_task_struct(tsk);
 	}
 
-	ret = attach_task(cgrp, tsk);
+	ret = cgroup_attach_task(cgrp, tsk);
 	put_task_struct(tsk);
 	return ret;
 }
@@ -1329,9 +1359,14 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
 		goto out1;
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
+	strstrip(buffer);	/* strip -just- trailing whitespace */
 
 	mutex_lock(&cgroup_mutex);
 
+	/*
+	 * This was already checked for in cgroup_file_write(), but
+	 * check again now we're holding cgroup_mutex.
+	 */
 	if (cgroup_is_removed(cgrp)) {
 		retval = -ENODEV;
 		goto out2;
@@ -1349,24 +1384,9 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
 		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 		break;
 	case FILE_RELEASE_AGENT:
-	{
-		struct cgroupfs_root *root = cgrp->root;
-		/* Strip trailing newline */
-		if (nbytes && (buffer[nbytes-1] == '\n')) {
-			buffer[nbytes-1] = 0;
-		}
-		if (nbytes < sizeof(root->release_agent_path)) {
-			/* We never write anything other than '\0'
-			 * into the last char of release_agent_path,
-			 * so it always remains a NUL-terminated
-			 * string */
-			strncpy(root->release_agent_path, buffer, nbytes);
-			root->release_agent_path[nbytes] = 0;
-		} else {
-			retval = -ENOSPC;
-		}
+		BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+		strcpy(cgrp->root->release_agent_path, buffer);
 		break;
-	}
 	default:
 		retval = -EINVAL;
 		goto out2;
@@ -1387,7 +1407,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-	if (!cft)
+	if (!cft || cgroup_is_removed(cgrp))
 		return -ENODEV;
 	if (cft->write)
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1457,7 +1477,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-	if (!cft)
+	if (!cft || cgroup_is_removed(cgrp))
 		return -ENODEV;
 
 	if (cft->read)
@@ -1675,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
 	it->task = cg->tasks.next;
 }
 
+/*
+ * To reduce the fork() overhead for systems that are not actually
+ * using their cgroups capability, we don't maintain the lists running
+ * through each css_set to its tasks until we see the list actually
+ * used - in other words after the first call to cgroup_iter_start().
+ *
+ * The tasklist_lock is not held here, as do_each_thread() and
+ * while_each_thread() are protected by RCU.
+ */
+void cgroup_enable_task_cg_lists(void)
+{
+	struct task_struct *p, *g;
+	write_lock(&css_set_lock);
+	use_task_css_set_links = 1;
+	do_each_thread(g, p) {
+		task_lock(p);
+		if (list_empty(&p->cg_list))
+			list_add(&p->cg_list, &p->cgroups->tasks);
+		task_unlock(p);
+	} while_each_thread(g, p);
+	write_unlock(&css_set_lock);
+}
+
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 {
 	/*
@@ -1682,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 	 * we need to enable the list linking each css_set to its
 	 * tasks, and fix up all existing tasks.
 	 */
-	if (!use_task_css_set_links) {
-		struct task_struct *p, *g;
-		write_lock(&css_set_lock);
-		use_task_css_set_links = 1;
-		do_each_thread(g, p) {
-			task_lock(p);
-			if (list_empty(&p->cg_list))
-				list_add(&p->cg_list, &p->cgroups->tasks);
-			task_unlock(p);
-		} while_each_thread(g, p);
-		write_unlock(&css_set_lock);
-	}
+	if (!use_task_css_set_links)
+		cgroup_enable_task_cg_lists();
+
 	read_lock(&css_set_lock);
 	it->cg_link = &cgrp->css_sets;
 	cgroup_advance_iter(cgrp, it);
@@ -1726,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
 	read_unlock(&css_set_lock);
 }
 
+static inline int started_after_time(struct task_struct *t1,
+				     struct timespec *time,
+				     struct task_struct *t2)
+{
+	int start_diff = timespec_compare(&t1->start_time, time);
+	if (start_diff > 0) {
+		return 1;
+	} else if (start_diff < 0) {
+		return 0;
+	} else {
+		/*
+		 * Arbitrarily, if two processes started at the same
+		 * time, we'll say that the lower pointer value
+		 * started first. Note that t2 may have exited by now
+		 * so this may not be a valid pointer any longer, but
+		 * that's fine - it still serves to distinguish
+		 * between two tasks started (effectively) simultaneously.
+		 */
+		return t1 > t2;
+	}
+}
+
+/*
+ * This function is a callback from heap_insert() and is used to order
+ * the heap.
+ * In this case we order the heap in descending task start time.
+ */
+static inline int started_after(void *p1, void *p2)
+{
+	struct task_struct *t1 = p1;
+	struct task_struct *t2 = p2;
+	return started_after_time(t1, &t2->start_time, t2);
+}
+
+/**
+ * cgroup_scan_tasks - iterate though all the tasks in a cgroup
+ * @scan: struct cgroup_scanner containing arguments for the scan
+ *
+ * Arguments include pointers to callback functions test_task() and
+ * process_task().
+ * Iterate through all the tasks in a cgroup, calling test_task() for each,
+ * and if it returns true, call process_task() for it also.
+ * The test_task pointer may be NULL, meaning always true (select all tasks).
+ * Effectively duplicates cgroup_iter_{start,next,end}()
+ * but does not lock css_set_lock for the call to process_task().
+ * The struct cgroup_scanner may be embedded in any structure of the caller's
+ * creation.
+ * It is guaranteed that process_task() will act on every task that
+ * is a member of the cgroup for the duration of this call. This
+ * function may or may not call process_task() for tasks that exit
+ * or move to a different cgroup during the call, or are forked or
+ * move into the cgroup during the call.
+ *
+ * Note that test_task() may be called with locks held, and may in some
+ * situations be called multiple times for the same task, so it should
+ * be cheap.
+ * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
+ * pre-allocated and will be used for heap operations (and its "gt" member will
+ * be overwritten), else a temporary heap will be used (allocation of which
+ * may cause this function to fail).
+ */
+int cgroup_scan_tasks(struct cgroup_scanner *scan)
+{
+	int retval, i;
+	struct cgroup_iter it;
+	struct task_struct *p, *dropped;
+	/* Never dereference latest_task, since it's not refcounted */
+	struct task_struct *latest_task = NULL;
+	struct ptr_heap tmp_heap;
+	struct ptr_heap *heap;
+	struct timespec latest_time = { 0, 0 };
+
+	if (scan->heap) {
+		/* The caller supplied our heap and pre-allocated its memory */
+		heap = scan->heap;
+		heap->gt = &started_after;
+	} else {
+		/* We need to allocate our own heap memory */
+		heap = &tmp_heap;
+		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+		if (retval)
+			/* cannot allocate the heap */
+			return retval;
+	}
+
+ again:
+	/*
+	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
+	 * to determine which are of interest, and using the scanner's
+	 * "process_task" callback to process any of them that need an update.
+	 * Since we don't want to hold any locks during the task updates,
+	 * gather tasks to be processed in a heap structure.
+	 * The heap is sorted by descending task start time.
+	 * If the statically-sized heap fills up, we overflow tasks that
+	 * started later, and in future iterations only consider tasks that
+	 * started after the latest task in the previous pass. This
+	 * guarantees forward progress and that we don't miss any tasks.
+	 */
+	heap->size = 0;
+	cgroup_iter_start(scan->cg, &it);
+	while ((p = cgroup_iter_next(scan->cg, &it))) {
+		/*
+		 * Only affect tasks that qualify per the caller's callback,
+		 * if he provided one
+		 */
+		if (scan->test_task && !scan->test_task(p, scan))
+			continue;
+		/*
+		 * Only process tasks that started after the last task
+		 * we processed
+		 */
+		if (!started_after_time(p, &latest_time, latest_task))
+			continue;
+		dropped = heap_insert(heap, p);
+		if (dropped == NULL) {
+			/*
+			 * The new task was inserted; the heap wasn't
+			 * previously full
+			 */
+			get_task_struct(p);
+		} else if (dropped != p) {
+			/*
+			 * The new task was inserted, and pushed out a
+			 * different task
+			 */
+			get_task_struct(p);
+			put_task_struct(dropped);
+		}
+		/*
+		 * Else the new task was newer than anything already in
+		 * the heap and wasn't inserted
+		 */
+	}
+	cgroup_iter_end(scan->cg, &it);
+
+	if (heap->size) {
+		for (i = 0; i < heap->size; i++) {
+			struct task_struct *p = heap->ptrs[i];
+			if (i == 0) {
+				latest_time = p->start_time;
+				latest_task = p;
+			}
+			/* Process the task per the caller's callback */
+			scan->process_task(p, scan);
+			put_task_struct(p);
+		}
+		/*
+		 * If we had to process any tasks at all, scan again
+		 * in case some of them were in the middle of forking
+		 * children that didn't get processed.
+		 * Not the most efficient way to do it, but it avoids
+		 * having to take callback_mutex in the fork path
+		 */
+		goto again;
+	}
+	if (heap == &tmp_heap)
+		heap_free(&tmp_heap);
+	return 0;
+}
+
 /*
  * Stuff for reading the 'tasks' file.
 *
@@ -1761,7 +1955,7 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
 	while ((tsk = cgroup_iter_next(cgrp, &it))) {
 		if (unlikely(n == npids))
 			break;
-		pidarray[n++] = task_pid_nr(tsk);
+		pidarray[n++] = task_pid_vnr(tsk);
 	}
 	cgroup_iter_end(cgrp, &it);
 	return n;
@@ -2126,9 +2320,8 @@ static inline int cgroup_has_css_refs(struct cgroup *cgrp)
 		 * matter, since it can only happen if the cgroup
 		 * has been deleted and hence no longer needs the
 		 * release agent to be called anyway. */
-		if (css && atomic_read(&css->refcnt)) {
+		if (css && atomic_read(&css->refcnt))
 			return 1;
-		}
 	}
 	return 0;
 }
@@ -2138,7 +2331,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup *cgrp = dentry->d_fsdata;
 	struct dentry *d;
 	struct cgroup *parent;
-	struct cgroup_subsys *ss;
 	struct super_block *sb;
 	struct cgroupfs_root *root;
 
@@ -2157,17 +2349,19 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	parent = cgrp->parent;
 	root = cgrp->root;
 	sb = root->sb;
+	/*
+	 * Call pre_destroy handlers of subsys
+	 */
+	cgroup_call_pre_destroy(cgrp);
+	/*
+	 * Notify subsyses that rmdir() request comes.
+	 */
 
 	if (cgroup_has_css_refs(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
 
-	for_each_subsys(root, ss) {
-		if (cgrp->subsys[ss->subsys_id])
-			ss->destroy(ss, cgrp);
-	}
-
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
@@ -2182,15 +2376,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	cgroup_d_remove_dir(d);
 	dput(d);
-	root->number_of_cgroups--;
 
 	set_bit(CGRP_RELEASABLE, &parent->flags);
 	check_for_release(parent);
 
 	mutex_unlock(&cgroup_mutex);
-	/* Drop the active superblock reference that we took when we
-	 * created the cgroup */
-	deactivate_super(sb);
 	return 0;
 }
 
@@ -2324,7 +2514,7 @@ out:
  * - Used for /proc/<pid>/cgroup.
 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
 *   doesn't really matter if tsk->cgroup changes after we read it,
- *   and we take cgroup_mutex, keeping attach_task() from changing it
+ *   and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
 *   anyway. No need to check that tsk->cgroup != NULL, thanks to
 *   the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
 *   cgroup to top_cgroup.
@@ -2435,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = {
 * A pointer to the shared css_set was automatically copied in
 * fork.c by dup_task_struct(). However, we ignore that copy, since
 * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. attach_task() might
+ * might no longer be a valid cgroup pointer. cgroup_attach_task() might
 * have already changed current->cgroups, allowing the previously
 * referenced cgroup group to be removed and freed.
 *
@@ -2514,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child)
 * attach us to a different cgroup, decrementing the count on
 * the first cgroup that we never incremented. But in this case,
 * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any attach_task() attempts, or task is a failed
- * fork, never visible to attach_task.
+ * which wards off any cgroup_attach_task() attempts, or task is a failed
+ * fork, never visible to cgroup_attach_task.
 *
 */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
@@ -2655,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 	}
 
 	/* All seems fine. Finish by moving the task into the new cgroup */
-	ret = attach_task(child, tsk);
+	ret = cgroup_attach_task(child, tsk);
 	mutex_unlock(&cgroup_mutex);
 
  out_release:
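As a rough userspace illustration of the interface these cgroup.c changes sit behind (not part of the patch): writing a PID into a cgroup's "tasks" file lands in attach_task_by_pid() and, after this change, cgroup_attach_task(); the PID is looked up with find_task_by_vpid() and read back via task_pid_vnr(), so both sides are interpreted in the caller's PID namespace. The mount point and group name below are illustrative assumptions only.

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* assumes a cgroup hierarchy already mounted at /dev/cgroup
             * with an existing group "mygroup" (illustrative paths only) */
            FILE *f = fopen("/dev/cgroup/mygroup/tasks", "w");

            if (!f) {
                    perror("open tasks file");
                    return 1;
            }
            /* resolved in-kernel via find_task_by_vpid(), i.e. the PID as
             * seen in the writer's own namespace */
            fprintf(f, "%d\n", getpid());
            return fclose(f) ? 1 : 0;
    }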
diff --git a/kernel/compat.c b/kernel/compat.c
index 42a1ed4b61b1..5f0e201bcfd3 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -40,10 +40,35 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
 		__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
 
+static long compat_nanosleep_restart(struct restart_block *restart)
+{
+	struct compat_timespec __user *rmtp;
+	struct timespec rmt;
+	mm_segment_t oldfs;
+	long ret;
+
+	rmtp = (struct compat_timespec __user *)(restart->arg1);
+	restart->arg1 = (unsigned long)&rmt;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = hrtimer_nanosleep_restart(restart);
+	set_fs(oldfs);
+
+	if (ret) {
+		restart->arg1 = (unsigned long)rmtp;
+
+		if (rmtp && put_compat_timespec(&rmt, rmtp))
+			return -EFAULT;
+	}
+
+	return ret;
+}
+
 asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 				     struct compat_timespec __user *rmtp)
 {
 	struct timespec tu, rmt;
+	mm_segment_t oldfs;
 	long ret;
 
 	if (get_compat_timespec(&tu, rqtp))
@@ -52,11 +77,21 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 	if (!timespec_valid(&tu))
 		return -EINVAL;
 
-	ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL,
-				CLOCK_MONOTONIC);
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = hrtimer_nanosleep(&tu,
+				rmtp ? (struct timespec __user *)&rmt : NULL,
+				HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+	set_fs(oldfs);
+
+	if (ret) {
+		struct restart_block *restart
+			= &current_thread_info()->restart_block;
+
+		restart->fn = compat_nanosleep_restart;
+		restart->arg1 = (unsigned long)rmtp;
 
-	if (ret && rmtp) {
-		if (put_compat_timespec(&rmt, rmtp))
+		if (rmtp && put_compat_timespec(&rmt, rmtp))
 			return -EFAULT;
 	}
 
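The compat wrapper above exists so that a 32-bit task interrupted mid-sleep still gets the remaining time copied back into its 32-bit rmtp, both on the first call and when the syscall is restarted; the get_fs()/set_fs(KERNEL_DS) dance is only there so the native hrtimer helpers can write into a kernel-stack timespec. The user-visible contract being preserved is plain nanosleep(2) behaviour, sketched below for the native case with standard POSIX calls only; nothing here is specific to the patch.

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static void on_alarm(int sig) { (void)sig; /* just interrupt the sleep */ }

    int main(void)
    {
            struct timespec req = { .tv_sec = 5, .tv_nsec = 0 };
            struct timespec rem = { 0, 0 };

            signal(SIGALRM, on_alarm);
            alarm(1);                 /* interrupt the 5 s sleep after ~1 s */

            if (nanosleep(&req, &rem) == -1 && errno == EINTR)
                    printf("interrupted, %ld.%09ld s left\n",
                           (long)rem.tv_sec, rem.tv_nsec);
            return 0;
    }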
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e0d3a4f56ecb..2eff3f63abed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -389,7 +389,7 @@ int disable_nonboot_cpus(void)
 	return error;
 }
 
-void enable_nonboot_cpus(void)
+void __ref enable_nonboot_cpus(void)
 {
 	int cpu, error;
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d817..3e296ed81d4d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,7 +38,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
-#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -56,6 +55,8 @@
 #include <asm/atomic.h>
 #include <linux/mutex.h>
 #include <linux/kfifo.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
 
 /*
  * Tracks how many cpusets are currently defined in system.
@@ -64,7 +65,7 @@
  */
 int number_of_cpusets __read_mostly;
 
-/* Retrieve the cpuset from a cgroup */
+/* Forward declare cgroup structures */
 struct cgroup_subsys cpuset_subsys;
 struct cpuset;
 
@@ -96,6 +97,9 @@ struct cpuset {
 
 	/* partition number for rebuild_sched_domains() */
 	int pn;
+
+	/* used for walking a cpuset heirarchy */
+	struct list_head stack_list;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -111,7 +115,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 	return container_of(task_subsys_state(task, cpuset_subsys_id),
 			    struct cpuset, css);
 }
-
+struct cpuset_hotplug_scanner {
+	struct cgroup_scanner scan;
+	struct cgroup *to;
+};
 
 /* bits in struct cpuset flags field */
 typedef enum {
@@ -160,17 +167,17 @@ static inline int is_spread_slab(const struct cpuset *cs)
  * number, and avoid having to lock and reload mems_allowed unless
  * the cpuset they're using changes generation.
  *
- * A single, global generation is needed because cpuset_attach_task() could
+ * A single, global generation is needed because cpuset_attach_task() could
  * reattach a task to a different cpuset, which must not have its
  * generation numbers aliased with those of that tasks previous cpuset.
  *
  * Generations are needed for mems_allowed because one task cannot
- * modify anothers memory placement. So we must enable every task,
+ * modify another's memory placement. So we must enable every task,
  * on every visit to __alloc_pages(), to efficiently check whether
 * its current->cpuset->mems_allowed has changed, requiring an update
 * of its current->mems_allowed.
 *
- * Since cpuset_mems_generation is guarded by manage_mutex,
+ * Since writes to cpuset_mems_generation are guarded by the cgroup lock
 * there is no need to mark it atomic.
 */
 static int cpuset_mems_generation;
@@ -182,17 +189,20 @@ static struct cpuset top_cpuset = {
 };
 
 /*
- * We have two global cpuset mutexes below. They can nest.
- * It is ok to first take manage_mutex, then nest callback_mutex. We also
- * require taking task_lock() when dereferencing a tasks cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global mutexes guarding cpuset structures. The first
+ * is the main control groups cgroup_mutex, accessed via
+ * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
+ * callback_mutex, below. They can nest. It is ok to first take
+ * cgroup_mutex, then nest callback_mutex. We also require taking
+ * task_lock() when dereferencing a task's cpuset pointer. See "The
+ * task_lock() exception", at the end of this comment.
 *
 * A task must hold both mutexes to modify cpusets. If a task
- * holds manage_mutex, then it blocks others wanting that mutex,
+ * holds cgroup_mutex, then it blocks others wanting that mutex,
 * ensuring that it is the only task able to also acquire callback_mutex
 * and be able to modify cpusets. It can perform various checks on
 * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding manage_mutex. While it is
+ * also allocate memory while just holding cgroup_mutex. While it is
 * performing these checks, various callback routines can briefly
 * acquire callback_mutex to query cpusets. Once it is ready to make
 * the changes, it takes callback_mutex, blocking everyone else.
@@ -208,60 +218,16 @@ static struct cpuset top_cpuset = {
 * The task_struct fields mems_allowed and mems_generation may only
 * be accessed in the context of that task, so require no locks.
 *
- * Any task can increment and decrement the count field without lock.
- * So in general, code holding manage_mutex or callback_mutex can't rely
- * on the count field not changing. However, if the count goes to
- * zero, then only attach_task(), which holds both mutexes, can
- * increment it again. Because a count of zero means that no tasks
- * are currently attached, therefore there is no way a task attached
- * to that cpuset can fork (the other way to increment the count).
- * So code holding manage_mutex or callback_mutex can safely assume that
- * if the count is zero, it will stay zero. Similarly, if a task
- * holds manage_mutex or callback_mutex on a cpuset with zero count, it
- * knows that the cpuset won't be removed, as cpuset_rmdir() needs
- * both of those mutexes.
- *
 * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds manage_mutex across the entire operation,
+ * the cpuset hierarchy holds cgroup_mutex across the entire operation,
 * single threading all such cpuset modifications across the system.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
- * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
- * (usually) take either mutex. These are the two most performance
- * critical pieces of code here. The exception occurs on cpuset_exit(),
- * when a task in a notify_on_release cpuset exits. Then manage_mutex
- * is taken, and if the cpuset count is zero, a usermode call made
- * to /sbin/cpuset_release_agent with the name of the cpuset (path
- * relative to the root of cpuset file system) as the argument.
+ * Accessing a task's cpuset should be done in accordance with the
+ * guidelines for accessing subsystem state in kernel/cgroup.c
239 *
240 * A cpuset can only be deleted if both its 'count' of using tasks
241 * is zero, and its list of 'children' cpusets is empty. Since all
242 * tasks in the system use _some_ cpuset, and since there is always at
243 * least one task in the system (init), therefore, top_cpuset
244 * always has either children cpusets and/or using tasks. So we don't
245 * need a special hack to ensure that top_cpuset cannot be deleted.
246 *
247 * The above "Tale of Two Semaphores" would be complete, but for:
248 *
249 * The task_lock() exception
250 *
251 * The need for this exception arises from the action of attach_task(),
252 * which overwrites one tasks cpuset pointer with another. It does
253 * so using both mutexes, however there are several performance
254 * critical places that need to reference task->cpuset without the
255 * expense of grabbing a system global mutex. Therefore except as
256 * noted below, when dereferencing or, as in attach_task(), modifying
257 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
258 * (task->alloc_lock) already in the task_struct routinely used for
259 * such matters.
260 *
261 * P.S. One more locking exception. RCU is used to guard the
262 * update of a tasks cpuset pointer by attach_task() and the
263 * access of task->cpuset->mems_generation via that pointer in
264 * the routine cpuset_update_task_memory_state().
265 */ 231 */
266 232
267static DEFINE_MUTEX(callback_mutex); 233static DEFINE_MUTEX(callback_mutex);
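
The locking comment above boils down to one nesting rule: the long-held cgroup_mutex is taken first and callback_mutex is only ever nested inside it, while readers take callback_mutex alone for short sections. Below is a minimal userspace model of that rule using pthread mutexes; the names and the stand-in critical sections are illustrative, not the kernel's API.

/* Illustrative model of the documented nesting order: take the "manage"
 * (cgroup) lock first, then the "callback" lock. Not kernel code. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cgroup_lock_model = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t callback_lock_model = PTHREAD_MUTEX_INITIALIZER;

static void modify_cpuset_model(void)
{
	pthread_mutex_lock(&cgroup_lock_model);    /* long-held, single writer */
	/* ... validate and allocate while nothing can change ... */
	pthread_mutex_lock(&callback_lock_model);  /* short critical section */
	/* ... publish the new masks ... */
	pthread_mutex_unlock(&callback_lock_model);
	pthread_mutex_unlock(&cgroup_lock_model);
}

static void read_cpuset_model(void)
{
	pthread_mutex_lock(&callback_lock_model);  /* readers take only this */
	/* ... copy out a cpumask/nodemask ... */
	pthread_mutex_unlock(&callback_lock_model);
}

int main(void)
{
	modify_cpuset_model();
	read_cpuset_model();
	puts("lock ordering model ok");
	return 0;
}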
@@ -354,15 +320,14 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
354 * Do not call this routine if in_interrupt(). 320 * Do not call this routine if in_interrupt().
355 * 321 *
356 * Call without callback_mutex or task_lock() held. May be 322 * Call without callback_mutex or task_lock() held. May be
357 * called with or without manage_mutex held. Thanks in part to 323 * called with or without cgroup_mutex held. Thanks in part to
358 * 'the_top_cpuset_hack', the tasks cpuset pointer will never 324 * 'the_top_cpuset_hack', the task's cpuset pointer will never
359 * be NULL. This routine also might acquire callback_mutex and 325 * be NULL. This routine also might acquire callback_mutex and
360 * current->mm->mmap_sem during call. 326 * current->mm->mmap_sem during call.
361 * 327 *
362 * Reading current->cpuset->mems_generation doesn't need task_lock 328 * Reading current->cpuset->mems_generation doesn't need task_lock
363 * to guard the current->cpuset dereference, because it is guarded 329 * to guard the current->cpuset dereference, because it is guarded
364 * from concurrent freeing of current->cpuset by attach_task(), 330 * from concurrent freeing of current->cpuset using RCU.
365 * using RCU.
366 * 331 *
367 * The rcu_dereference() is technically probably not needed, 332 * The rcu_dereference() is technically probably not needed,
368 * as I don't actually mind if I see a new cpuset pointer but 333 * as I don't actually mind if I see a new cpuset pointer but
@@ -424,7 +389,7 @@ void cpuset_update_task_memory_state(void)
424 * 389 *
425 * One cpuset is a subset of another if all its allowed CPUs and 390 * One cpuset is a subset of another if all its allowed CPUs and
426 * Memory Nodes are a subset of the other, and its exclusive flags 391 * Memory Nodes are a subset of the other, and its exclusive flags
427 * are only set if the other's are set. Call holding manage_mutex. 392 * are only set if the other's are set. Call holding cgroup_mutex.
428 */ 393 */
429 394
430static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 395static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -442,7 +407,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
442 * If we replaced the flag and mask values of the current cpuset 407 * If we replaced the flag and mask values of the current cpuset
443 * (cur) with those values in the trial cpuset (trial), would 408 * (cur) with those values in the trial cpuset (trial), would
444 * our various subset and exclusive rules still be valid? Presumes 409 * our various subset and exclusive rules still be valid? Presumes
445 * manage_mutex held. 410 * cgroup_mutex held.
446 * 411 *
447 * 'cur' is the address of an actual, in-use cpuset. Operations 412 * 'cur' is the address of an actual, in-use cpuset. Operations
448 * such as list traversal that depend on the actual address of the 413 * such as list traversal that depend on the actual address of the
@@ -476,7 +441,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
476 if (!is_cpuset_subset(trial, par)) 441 if (!is_cpuset_subset(trial, par))
477 return -EACCES; 442 return -EACCES;
478 443
479 /* If either I or some sibling (!= me) is exclusive, we can't overlap */ 444 /*
445 * If either I or some sibling (!= me) is exclusive, we can't
446 * overlap
447 */
480 list_for_each_entry(cont, &par->css.cgroup->children, sibling) { 448 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
481 c = cgroup_cs(cont); 449 c = cgroup_cs(cont);
482 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 450 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
@@ -732,22 +700,50 @@ static inline int started_after(void *p1, void *p2)
732 return started_after_time(t1, &t2->start_time, t2); 700 return started_after_time(t1, &t2->start_time, t2);
733} 701}
734 702
735/* 703/**
736 * Call with manage_mutex held. May take callback_mutex during call. 704 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
705 * @tsk: task to test
706 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
707 *
708 * Call with cgroup_mutex held. May take callback_mutex during call.
709 * Called for each task in a cgroup by cgroup_scan_tasks().
710 * Return nonzero if this task's cpus_allowed mask should be changed (in other
711 * words, if its mask is not equal to its cpuset's mask).
712 */
713int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
714{
715 return !cpus_equal(tsk->cpus_allowed,
716 (cgroup_cs(scan->cg))->cpus_allowed);
717}
718
719/**
720 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
721 * @tsk: task to test
722 * @scan: struct cgroup_scanner containing the cgroup of the task
723 *
724 * Called by cgroup_scan_tasks() for each task in a cgroup whose
725 * cpus_allowed mask needs to be changed.
726 *
727 * We don't need to re-check for the cgroup/cpuset membership, since we're
728 * holding cgroup_lock() at this point.
737 */ 729 */
730void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
731{
732 set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed);
733}
738 734
735/**
736 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
737 * @cs: the cpuset to consider
738 * @buf: buffer of cpu numbers written to this cpuset
739 */
739static int update_cpumask(struct cpuset *cs, char *buf) 740static int update_cpumask(struct cpuset *cs, char *buf)
740{ 741{
741 struct cpuset trialcs; 742 struct cpuset trialcs;
742 int retval, i; 743 struct cgroup_scanner scan;
743 int is_load_balanced;
744 struct cgroup_iter it;
745 struct cgroup *cgrp = cs->css.cgroup;
746 struct task_struct *p, *dropped;
747 /* Never dereference latest_task, since it's not refcounted */
748 struct task_struct *latest_task = NULL;
749 struct ptr_heap heap; 744 struct ptr_heap heap;
750 struct timespec latest_time = { 0, 0 }; 745 int retval;
746 int is_load_balanced;
751 747
752 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ 748 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
753 if (cs == &top_cpuset) 749 if (cs == &top_cpuset)
@@ -756,7 +752,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
756 trialcs = *cs; 752 trialcs = *cs;
757 753
758 /* 754 /*
759 * An empty cpus_allowed is ok iff there are no tasks in the cpuset. 755 * An empty cpus_allowed is ok only if the cpuset has no tasks.
760 * Since cpulist_parse() fails on an empty mask, we special case 756 * Since cpulist_parse() fails on an empty mask, we special case
761 * that parsing. The validate_change() call ensures that cpusets 757 * that parsing. The validate_change() call ensures that cpusets
762 * with tasks have cpus. 758 * with tasks have cpus.
@@ -777,6 +773,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
777 /* Nothing to do if the cpus didn't change */ 773 /* Nothing to do if the cpus didn't change */
778 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 774 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
779 return 0; 775 return 0;
776
780 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); 777 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
781 if (retval) 778 if (retval)
782 return retval; 779 return retval;
@@ -787,62 +784,19 @@ static int update_cpumask(struct cpuset *cs, char *buf)
787 cs->cpus_allowed = trialcs.cpus_allowed; 784 cs->cpus_allowed = trialcs.cpus_allowed;
788 mutex_unlock(&callback_mutex); 785 mutex_unlock(&callback_mutex);
789 786
790 again:
791 /* 787 /*
792 * Scan tasks in the cpuset, and update the cpumasks of any 788 * Scan tasks in the cpuset, and update the cpumasks of any
793 * that need an update. Since we can't call set_cpus_allowed() 789 * that need an update.
794 * while holding tasklist_lock, gather tasks to be processed
795 * in a heap structure. If the statically-sized heap fills up,
796 * overflow tasks that started later, and in future iterations
797 * only consider tasks that started after the latest task in
798 * the previous pass. This guarantees forward progress and
799 * that we don't miss any tasks
800 */ 790 */
801 heap.size = 0; 791 scan.cg = cs->css.cgroup;
802 cgroup_iter_start(cgrp, &it); 792 scan.test_task = cpuset_test_cpumask;
803 while ((p = cgroup_iter_next(cgrp, &it))) { 793 scan.process_task = cpuset_change_cpumask;
804 /* Only affect tasks that don't have the right cpus_allowed */ 794 scan.heap = &heap;
805 if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) 795 cgroup_scan_tasks(&scan);
806 continue;
807 /*
808 * Only process tasks that started after the last task
809 * we processed
810 */
811 if (!started_after_time(p, &latest_time, latest_task))
812 continue;
813 dropped = heap_insert(&heap, p);
814 if (dropped == NULL) {
815 get_task_struct(p);
816 } else if (dropped != p) {
817 get_task_struct(p);
818 put_task_struct(dropped);
819 }
820 }
821 cgroup_iter_end(cgrp, &it);
822 if (heap.size) {
823 for (i = 0; i < heap.size; i++) {
824 struct task_struct *p = heap.ptrs[i];
825 if (i == 0) {
826 latest_time = p->start_time;
827 latest_task = p;
828 }
829 set_cpus_allowed(p, cs->cpus_allowed);
830 put_task_struct(p);
831 }
832 /*
833 * If we had to process any tasks at all, scan again
834 * in case some of them were in the middle of forking
835 * children that didn't notice the new cpumask
836 * restriction. Not the most efficient way to do it,
837 * but it avoids having to take callback_mutex in the
838 * fork path
839 */
840 goto again;
841 }
842 heap_free(&heap); 796 heap_free(&heap);
797
843 if (is_load_balanced) 798 if (is_load_balanced)
844 rebuild_sched_domains(); 799 rebuild_sched_domains();
845
846 return 0; 800 return 0;
847} 801}
848 802
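
update_cpumask() now delegates the per-task walk to cgroup_scan_tasks(), wiring a test callback (should this task change?) and a process callback (apply the change) into a scanner. The following is a toy model of that split in plain C; toy_task and toy_scanner are made-up names and nothing here assumes the real cgroup_scanner layout.

/* Toy model of the test_task/process_task split used by cgroup_scan_tasks():
 * the scanner visits every task, the test callback filters, the process
 * callback applies the change. Plain C, not the kernel interface. */
#include <stdio.h>

struct toy_task { int id; int cpus_mask; };

struct toy_scanner {
	int wanted_mask;
	int (*test)(struct toy_task *, struct toy_scanner *);
	void (*process)(struct toy_task *, struct toy_scanner *);
};

static int needs_update(struct toy_task *t, struct toy_scanner *s)
{
	return t->cpus_mask != s->wanted_mask;   /* like cpuset_test_cpumask() */
}

static void apply_update(struct toy_task *t, struct toy_scanner *s)
{
	t->cpus_mask = s->wanted_mask;           /* like cpuset_change_cpumask() */
}

static void toy_scan(struct toy_task *tasks, int n, struct toy_scanner *s)
{
	for (int i = 0; i < n; i++)
		if (!s->test || s->test(&tasks[i], s))
			s->process(&tasks[i], s);
}

int main(void)
{
	struct toy_task tasks[] = { {1, 0x3}, {2, 0xf}, {3, 0x3} };
	struct toy_scanner s = { 0xf, needs_update, apply_update };

	toy_scan(tasks, 3, &s);
	for (int i = 0; i < 3; i++)
		printf("task %d mask %#x\n", tasks[i].id, tasks[i].cpus_mask);
	return 0;
}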
@@ -854,11 +808,11 @@ static int update_cpumask(struct cpuset *cs, char *buf)
854 * Temporarily set task's mems_allowed to target nodes of migration, 808 * Temporarily set task's mems_allowed to target nodes of migration,
855 * so that the migration code can allocate pages on these nodes. 809 * so that the migration code can allocate pages on these nodes.
856 * 810 *
857 * Call holding manage_mutex, so our current->cpuset won't change 811 * Call holding cgroup_mutex, so current's cpuset won't change
858 * during this call, as manage_mutex holds off any attach_task() 812 * during this call, as cgroup_mutex holds off any cpuset_attach()
859 * calls. Therefore we don't need to take task_lock around the 813 * calls. Therefore we don't need to take task_lock around the
860 * call to guarantee_online_mems(), as we know no one is changing 814 * call to guarantee_online_mems(), as we know no one is changing
861 * our tasks cpuset. 815 * our task's cpuset.
862 * 816 *
863 * Hold callback_mutex around the two modifications of our task's 817 * Hold callback_mutex around the two modifications of our task's
864 * mems_allowed to synchronize with cpuset_mems_allowed(). 818 * mems_allowed to synchronize with cpuset_mems_allowed().
@@ -903,7 +857,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
903 * the cpuset is marked 'memory_migrate', migrate the tasks 857 * the cpuset is marked 'memory_migrate', migrate the tasks
904 * pages to the new memory. 858 * pages to the new memory.
905 * 859 *
906 * Call with manage_mutex held. May take callback_mutex during call. 860 * Call with cgroup_mutex held. May take callback_mutex during call.
907 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 861 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
908 * lock each such task's mm->mmap_sem, scan its vmas and rebind 862 * lock each such task's mm->mmap_sem, scan its vmas and rebind
909 * their mempolicies to the cpuset's new mems_allowed. 863 * their mempolicies to the cpuset's new mems_allowed.
@@ -1016,7 +970,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1016 * tasklist_lock. Forks can happen again now - the mpol_copy() 970 * tasklist_lock. Forks can happen again now - the mpol_copy()
1017 * cpuset_being_rebound check will catch such forks, and rebind 971 * cpuset_being_rebound check will catch such forks, and rebind
1018 * their vma mempolicies too. Because we still hold the global 972 * their vma mempolicies too. Because we still hold the global
1019 * cpuset manage_mutex, we know that no other rebind effort will 973 * cgroup_mutex, we know that no other rebind effort will
1020 * be contending for the global variable cpuset_being_rebound. 974 * be contending for the global variable cpuset_being_rebound.
1021 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 975 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1022 * is idempotent. Also migrate pages in each mm to new nodes. 976 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1031,7 +985,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1031 mmput(mm); 985 mmput(mm);
1032 } 986 }
1033 987
1034 /* We're done rebinding vma's to this cpusets new mems_allowed. */ 988 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1035 kfree(mmarray); 989 kfree(mmarray);
1036 cpuset_being_rebound = NULL; 990 cpuset_being_rebound = NULL;
1037 retval = 0; 991 retval = 0;
@@ -1045,7 +999,7 @@ int current_cpuset_is_being_rebound(void)
1045} 999}
1046 1000
1047/* 1001/*
1048 * Call with manage_mutex held. 1002 * Call with cgroup_mutex held.
1049 */ 1003 */
1050 1004
1051static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) 1005static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -1066,7 +1020,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1066 * cs: the cpuset to update 1020 * cs: the cpuset to update
1067 * buf: the buffer where we read the 0 or 1 1021 * buf: the buffer where we read the 0 or 1
1068 * 1022 *
1069 * Call with manage_mutex held. 1023 * Call with cgroup_mutex held.
1070 */ 1024 */
1071 1025
1072static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 1026static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -1200,6 +1154,7 @@ static int fmeter_getrate(struct fmeter *fmp)
1200 return val; 1154 return val;
1201} 1155}
1202 1156
1157/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1203static int cpuset_can_attach(struct cgroup_subsys *ss, 1158static int cpuset_can_attach(struct cgroup_subsys *ss,
1204 struct cgroup *cont, struct task_struct *tsk) 1159 struct cgroup *cont, struct task_struct *tsk)
1205{ 1160{
@@ -1547,7 +1502,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1547 * If this becomes a problem for some users who wish to 1502 * If this becomes a problem for some users who wish to
1548 * allow that scenario, then cpuset_post_clone() could be 1503 * allow that scenario, then cpuset_post_clone() could be
1549 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 1504 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1550 * (and likewise for mems) to the new cgroup. 1505 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1506 * held.
1551 */ 1507 */
1552static void cpuset_post_clone(struct cgroup_subsys *ss, 1508static void cpuset_post_clone(struct cgroup_subsys *ss,
1553 struct cgroup *cgroup) 1509 struct cgroup *cgroup)
@@ -1571,11 +1527,8 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1571 1527
1572/* 1528/*
1573 * cpuset_create - create a cpuset 1529 * cpuset_create - create a cpuset
1574 * parent: cpuset that will be parent of the new cpuset. 1530 * ss: cpuset cgroup subsystem
1575 * name: name of the new cpuset. Will be strcpy'ed. 1531 * cont: control group that the new cpuset will be part of
1576 * mode: mode to set on new inode
1577 *
1578 * Must be called with the mutex on the parent inode held
1579 */ 1532 */
1580 1533
1581static struct cgroup_subsys_state *cpuset_create( 1534static struct cgroup_subsys_state *cpuset_create(
@@ -1687,53 +1640,140 @@ int __init cpuset_init(void)
1687 return 0; 1640 return 0;
1688} 1641}
1689 1642
1643/**
1644 * cpuset_do_move_task - move a given task to another cpuset
1645 * @tsk: pointer to the task_struct of the task to move
1646 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
1647 *
1648 * Called by cgroup_scan_tasks() for each task in a cgroup.
1649 * Return nonzero to stop the walk through the tasks.
1650 */
1651void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
1652{
1653 struct cpuset_hotplug_scanner *chsp;
1654
1655 chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
1656 cgroup_attach_task(chsp->to, tsk);
1657}
1658
1659/**
1660 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
1661 * @from: cpuset in which the tasks currently reside
1662 * @to: cpuset to which the tasks will be moved
1663 *
1664 * Called with cgroup_mutex held
1665 * callback_mutex must not be held, as cpuset_attach() will take it.
1666 *
1667 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1668 * calling callback functions for each.
1669 */
1670static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1671{
1672 struct cpuset_hotplug_scanner scan;
1673
1674 scan.scan.cg = from->css.cgroup;
1675 scan.scan.test_task = NULL; /* select all tasks in cgroup */
1676 scan.scan.process_task = cpuset_do_move_task;
1677 scan.scan.heap = NULL;
1678 scan.to = to->css.cgroup;
1679
1680 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
1681 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1682 "cgroup_scan_tasks failed\n");
1683}
1684
1690/* 1685/*
1691 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1686 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
1692 * or memory nodes, we need to walk over the cpuset hierarchy, 1687 * or memory nodes, we need to walk over the cpuset hierarchy,
1693 * removing that CPU or node from all cpusets. If this removes the 1688 * removing that CPU or node from all cpusets. If this removes the
1694 * last CPU or node from a cpuset, then the guarantee_online_cpus() 1689 * last CPU or node from a cpuset, then move the tasks in the empty
1695 * or guarantee_online_mems() code will use that emptied cpusets 1690 * cpuset to its next-highest non-empty parent.
1696 * parent online CPUs or nodes. Cpusets that were already empty of 1691 *
1697 * CPUs or nodes are left empty. 1692 * Called with cgroup_mutex held
1693 * callback_mutex must not be held, as cpuset_attach() will take it.
1694 */
1695static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1696{
1697 struct cpuset *parent;
1698
1699 /*
1700 * The cgroup's css_sets list is in use if there are tasks
1701 * in the cpuset; the list is empty if there are none;
1702 * the cs->css.refcnt always seems to be 0.
1703 */
1704 if (list_empty(&cs->css.cgroup->css_sets))
1705 return;
1706
1707 /*
1708 * Find its next-highest non-empty parent, (top cpuset
1709 * has online cpus, so can't be empty).
1710 */
1711 parent = cs->parent;
1712 while (cpus_empty(parent->cpus_allowed) ||
1713 nodes_empty(parent->mems_allowed))
1714 parent = parent->parent;
1715
1716 move_member_tasks_to_cpuset(cs, parent);
1717}
1718
1719/*
1720 * Walk the specified cpuset subtree and look for empty cpusets.
1721 * The tasks of such a cpuset must be moved to a parent cpuset.
1698 * 1722 *
1699 * This routine is intentionally inefficient in a couple of regards. 1723 * Called with cgroup_mutex held. We take callback_mutex to modify
1700 * It will check all cpusets in a subtree even if the top cpuset of 1724 * cpus_allowed and mems_allowed.
1701 * the subtree has no offline CPUs or nodes. It checks both CPUs and
1702 * nodes, even though the caller could have been coded to know that
1703 * only one of CPUs or nodes needed to be checked on a given call.
1704 * This was done to minimize text size rather than cpu cycles.
1705 * 1725 *
1706 * Call with both manage_mutex and callback_mutex held. 1726 * This walk processes the tree from top to bottom, completing one layer
1727 * before dropping down to the next. It always processes a node before
1728 * any of its children.
1707 * 1729 *
1708 * Recursive, on depth of cpuset subtree. 1730 * For now, since we lack memory hot unplug, we'll never see a cpuset
1731 * that has tasks along with an empty 'mems'. But if we did see such
1732 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1709 */ 1733 */
1710 1734static void scan_for_empty_cpusets(const struct cpuset *root)
1711static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
1712{ 1735{
1736 struct cpuset *cp; /* scans cpusets being updated */
1737 struct cpuset *child; /* scans child cpusets of cp */
1738 struct list_head queue;
1713 struct cgroup *cont; 1739 struct cgroup *cont;
1714 struct cpuset *c;
1715 1740
1716 /* Each of our child cpusets mems must be online */ 1741 INIT_LIST_HEAD(&queue);
1717 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 1742
1718 c = cgroup_cs(cont); 1743 list_add_tail((struct list_head *)&root->stack_list, &queue);
1719 guarantee_online_cpus_mems_in_subtree(c); 1744
1720 if (!cpus_empty(c->cpus_allowed)) 1745 while (!list_empty(&queue)) {
1721 guarantee_online_cpus(c, &c->cpus_allowed); 1746 cp = container_of(queue.next, struct cpuset, stack_list);
1722 if (!nodes_empty(c->mems_allowed)) 1747 list_del(queue.next);
1723 guarantee_online_mems(c, &c->mems_allowed); 1748 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1749 child = cgroup_cs(cont);
1750 list_add_tail(&child->stack_list, &queue);
1751 }
1752 cont = cp->css.cgroup;
1753
1754 /* Continue past cpusets with all cpus, mems online */
1755 if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
1756 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1757 continue;
1758
1759 /* Remove offline cpus and mems from this cpuset. */
1760 mutex_lock(&callback_mutex);
1761 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
1762 nodes_and(cp->mems_allowed, cp->mems_allowed,
1763 node_states[N_HIGH_MEMORY]);
1764 mutex_unlock(&callback_mutex);
1765
1766 /* Move tasks from the empty cpuset to a parent */
1767 if (cpus_empty(cp->cpus_allowed) ||
1768 nodes_empty(cp->mems_allowed))
1769 remove_tasks_in_empty_cpuset(cp);
1724 } 1770 }
1725} 1771}
1726 1772
1727/* 1773/*
1728 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track 1774 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1729 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to 1775 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1730 * track what's online after any CPU or memory node hotplug or unplug 1776 * track what's online after any CPU or memory node hotplug or unplug event.
1731 * event.
1732 *
1733 * To ensure that we don't remove a CPU or node from the top cpuset
1734 * that is currently in use by a child cpuset (which would violate
1735 * the rule that cpusets must be subsets of their parent), we first
1736 * call the recursive routine guarantee_online_cpus_mems_in_subtree().
1737 * 1777 *
1738 * Since there are two callers of this routine, one for CPU hotplug 1778 * Since there are two callers of this routine, one for CPU hotplug
1739 * events and one for memory node hotplug events, we could have coded 1779 * events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1784,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
1744static void common_cpu_mem_hotplug_unplug(void) 1784static void common_cpu_mem_hotplug_unplug(void)
1745{ 1785{
1746 cgroup_lock(); 1786 cgroup_lock();
1747 mutex_lock(&callback_mutex);
1748 1787
1749 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
1750 top_cpuset.cpus_allowed = cpu_online_map; 1788 top_cpuset.cpus_allowed = cpu_online_map;
1751 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 1789 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1790 scan_for_empty_cpusets(&top_cpuset);
1752 1791
1753 mutex_unlock(&callback_mutex);
1754 cgroup_unlock(); 1792 cgroup_unlock();
1755} 1793}
1756 1794
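
scan_for_empty_cpusets() replaces the old recursion with an iterative breadth-first walk: each cpuset goes onto a queue and its children are enqueued after it, so a node is always processed before any of its descendants. Here is a self-contained sketch of the same traversal over a toy tree; the fixed-size queue and the node type are invented for illustration, not the kernel's list_head machinery.

/* Sketch of the breadth-first walk scan_for_empty_cpusets() performs:
 * a FIFO queue guarantees each node is handled before any of its
 * children. Plain userspace C, with a fixed-size queue for brevity. */
#include <stdio.h>

struct node {
	const char *name;
	struct node *child[2];
	int empty;                      /* stands in for "no cpus/mems left" */
};

static void walk_top_down(struct node *root)
{
	struct node *queue[16];
	int head = 0, tail = 0;

	queue[tail++] = root;
	while (head < tail) {
		struct node *n = queue[head++];

		for (int i = 0; i < 2; i++)
			if (n->child[i])
				queue[tail++] = n->child[i];

		if (n->empty)
			printf("would move tasks out of %s\n", n->name);
	}
}

int main(void)
{
	struct node leaf = { "leaf", { 0, 0 }, 1 };
	struct node mid  = { "mid",  { &leaf, 0 }, 0 };
	struct node top  = { "top",  { &mid, 0 }, 0 };

	walk_top_down(&top);
	return 0;
}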
@@ -1826,7 +1864,7 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
1826 1864
1827/** 1865/**
1828 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 1866 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1829 * Must be called with callback_mutex held. 1867 * Must be called with callback_mutex held.
1830 **/ 1868 **/
1831cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) 1869cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
1832{ 1870{
@@ -2163,10 +2201,8 @@ void __cpuset_memory_pressure_bump(void)
2163 * - Used for /proc/<pid>/cpuset. 2201 * - Used for /proc/<pid>/cpuset.
2164 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2202 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2165 * doesn't really matter if tsk->cpuset changes after we read it, 2203 * doesn't really matter if tsk->cpuset changes after we read it,
2166 * and we take manage_mutex, keeping attach_task() from changing it 2204 * and we take cgroup_mutex, keeping cpuset_attach() from changing it
2167 * anyway. No need to check that tsk->cpuset != NULL, thanks to 2205 * anyway.
2168 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2169 * cpuset to top_cpuset.
2170 */ 2206 */
2171static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2207static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2172{ 2208{
@@ -2219,13 +2255,14 @@ const struct file_operations proc_cpuset_operations = {
2219#endif /* CONFIG_PROC_PID_CPUSET */ 2255#endif /* CONFIG_PROC_PID_CPUSET */
2220 2256
2221/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2257/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
2222char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) 2258void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2223{ 2259{
2224 buffer += sprintf(buffer, "Cpus_allowed:\t"); 2260 seq_printf(m, "Cpus_allowed:\t");
2225 buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed); 2261 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count,
2226 buffer += sprintf(buffer, "\n"); 2262 task->cpus_allowed);
2227 buffer += sprintf(buffer, "Mems_allowed:\t"); 2263 seq_printf(m, "\n");
2228 buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed); 2264 seq_printf(m, "Mems_allowed:\t");
2229 buffer += sprintf(buffer, "\n"); 2265 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count,
2230 return buffer; 2266 task->mems_allowed);
2267 seq_printf(m, "\n");
2231} 2268}
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e459fefda77..506a957b665a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -293,26 +293,27 @@ static void reparent_to_kthreadd(void)
293 switch_uid(INIT_USER); 293 switch_uid(INIT_USER);
294} 294}
295 295
296void __set_special_pids(pid_t session, pid_t pgrp) 296void __set_special_pids(struct pid *pid)
297{ 297{
298 struct task_struct *curr = current->group_leader; 298 struct task_struct *curr = current->group_leader;
299 pid_t nr = pid_nr(pid);
299 300
300 if (task_session_nr(curr) != session) { 301 if (task_session(curr) != pid) {
301 detach_pid(curr, PIDTYPE_SID); 302 detach_pid(curr, PIDTYPE_SID);
302 set_task_session(curr, session); 303 attach_pid(curr, PIDTYPE_SID, pid);
303 attach_pid(curr, PIDTYPE_SID, find_pid(session)); 304 set_task_session(curr, nr);
304 } 305 }
305 if (task_pgrp_nr(curr) != pgrp) { 306 if (task_pgrp(curr) != pid) {
306 detach_pid(curr, PIDTYPE_PGID); 307 detach_pid(curr, PIDTYPE_PGID);
307 set_task_pgrp(curr, pgrp); 308 attach_pid(curr, PIDTYPE_PGID, pid);
308 attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); 309 set_task_pgrp(curr, nr);
309 } 310 }
310} 311}
311 312
312static void set_special_pids(pid_t session, pid_t pgrp) 313static void set_special_pids(struct pid *pid)
313{ 314{
314 write_lock_irq(&tasklist_lock); 315 write_lock_irq(&tasklist_lock);
315 __set_special_pids(session, pgrp); 316 __set_special_pids(pid);
316 write_unlock_irq(&tasklist_lock); 317 write_unlock_irq(&tasklist_lock);
317} 318}
318 319
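
The rewritten __set_special_pids() compares struct pid pointers (task_session(curr) != pid) instead of numeric ids, because with pid namespaces the same number can name different processes depending on where you look. A small illustrative model of that distinction follows; toy_pid is invented, and only the compare-by-object idea carries over.

#include <stdio.h>

struct toy_pid { int nr_in_ns[2]; };   /* the id as seen from each namespace */

/* Comparing locally visible numbers can falsely match across namespaces. */
static int same_by_local_number(struct toy_pid *a, int a_ns,
				struct toy_pid *b, int b_ns)
{
	return a->nr_in_ns[a_ns] == b->nr_in_ns[b_ns];
}

/* Comparing the objects themselves is what the new code does. */
static int same_by_object(struct toy_pid *a, struct toy_pid *b)
{
	return a == b;
}

int main(void)
{
	struct toy_pid task_a = { { 100,   0 } };  /* pid 100 in the init ns */
	struct toy_pid task_b = { { 354, 100 } };  /* pid 100 inside a child ns */

	printf("by number: %d\n", same_by_local_number(&task_a, 0, &task_b, 1));
	printf("by object: %d\n", same_by_object(&task_a, &task_b));
	return 0;
}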
@@ -383,7 +384,11 @@ void daemonize(const char *name, ...)
383 */ 384 */
384 current->flags |= PF_NOFREEZE; 385 current->flags |= PF_NOFREEZE;
385 386
386 set_special_pids(1, 1); 387 if (current->nsproxy != &init_nsproxy) {
388 get_nsproxy(&init_nsproxy);
389 switch_task_namespaces(current, &init_nsproxy);
390 }
391 set_special_pids(&init_struct_pid);
387 proc_clear_tty(current); 392 proc_clear_tty(current);
388 393
389 /* Block and flush all signals */ 394 /* Block and flush all signals */
@@ -398,11 +403,6 @@ void daemonize(const char *name, ...)
398 current->fs = fs; 403 current->fs = fs;
399 atomic_inc(&fs->count); 404 atomic_inc(&fs->count);
400 405
401 if (current->nsproxy != init_task.nsproxy) {
402 get_nsproxy(init_task.nsproxy);
403 switch_task_namespaces(current, init_task.nsproxy);
404 }
405
406 exit_files(current); 406 exit_files(current);
407 current->files = init_task.files; 407 current->files = init_task.files;
408 atomic_inc(&current->files->count); 408 atomic_inc(&current->files->count);
@@ -458,7 +458,7 @@ struct files_struct *get_files_struct(struct task_struct *task)
458 return files; 458 return files;
459} 459}
460 460
461void fastcall put_files_struct(struct files_struct *files) 461void put_files_struct(struct files_struct *files)
462{ 462{
463 struct fdtable *fdt; 463 struct fdtable *fdt;
464 464
@@ -512,14 +512,10 @@ static void __put_fs_struct(struct fs_struct *fs)
512{ 512{
513 /* No need to hold fs->lock if we are killing it */ 513 /* No need to hold fs->lock if we are killing it */
514 if (atomic_dec_and_test(&fs->count)) { 514 if (atomic_dec_and_test(&fs->count)) {
515 dput(fs->root); 515 path_put(&fs->root);
516 mntput(fs->rootmnt); 516 path_put(&fs->pwd);
517 dput(fs->pwd); 517 if (fs->altroot.dentry)
518 mntput(fs->pwdmnt); 518 path_put(&fs->altroot);
519 if (fs->altroot) {
520 dput(fs->altroot);
521 mntput(fs->altrootmnt);
522 }
523 kmem_cache_free(fs_cachep, fs); 519 kmem_cache_free(fs_cachep, fs);
524 } 520 }
525} 521}
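
The __put_fs_struct() hunk reflects the switch of fs->root, fs->pwd and fs->altroot from separate dentry/vfsmount pairs to a single struct path, so each dput()/mntput() pair collapses into one path_put(). Below is a toy refcount model of that consolidation; toy_ref and toy_path are invented, and only the "one helper drops both references" shape is the point, not the VFS API.

#include <stdio.h>

struct toy_ref { int count; };
struct toy_path { struct toy_ref *dentry, *mnt; };

static void toy_ref_put(struct toy_ref *r)
{
	if (r && --r->count == 0)
		printf("freed object\n");
}

static void toy_path_put(struct toy_path *p)   /* analogue of path_put() */
{
	toy_ref_put(p->dentry);
	toy_ref_put(p->mnt);
}

int main(void)
{
	struct toy_ref d = { 1 }, m = { 1 };
	struct toy_path root = { &d, &m };

	toy_path_put(&root);                    /* one call instead of two */
	return 0;
}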
@@ -745,24 +741,6 @@ static void exit_notify(struct task_struct *tsk)
745 struct task_struct *t; 741 struct task_struct *t;
746 struct pid *pgrp; 742 struct pid *pgrp;
747 743
748 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
749 && !thread_group_empty(tsk)) {
750 /*
751 * This occurs when there was a race between our exit
752 * syscall and a group signal choosing us as the one to
753 * wake up. It could be that we are the only thread
754 * alerted to check for pending signals, but another thread
755 * should be woken now to take the signal since we will not.
756 * Now we'll wake all the threads in the group just to make
757 * sure someone gets all the pending signals.
758 */
759 spin_lock_irq(&tsk->sighand->siglock);
760 for (t = next_thread(tsk); t != tsk; t = next_thread(t))
761 if (!signal_pending(t) && !(t->flags & PF_EXITING))
762 recalc_sigpending_and_wake(t);
763 spin_unlock_irq(&tsk->sighand->siglock);
764 }
765
766 /* 744 /*
767 * This does two things: 745 * This does two things:
768 * 746 *
@@ -905,7 +883,7 @@ static inline void exit_child_reaper(struct task_struct *tsk)
905 zap_pid_ns_processes(tsk->nsproxy->pid_ns); 883 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
906} 884}
907 885
908fastcall NORET_TYPE void do_exit(long code) 886NORET_TYPE void do_exit(long code)
909{ 887{
910 struct task_struct *tsk = current; 888 struct task_struct *tsk = current;
911 int group_dead; 889 int group_dead;
@@ -947,7 +925,7 @@ fastcall NORET_TYPE void do_exit(long code)
947 schedule(); 925 schedule();
948 } 926 }
949 927
950 tsk->flags |= PF_EXITING; 928 exit_signals(tsk); /* sets PF_EXITING */
951 /* 929 /*
952 * tsk->flags are checked in the futex code to protect against 930 * tsk->flags are checked in the futex code to protect against
953 * an exiting task cleaning up the robust pi futexes. 931 * an exiting task cleaning up the robust pi futexes.
@@ -1083,11 +1061,12 @@ do_group_exit(int exit_code)
1083 struct signal_struct *const sig = current->signal; 1061 struct signal_struct *const sig = current->signal;
1084 struct sighand_struct *const sighand = current->sighand; 1062 struct sighand_struct *const sighand = current->sighand;
1085 spin_lock_irq(&sighand->siglock); 1063 spin_lock_irq(&sighand->siglock);
1086 if (sig->flags & SIGNAL_GROUP_EXIT) 1064 if (signal_group_exit(sig))
1087 /* Another thread got here before we took the lock. */ 1065 /* Another thread got here before we took the lock. */
1088 exit_code = sig->group_exit_code; 1066 exit_code = sig->group_exit_code;
1089 else { 1067 else {
1090 sig->group_exit_code = exit_code; 1068 sig->group_exit_code = exit_code;
1069 sig->flags = SIGNAL_GROUP_EXIT;
1091 zap_other_threads(current); 1070 zap_other_threads(current);
1092 } 1071 }
1093 spin_unlock_irq(&sighand->siglock); 1072 spin_unlock_irq(&sighand->siglock);
@@ -1107,20 +1086,23 @@ asmlinkage void sys_exit_group(int error_code)
1107 do_group_exit((error_code & 0xff) << 8); 1086 do_group_exit((error_code & 0xff) << 8);
1108} 1087}
1109 1088
1110static int eligible_child(pid_t pid, int options, struct task_struct *p) 1089static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1090{
1091 struct pid *pid = NULL;
1092 if (type == PIDTYPE_PID)
1093 pid = task->pids[type].pid;
1094 else if (type < PIDTYPE_MAX)
1095 pid = task->group_leader->pids[type].pid;
1096 return pid;
1097}
1098
1099static int eligible_child(enum pid_type type, struct pid *pid, int options,
1100 struct task_struct *p)
1111{ 1101{
1112 int err; 1102 int err;
1113 struct pid_namespace *ns;
1114 1103
1115 ns = current->nsproxy->pid_ns; 1104 if (type < PIDTYPE_MAX) {
1116 if (pid > 0) { 1105 if (task_pid_type(p, type) != pid)
1117 if (task_pid_nr_ns(p, ns) != pid)
1118 return 0;
1119 } else if (!pid) {
1120 if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current))
1121 return 0;
1122 } else if (pid != -1) {
1123 if (task_pgrp_nr_ns(p, ns) != -pid)
1124 return 0; 1106 return 0;
1125 } 1107 }
1126 1108
@@ -1139,18 +1121,16 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p)
1139 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) 1121 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
1140 && !(options & __WALL)) 1122 && !(options & __WALL))
1141 return 0; 1123 return 0;
1142 /*
1143 * Do not consider thread group leaders that are
1144 * in a non-empty thread group:
1145 */
1146 if (delay_group_leader(p))
1147 return 2;
1148 1124
1149 err = security_task_wait(p); 1125 err = security_task_wait(p);
1150 if (err) 1126 if (likely(!err))
1151 return err; 1127 return 1;
1152 1128
1153 return 1; 1129 if (type != PIDTYPE_PID)
1130 return 0;
1131 /* This child was explicitly requested, abort */
1132 read_unlock(&tasklist_lock);
1133 return err;
1154} 1134}
1155 1135
1156static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1136static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
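
eligible_child() now filters children by a (type, struct pid) pair: PIDTYPE_MAX means "any child", otherwise the child's pid of the requested type must be the one being waited for. A reduced standalone model of that filter is shown below; the TOY_* constants and toy_child are not kernel names.

#include <stdio.h>

enum toy_type { TOY_PID, TOY_PGID, TOY_SID, TOY_ANY };

struct toy_child { int pid, pgid, sid; };

static int toy_eligible(enum toy_type type, int want, struct toy_child *c)
{
	if (type == TOY_ANY)
		return 1;                       /* like type == PIDTYPE_MAX */
	switch (type) {
	case TOY_PID:  return c->pid == want;
	case TOY_PGID: return c->pgid == want;
	case TOY_SID:  return c->sid == want;
	default:       return 0;
	}
}

int main(void)
{
	struct toy_child c = { 42, 7, 1 };

	printf("%d %d %d\n",
	       toy_eligible(TOY_ANY, 0, &c),
	       toy_eligible(TOY_PID, 42, &c),
	       toy_eligible(TOY_PGID, 9, &c));
	return 0;
}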
@@ -1190,20 +1170,13 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1190{ 1170{
1191 unsigned long state; 1171 unsigned long state;
1192 int retval, status, traced; 1172 int retval, status, traced;
1193 struct pid_namespace *ns; 1173 pid_t pid = task_pid_vnr(p);
1194
1195 ns = current->nsproxy->pid_ns;
1196 1174
1197 if (unlikely(noreap)) { 1175 if (unlikely(noreap)) {
1198 pid_t pid = task_pid_nr_ns(p, ns);
1199 uid_t uid = p->uid; 1176 uid_t uid = p->uid;
1200 int exit_code = p->exit_code; 1177 int exit_code = p->exit_code;
1201 int why, status; 1178 int why, status;
1202 1179
1203 if (unlikely(p->exit_state != EXIT_ZOMBIE))
1204 return 0;
1205 if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
1206 return 0;
1207 get_task_struct(p); 1180 get_task_struct(p);
1208 read_unlock(&tasklist_lock); 1181 read_unlock(&tasklist_lock);
1209 if ((exit_code & 0x7f) == 0) { 1182 if ((exit_code & 0x7f) == 0) {
@@ -1314,11 +1287,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1314 retval = put_user(status, &infop->si_status); 1287 retval = put_user(status, &infop->si_status);
1315 } 1288 }
1316 if (!retval && infop) 1289 if (!retval && infop)
1317 retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid); 1290 retval = put_user(pid, &infop->si_pid);
1318 if (!retval && infop) 1291 if (!retval && infop)
1319 retval = put_user(p->uid, &infop->si_uid); 1292 retval = put_user(p->uid, &infop->si_uid);
1320 if (!retval) 1293 if (!retval)
1321 retval = task_pid_nr_ns(p, ns); 1294 retval = pid;
1322 1295
1323 if (traced) { 1296 if (traced) {
1324 write_lock_irq(&tasklist_lock); 1297 write_lock_irq(&tasklist_lock);
@@ -1350,21 +1323,38 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1350 * the lock and this task is uninteresting. If we return nonzero, we have 1323 * the lock and this task is uninteresting. If we return nonzero, we have
1351 * released the lock and the system call should return. 1324 * released the lock and the system call should return.
1352 */ 1325 */
1353static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, 1326static int wait_task_stopped(struct task_struct *p,
1354 int noreap, struct siginfo __user *infop, 1327 int noreap, struct siginfo __user *infop,
1355 int __user *stat_addr, struct rusage __user *ru) 1328 int __user *stat_addr, struct rusage __user *ru)
1356{ 1329{
1357 int retval, exit_code; 1330 int retval, exit_code, why;
1331 uid_t uid = 0; /* unneeded, required by compiler */
1358 pid_t pid; 1332 pid_t pid;
1359 1333
1360 if (!p->exit_code) 1334 exit_code = 0;
1361 return 0; 1335 spin_lock_irq(&p->sighand->siglock);
1362 if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && 1336
1363 p->signal->group_stop_count > 0) 1337 if (unlikely(!task_is_stopped_or_traced(p)))
1338 goto unlock_sig;
1339
1340 if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0)
1364 /* 1341 /*
1365 * A group stop is in progress and this is the group leader. 1342 * A group stop is in progress and this is the group leader.
1366 * We won't report until all threads have stopped. 1343 * We won't report until all threads have stopped.
1367 */ 1344 */
1345 goto unlock_sig;
1346
1347 exit_code = p->exit_code;
1348 if (!exit_code)
1349 goto unlock_sig;
1350
1351 if (!noreap)
1352 p->exit_code = 0;
1353
1354 uid = p->uid;
1355unlock_sig:
1356 spin_unlock_irq(&p->sighand->siglock);
1357 if (!exit_code)
1368 return 0; 1358 return 0;
1369 1359
1370 /* 1360 /*
@@ -1374,65 +1364,15 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1374 * keep holding onto the tasklist_lock while we call getrusage and 1364 * keep holding onto the tasklist_lock while we call getrusage and
1375 * possibly take page faults for user memory. 1365 * possibly take page faults for user memory.
1376 */ 1366 */
1377 pid = task_pid_nr_ns(p, current->nsproxy->pid_ns);
1378 get_task_struct(p); 1367 get_task_struct(p);
1368 pid = task_pid_vnr(p);
1369 why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
1379 read_unlock(&tasklist_lock); 1370 read_unlock(&tasklist_lock);
1380 1371
1381 if (unlikely(noreap)) { 1372 if (unlikely(noreap))
1382 uid_t uid = p->uid;
1383 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
1384
1385 exit_code = p->exit_code;
1386 if (unlikely(!exit_code) || unlikely(p->exit_state))
1387 goto bail_ref;
1388 return wait_noreap_copyout(p, pid, uid, 1373 return wait_noreap_copyout(p, pid, uid,
1389 why, exit_code, 1374 why, exit_code,
1390 infop, ru); 1375 infop, ru);
1391 }
1392
1393 write_lock_irq(&tasklist_lock);
1394
1395 /*
1396 * This uses xchg to be atomic with the thread resuming and setting
1397 * it. It must also be done with the write lock held to prevent a
1398 * race with the EXIT_ZOMBIE case.
1399 */
1400 exit_code = xchg(&p->exit_code, 0);
1401 if (unlikely(p->exit_state)) {
1402 /*
1403 * The task resumed and then died. Let the next iteration
1404 * catch it in EXIT_ZOMBIE. Note that exit_code might
1405 * already be zero here if it resumed and did _exit(0).
1406 * The task itself is dead and won't touch exit_code again;
1407 * other processors in this function are locked out.
1408 */
1409 p->exit_code = exit_code;
1410 exit_code = 0;
1411 }
1412 if (unlikely(exit_code == 0)) {
1413 /*
1414 * Another thread in this function got to it first, or it
1415 * resumed, or it resumed and then died.
1416 */
1417 write_unlock_irq(&tasklist_lock);
1418bail_ref:
1419 put_task_struct(p);
1420 /*
1421 * We are returning to the wait loop without having successfully
1422 * removed the process and having released the lock. We cannot
1423 * continue, since the "p" task pointer is potentially stale.
1424 *
1425 * Return -EAGAIN, and do_wait() will restart the loop from the
1426 * beginning. Do _not_ re-acquire the lock.
1427 */
1428 return -EAGAIN;
1429 }
1430
1431 /* move to end of parent's list to avoid starvation */
1432 remove_parent(p);
1433 add_parent(p);
1434
1435 write_unlock_irq(&tasklist_lock);
1436 1376
1437 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1377 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1438 if (!retval && stat_addr) 1378 if (!retval && stat_addr)
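
The new wait_task_stopped() takes the child's siglock once, checks that the task really is stopped or traced, and reads (and, unless WNOWAIT, clears) exit_code inside that single critical section, which is what lets the old xchg()/tasklist_lock dance above be deleted. A pthread-based sketch of that read-and-consume pattern; the structure and field names are stand-ins, not task_struct.

#include <pthread.h>
#include <stdio.h>

struct toy_child {
	pthread_mutex_t siglock;
	int stopped;
	int exit_code;
};

static int toy_collect_stop_code(struct toy_child *c, int peek_only)
{
	int code = 0;

	pthread_mutex_lock(&c->siglock);
	if (c->stopped && c->exit_code) {
		code = c->exit_code;
		if (!peek_only)
			c->exit_code = 0;      /* consume the report */
	}
	pthread_mutex_unlock(&c->siglock);
	return code;
}

int main(void)
{
	struct toy_child c = { PTHREAD_MUTEX_INITIALIZER, 1, 0x13 };

	printf("peek: %#x\n", toy_collect_stop_code(&c, 1));
	printf("reap: %#x\n", toy_collect_stop_code(&c, 0));
	printf("again: %#x\n", toy_collect_stop_code(&c, 0));
	return 0;
}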
@@ -1442,15 +1382,13 @@ bail_ref:
1442 if (!retval && infop) 1382 if (!retval && infop)
1443 retval = put_user(0, &infop->si_errno); 1383 retval = put_user(0, &infop->si_errno);
1444 if (!retval && infop) 1384 if (!retval && infop)
1445 retval = put_user((short)((p->ptrace & PT_PTRACED) 1385 retval = put_user(why, &infop->si_code);
1446 ? CLD_TRAPPED : CLD_STOPPED),
1447 &infop->si_code);
1448 if (!retval && infop) 1386 if (!retval && infop)
1449 retval = put_user(exit_code, &infop->si_status); 1387 retval = put_user(exit_code, &infop->si_status);
1450 if (!retval && infop) 1388 if (!retval && infop)
1451 retval = put_user(pid, &infop->si_pid); 1389 retval = put_user(pid, &infop->si_pid);
1452 if (!retval && infop) 1390 if (!retval && infop)
1453 retval = put_user(p->uid, &infop->si_uid); 1391 retval = put_user(uid, &infop->si_uid);
1454 if (!retval) 1392 if (!retval)
1455 retval = pid; 1393 retval = pid;
1456 put_task_struct(p); 1394 put_task_struct(p);
@@ -1472,7 +1410,6 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1472 int retval; 1410 int retval;
1473 pid_t pid; 1411 pid_t pid;
1474 uid_t uid; 1412 uid_t uid;
1475 struct pid_namespace *ns;
1476 1413
1477 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1414 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1478 return 0; 1415 return 0;
@@ -1487,8 +1424,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1487 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1424 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1488 spin_unlock_irq(&p->sighand->siglock); 1425 spin_unlock_irq(&p->sighand->siglock);
1489 1426
1490 ns = current->nsproxy->pid_ns; 1427 pid = task_pid_vnr(p);
1491 pid = task_pid_nr_ns(p, ns);
1492 uid = p->uid; 1428 uid = p->uid;
1493 get_task_struct(p); 1429 get_task_struct(p);
1494 read_unlock(&tasklist_lock); 1430 read_unlock(&tasklist_lock);
@@ -1499,7 +1435,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1499 if (!retval && stat_addr) 1435 if (!retval && stat_addr)
1500 retval = put_user(0xffff, stat_addr); 1436 retval = put_user(0xffff, stat_addr);
1501 if (!retval) 1437 if (!retval)
1502 retval = task_pid_nr_ns(p, ns); 1438 retval = pid;
1503 } else { 1439 } else {
1504 retval = wait_noreap_copyout(p, pid, uid, 1440 retval = wait_noreap_copyout(p, pid, uid,
1505 CLD_CONTINUED, SIGCONT, 1441 CLD_CONTINUED, SIGCONT,
@@ -1510,103 +1446,63 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1510 return retval; 1446 return retval;
1511} 1447}
1512 1448
1513 1449static long do_wait(enum pid_type type, struct pid *pid, int options,
1514static inline int my_ptrace_child(struct task_struct *p) 1450 struct siginfo __user *infop, int __user *stat_addr,
1515{ 1451 struct rusage __user *ru)
1516 if (!(p->ptrace & PT_PTRACED))
1517 return 0;
1518 if (!(p->ptrace & PT_ATTACHED))
1519 return 1;
1520 /*
1521 * This child was PTRACE_ATTACH'd. We should be seeing it only if
1522 * we are the attacher. If we are the real parent, this is a race
1523 * inside ptrace_attach. It is waiting for the tasklist_lock,
1524 * which we have to switch the parent links, but has already set
1525 * the flags in p->ptrace.
1526 */
1527 return (p->parent != p->real_parent);
1528}
1529
1530static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
1531 int __user *stat_addr, struct rusage __user *ru)
1532{ 1452{
1533 DECLARE_WAITQUEUE(wait, current); 1453 DECLARE_WAITQUEUE(wait, current);
1534 struct task_struct *tsk; 1454 struct task_struct *tsk;
1535 int flag, retval; 1455 int flag, retval;
1536 int allowed, denied;
1537 1456
1538 add_wait_queue(&current->signal->wait_chldexit,&wait); 1457 add_wait_queue(&current->signal->wait_chldexit,&wait);
1539repeat: 1458repeat:
1459 /* If there is nothing that can match our criteria just get out */
1460 retval = -ECHILD;
1461 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
1462 goto end;
1463
1540 /* 1464 /*
1541 * We will set this flag if we see any child that might later 1465 * We will set this flag if we see any child that might later
1542 * match our criteria, even if we are not able to reap it yet. 1466 * match our criteria, even if we are not able to reap it yet.
1543 */ 1467 */
1544 flag = 0; 1468 flag = retval = 0;
1545 allowed = denied = 0;
1546 current->state = TASK_INTERRUPTIBLE; 1469 current->state = TASK_INTERRUPTIBLE;
1547 read_lock(&tasklist_lock); 1470 read_lock(&tasklist_lock);
1548 tsk = current; 1471 tsk = current;
1549 do { 1472 do {
1550 struct task_struct *p; 1473 struct task_struct *p;
1551 int ret;
1552 1474
1553 list_for_each_entry(p, &tsk->children, sibling) { 1475 list_for_each_entry(p, &tsk->children, sibling) {
1554 ret = eligible_child(pid, options, p); 1476 int ret = eligible_child(type, pid, options, p);
1555 if (!ret) 1477 if (!ret)
1556 continue; 1478 continue;
1557 1479
1558 if (unlikely(ret < 0)) { 1480 if (unlikely(ret < 0)) {
1559 denied = ret; 1481 retval = ret;
1560 continue; 1482 } else if (task_is_stopped_or_traced(p)) {
1561 }
1562 allowed = 1;
1563
1564 if (task_is_stopped_or_traced(p)) {
1565 /* 1483 /*
1566 * It's stopped now, so it might later 1484 * It's stopped now, so it might later
1567 * continue, exit, or stop again. 1485 * continue, exit, or stop again.
1568 *
1569 * When we hit the race with PTRACE_ATTACH, we
1570 * will not report this child. But the race
1571 * means it has not yet been moved to our
1572 * ptrace_children list, so we need to set the
1573 * flag here to avoid a spurious ECHILD when
1574 * the race happens with the only child.
1575 */ 1486 */
1576 flag = 1; 1487 flag = 1;
1488 if (!(p->ptrace & PT_PTRACED) &&
1489 !(options & WUNTRACED))
1490 continue;
1577 1491
1578 if (!my_ptrace_child(p)) { 1492 retval = wait_task_stopped(p,
1579 if (task_is_traced(p))
1580 continue;
1581 if (!(options & WUNTRACED))
1582 continue;
1583 }
1584
1585 retval = wait_task_stopped(p, ret == 2,
1586 (options & WNOWAIT), infop, 1493 (options & WNOWAIT), infop,
1587 stat_addr, ru); 1494 stat_addr, ru);
1588 if (retval == -EAGAIN) 1495 } else if (p->exit_state == EXIT_ZOMBIE &&
1589 goto repeat; 1496 !delay_group_leader(p)) {
1590 if (retval != 0) /* He released the lock. */
1591 goto end;
1592 } else if (p->exit_state == EXIT_DEAD) {
1593 continue;
1594 } else if (p->exit_state == EXIT_ZOMBIE) {
1595 /* 1497 /*
1596 * Eligible but we cannot release it yet: 1498 * We don't reap group leaders with subthreads.
1597 */ 1499 */
1598 if (ret == 2)
1599 goto check_continued;
1600 if (!likely(options & WEXITED)) 1500 if (!likely(options & WEXITED))
1601 continue; 1501 continue;
1602 retval = wait_task_zombie(p, 1502 retval = wait_task_zombie(p,
1603 (options & WNOWAIT), infop, 1503 (options & WNOWAIT), infop,
1604 stat_addr, ru); 1504 stat_addr, ru);
1605 /* He released the lock. */ 1505 } else if (p->exit_state != EXIT_DEAD) {
1606 if (retval != 0)
1607 goto end;
1608 } else {
1609check_continued:
1610 /* 1506 /*
1611 * It's running now, so it might later 1507 * It's running now, so it might later
1612 * exit, stop, or stop and then continue. 1508 * exit, stop, or stop and then continue.
@@ -1617,17 +1513,20 @@ check_continued:
1617 retval = wait_task_continued(p, 1513 retval = wait_task_continued(p,
1618 (options & WNOWAIT), infop, 1514 (options & WNOWAIT), infop,
1619 stat_addr, ru); 1515 stat_addr, ru);
1620 if (retval != 0) /* He released the lock. */
1621 goto end;
1622 } 1516 }
1517 if (retval != 0) /* tasklist_lock released */
1518 goto end;
1623 } 1519 }
1624 if (!flag) { 1520 if (!flag) {
1625 list_for_each_entry(p, &tsk->ptrace_children, 1521 list_for_each_entry(p, &tsk->ptrace_children,
1626 ptrace_list) { 1522 ptrace_list) {
1627 if (!eligible_child(pid, options, p)) 1523 flag = eligible_child(type, pid, options, p);
1524 if (!flag)
1628 continue; 1525 continue;
1629 flag = 1; 1526 if (likely(flag > 0))
1630 break; 1527 break;
1528 retval = flag;
1529 goto end;
1631 } 1530 }
1632 } 1531 }
1633 if (options & __WNOTHREAD) 1532 if (options & __WNOTHREAD)
@@ -1635,10 +1534,9 @@ check_continued:
1635 tsk = next_thread(tsk); 1534 tsk = next_thread(tsk);
1636 BUG_ON(tsk->signal != current->signal); 1535 BUG_ON(tsk->signal != current->signal);
1637 } while (tsk != current); 1536 } while (tsk != current);
1638
1639 read_unlock(&tasklist_lock); 1537 read_unlock(&tasklist_lock);
1538
1640 if (flag) { 1539 if (flag) {
1641 retval = 0;
1642 if (options & WNOHANG) 1540 if (options & WNOHANG)
1643 goto end; 1541 goto end;
1644 retval = -ERESTARTSYS; 1542 retval = -ERESTARTSYS;
@@ -1648,14 +1546,12 @@ check_continued:
1648 goto repeat; 1546 goto repeat;
1649 } 1547 }
1650 retval = -ECHILD; 1548 retval = -ECHILD;
1651 if (unlikely(denied) && !allowed)
1652 retval = denied;
1653end: 1549end:
1654 current->state = TASK_RUNNING; 1550 current->state = TASK_RUNNING;
1655 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1551 remove_wait_queue(&current->signal->wait_chldexit,&wait);
1656 if (infop) { 1552 if (infop) {
1657 if (retval > 0) 1553 if (retval > 0)
1658 retval = 0; 1554 retval = 0;
1659 else { 1555 else {
1660 /* 1556 /*
1661 * For a WNOHANG return, clear out all the fields 1557 * For a WNOHANG return, clear out all the fields
@@ -1679,10 +1575,12 @@ end:
1679 return retval; 1575 return retval;
1680} 1576}
1681 1577
1682asmlinkage long sys_waitid(int which, pid_t pid, 1578asmlinkage long sys_waitid(int which, pid_t upid,
1683 struct siginfo __user *infop, int options, 1579 struct siginfo __user *infop, int options,
1684 struct rusage __user *ru) 1580 struct rusage __user *ru)
1685{ 1581{
1582 struct pid *pid = NULL;
1583 enum pid_type type;
1686 long ret; 1584 long ret;
1687 1585
1688 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) 1586 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
@@ -1692,37 +1590,58 @@ asmlinkage long sys_waitid(int which, pid_t pid,
1692 1590
1693 switch (which) { 1591 switch (which) {
1694 case P_ALL: 1592 case P_ALL:
1695 pid = -1; 1593 type = PIDTYPE_MAX;
1696 break; 1594 break;
1697 case P_PID: 1595 case P_PID:
1698 if (pid <= 0) 1596 type = PIDTYPE_PID;
1597 if (upid <= 0)
1699 return -EINVAL; 1598 return -EINVAL;
1700 break; 1599 break;
1701 case P_PGID: 1600 case P_PGID:
1702 if (pid <= 0) 1601 type = PIDTYPE_PGID;
1602 if (upid <= 0)
1703 return -EINVAL; 1603 return -EINVAL;
1704 pid = -pid;
1705 break; 1604 break;
1706 default: 1605 default:
1707 return -EINVAL; 1606 return -EINVAL;
1708 } 1607 }
1709 1608
1710 ret = do_wait(pid, options, infop, NULL, ru); 1609 if (type < PIDTYPE_MAX)
1610 pid = find_get_pid(upid);
1611 ret = do_wait(type, pid, options, infop, NULL, ru);
1612 put_pid(pid);
1711 1613
1712 /* avoid REGPARM breakage on x86: */ 1614 /* avoid REGPARM breakage on x86: */
1713 prevent_tail_call(ret); 1615 prevent_tail_call(ret);
1714 return ret; 1616 return ret;
1715} 1617}
1716 1618
1717asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, 1619asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
1718 int options, struct rusage __user *ru) 1620 int options, struct rusage __user *ru)
1719{ 1621{
1622 struct pid *pid = NULL;
1623 enum pid_type type;
1720 long ret; 1624 long ret;
1721 1625
1722 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| 1626 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1723 __WNOTHREAD|__WCLONE|__WALL)) 1627 __WNOTHREAD|__WCLONE|__WALL))
1724 return -EINVAL; 1628 return -EINVAL;
1725 ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); 1629
1630 if (upid == -1)
1631 type = PIDTYPE_MAX;
1632 else if (upid < 0) {
1633 type = PIDTYPE_PGID;
1634 pid = find_get_pid(-upid);
1635 } else if (upid == 0) {
1636 type = PIDTYPE_PGID;
1637 pid = get_pid(task_pgrp(current));
1638 } else /* upid > 0 */ {
1639 type = PIDTYPE_PID;
1640 pid = find_get_pid(upid);
1641 }
1642
1643 ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru);
1644 put_pid(pid);
1726 1645
1727 /* avoid REGPARM breakage on x86: */ 1646 /* avoid REGPARM breakage on x86: */
1728 prevent_tail_call(ret); 1647 prevent_tail_call(ret);
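
For reference, the reworked sys_wait4() above boils down to a small classification of the user-supplied pid argument before the struct pid lookup. A minimal standalone sketch of just that mapping (plain userspace C; names are local to this example, not the kernel's code):

#include <stdio.h>

enum pid_type { PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_MAX };

static enum pid_type classify_wait4_arg(long upid)
{
	if (upid == -1)
		return PIDTYPE_MAX;	/* wait for any child */
	if (upid < 0)
		return PIDTYPE_PGID;	/* wait for process group -upid */
	if (upid == 0)
		return PIDTYPE_PGID;	/* wait for the caller's own process group */
	return PIDTYPE_PID;		/* wait for exactly this pid */
}

int main(void)
{
	long samples[] = { -1, -42, 0, 1234 };

	for (int i = 0; i < 4; i++)
		printf("upid %ld -> pid type %d\n",
		       samples[i], (int)classify_wait4_arg(samples[i]));
	return 0;
}
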
diff --git a/kernel/fork.c b/kernel/fork.c
index 05e0b6f4365b..dd249c37b3a3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/audit.h> 42#include <linux/audit.h>
43#include <linux/memcontrol.h>
43#include <linux/profile.h> 44#include <linux/profile.h>
44#include <linux/rmap.h> 45#include <linux/rmap.h>
45#include <linux/acct.h> 46#include <linux/acct.h>
@@ -325,7 +326,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
325 326
326static inline void mm_free_pgd(struct mm_struct * mm) 327static inline void mm_free_pgd(struct mm_struct * mm)
327{ 328{
328 pgd_free(mm->pgd); 329 pgd_free(mm, mm->pgd);
329} 330}
330#else 331#else
331#define dup_mmap(mm, oldmm) (0) 332#define dup_mmap(mm, oldmm) (0)
@@ -340,7 +341,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
340 341
341#include <linux/init_task.h> 342#include <linux/init_task.h>
342 343
343static struct mm_struct * mm_init(struct mm_struct * mm) 344static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
344{ 345{
345 atomic_set(&mm->mm_users, 1); 346 atomic_set(&mm->mm_users, 1);
346 atomic_set(&mm->mm_count, 1); 347 atomic_set(&mm->mm_count, 1);
@@ -357,11 +358,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
357 mm->ioctx_list = NULL; 358 mm->ioctx_list = NULL;
358 mm->free_area_cache = TASK_UNMAPPED_BASE; 359 mm->free_area_cache = TASK_UNMAPPED_BASE;
359 mm->cached_hole_size = ~0UL; 360 mm->cached_hole_size = ~0UL;
361 mm_init_cgroup(mm, p);
360 362
361 if (likely(!mm_alloc_pgd(mm))) { 363 if (likely(!mm_alloc_pgd(mm))) {
362 mm->def_flags = 0; 364 mm->def_flags = 0;
363 return mm; 365 return mm;
364 } 366 }
367
368 mm_free_cgroup(mm);
365 free_mm(mm); 369 free_mm(mm);
366 return NULL; 370 return NULL;
367} 371}
@@ -376,7 +380,7 @@ struct mm_struct * mm_alloc(void)
376 mm = allocate_mm(); 380 mm = allocate_mm();
377 if (mm) { 381 if (mm) {
378 memset(mm, 0, sizeof(*mm)); 382 memset(mm, 0, sizeof(*mm));
379 mm = mm_init(mm); 383 mm = mm_init(mm, current);
380 } 384 }
381 return mm; 385 return mm;
382} 386}
@@ -386,10 +390,11 @@ struct mm_struct * mm_alloc(void)
386 * is dropped: either by a lazy thread or by 390 * is dropped: either by a lazy thread or by
387 * mmput. Free the page directory and the mm. 391 * mmput. Free the page directory and the mm.
388 */ 392 */
389void fastcall __mmdrop(struct mm_struct *mm) 393void __mmdrop(struct mm_struct *mm)
390{ 394{
391 BUG_ON(mm == &init_mm); 395 BUG_ON(mm == &init_mm);
392 mm_free_pgd(mm); 396 mm_free_pgd(mm);
397 mm_free_cgroup(mm);
393 destroy_context(mm); 398 destroy_context(mm);
394 free_mm(mm); 399 free_mm(mm);
395} 400}
@@ -511,7 +516,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
511 mm->token_priority = 0; 516 mm->token_priority = 0;
512 mm->last_interval = 0; 517 mm->last_interval = 0;
513 518
514 if (!mm_init(mm)) 519 if (!mm_init(mm, tsk))
515 goto fail_nomem; 520 goto fail_nomem;
516 521
517 if (init_new_context(tsk, mm)) 522 if (init_new_context(tsk, mm))
@@ -595,16 +600,16 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
595 rwlock_init(&fs->lock); 600 rwlock_init(&fs->lock);
596 fs->umask = old->umask; 601 fs->umask = old->umask;
597 read_lock(&old->lock); 602 read_lock(&old->lock);
598 fs->rootmnt = mntget(old->rootmnt); 603 fs->root = old->root;
599 fs->root = dget(old->root); 604 path_get(&old->root);
600 fs->pwdmnt = mntget(old->pwdmnt); 605 fs->pwd = old->pwd;
601 fs->pwd = dget(old->pwd); 606 path_get(&old->pwd);
602 if (old->altroot) { 607 if (old->altroot.dentry) {
603 fs->altrootmnt = mntget(old->altrootmnt); 608 fs->altroot = old->altroot;
604 fs->altroot = dget(old->altroot); 609 path_get(&old->altroot);
605 } else { 610 } else {
606 fs->altrootmnt = NULL; 611 fs->altroot.mnt = NULL;
607 fs->altroot = NULL; 612 fs->altroot.dentry = NULL;
608 } 613 }
609 read_unlock(&old->lock); 614 read_unlock(&old->lock);
610 } 615 }
@@ -904,7 +909,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 909 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
905 sig->it_real_incr.tv64 = 0; 910 sig->it_real_incr.tv64 = 0;
906 sig->real_timer.function = it_real_fn; 911 sig->real_timer.function = it_real_fn;
907 sig->tsk = tsk;
908 912
909 sig->it_virt_expires = cputime_zero; 913 sig->it_virt_expires = cputime_zero;
910 sig->it_virt_incr = cputime_zero; 914 sig->it_virt_incr = cputime_zero;
@@ -1118,6 +1122,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1118#ifdef CONFIG_SECURITY 1122#ifdef CONFIG_SECURITY
1119 p->security = NULL; 1123 p->security = NULL;
1120#endif 1124#endif
1125 p->cap_bset = current->cap_bset;
1121 p->io_context = NULL; 1126 p->io_context = NULL;
1122 p->audit_context = NULL; 1127 p->audit_context = NULL;
1123 cgroup_fork(p); 1128 cgroup_fork(p);
@@ -1332,6 +1337,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1332 if (clone_flags & CLONE_NEWPID) 1337 if (clone_flags & CLONE_NEWPID)
1333 p->nsproxy->pid_ns->child_reaper = p; 1338 p->nsproxy->pid_ns->child_reaper = p;
1334 1339
1340 p->signal->leader_pid = pid;
1335 p->signal->tty = current->signal->tty; 1341 p->signal->tty = current->signal->tty;
1336 set_task_pgrp(p, task_pgrp_nr(current)); 1342 set_task_pgrp(p, task_pgrp_nr(current));
1337 set_task_session(p, task_session_nr(current)); 1343 set_task_session(p, task_session_nr(current));
@@ -1398,7 +1404,7 @@ fork_out:
1398 return ERR_PTR(retval); 1404 return ERR_PTR(retval);
1399} 1405}
1400 1406
1401noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1407noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1402{ 1408{
1403 memset(regs, 0, sizeof(struct pt_regs)); 1409 memset(regs, 0, sizeof(struct pt_regs));
1404 return regs; 1410 return regs;
@@ -1450,6 +1456,23 @@ long do_fork(unsigned long clone_flags,
1450 int trace = 0; 1456 int trace = 0;
1451 long nr; 1457 long nr;
1452 1458
1459 /*
1460 * We hope to recycle these flags after 2.6.26
1461 */
1462 if (unlikely(clone_flags & CLONE_STOPPED)) {
1463 static int __read_mostly count = 100;
1464
1465 if (count > 0 && printk_ratelimit()) {
1466 char comm[TASK_COMM_LEN];
1467
1468 count--;
1469 printk(KERN_INFO "fork(): process `%s' used deprecated "
1470 "clone flags 0x%lx\n",
1471 get_task_comm(comm, current),
1472 clone_flags & CLONE_STOPPED);
1473 }
1474 }
1475
1453 if (unlikely(current->ptrace)) { 1476 if (unlikely(current->ptrace)) {
1454 trace = fork_traceflag (clone_flags); 1477 trace = fork_traceflag (clone_flags);
1455 if (trace) 1478 if (trace)
@@ -1465,13 +1488,7 @@ long do_fork(unsigned long clone_flags,
1465 if (!IS_ERR(p)) { 1488 if (!IS_ERR(p)) {
1466 struct completion vfork; 1489 struct completion vfork;
1467 1490
1468 /* 1491 nr = task_pid_vnr(p);
1469 * this is enough to call pid_nr_ns here, but this if
1470 * improves optimisation of regular fork()
1471 */
1472 nr = (clone_flags & CLONE_NEWPID) ?
1473 task_pid_nr_ns(p, current->nsproxy->pid_ns) :
1474 task_pid_vnr(p);
1475 1492
1476 if (clone_flags & CLONE_PARENT_SETTID) 1493 if (clone_flags & CLONE_PARENT_SETTID)
1477 put_user(nr, parent_tidptr); 1494 put_user(nr, parent_tidptr);
@@ -1492,7 +1509,7 @@ long do_fork(unsigned long clone_flags,
1492 if (!(clone_flags & CLONE_STOPPED)) 1509 if (!(clone_flags & CLONE_STOPPED))
1493 wake_up_new_task(p, clone_flags); 1510 wake_up_new_task(p, clone_flags);
1494 else 1511 else
1495 p->state = TASK_STOPPED; 1512 __set_task_state(p, TASK_STOPPED);
1496 1513
1497 if (unlikely (trace)) { 1514 if (unlikely (trace)) {
1498 current->ptrace_message = nr; 1515 current->ptrace_message = nr;
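
The CLONE_STOPPED hunk above adds a capped, rate-limited deprecation warning instead of refusing the flag outright. A rough userspace analogue of that pattern (ratelimit() here is only a stand-in for printk_ratelimit(), not a kernel interface):

#include <stdio.h>
#include <time.h>

#define CLONE_STOPPED 0x02000000

/* crude stand-in for printk_ratelimit(): allow one message per second */
static int ratelimit(void)
{
	static time_t last;
	time_t now = time(NULL);

	if (now == last)
		return 0;
	last = now;
	return 1;
}

static void warn_deprecated_clone(unsigned long clone_flags, const char *comm)
{
	static int count = 100;		/* overall cap, as in the hunk above */

	if (!(clone_flags & CLONE_STOPPED))
		return;
	if (count > 0 && ratelimit()) {
		count--;
		printf("fork(): process `%s' used deprecated clone flags 0x%lx\n",
		       comm, clone_flags & CLONE_STOPPED);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		warn_deprecated_clone(CLONE_STOPPED, "demo");
	return 0;
}
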
diff --git a/kernel/futex.c b/kernel/futex.c
index a6baaec44b8f..221f2128a437 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2116,7 +2116,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2116 2116
2117 t = timespec_to_ktime(ts); 2117 t = timespec_to_ktime(ts);
2118 if (cmd == FUTEX_WAIT) 2118 if (cmd == FUTEX_WAIT)
2119 t = ktime_add(ktime_get(), t); 2119 t = ktime_add_safe(ktime_get(), t);
2120 tp = &t; 2120 tp = &t;
2121 } 2121 }
2122 /* 2122 /*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 133d558db452..7d5e4b016f39 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -176,7 +176,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
176 176
177 t = timespec_to_ktime(ts); 177 t = timespec_to_ktime(ts);
178 if (cmd == FUTEX_WAIT) 178 if (cmd == FUTEX_WAIT)
179 t = ktime_add(ktime_get(), t); 179 t = ktime_add_safe(ktime_get(), t);
180 tp = &t; 180 tp = &t;
181 } 181 }
182 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 182 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
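
Both futex paths now go through ktime_add_safe(), introduced in the hrtimer.c hunks just below: rather than letting a huge relative timeout wrap into the past, the sum is clamped to the largest representable timeout. A standalone sketch of that saturating addition on plain 64-bit nanosecond values (the constants are illustrative, not the kernel's ktime internals):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000LL
#define SEC_MAX		(INT64_MAX / NSEC_PER_SEC)	/* analogue of KTIME_SEC_MAX */

/* add two nanosecond values, clamping instead of wrapping on overflow */
static int64_t add_ns_safe(int64_t lhs, int64_t rhs)
{
	/* do the add unsigned to avoid signed-overflow undefined behaviour */
	int64_t res = (int64_t)((uint64_t)lhs + (uint64_t)rhs);

	if (res < 0 || res < lhs || res < rhs)
		res = SEC_MAX * NSEC_PER_SEC;
	return res;
}

int main(void)
{
	printf("near overflow: %lld\n", (long long)add_ns_safe(INT64_MAX - 5, 100));
	printf("normal case:   %lld\n", (long long)add_ns_safe(1000, 2000));
	return 0;
}
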
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1069998fe25f..98bee013f71f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -306,7 +306,7 @@ EXPORT_SYMBOL_GPL(ktime_sub_ns);
306/* 306/*
307 * Divide a ktime value by a nanosecond value 307 * Divide a ktime value by a nanosecond value
308 */ 308 */
309unsigned long ktime_divns(const ktime_t kt, s64 div) 309u64 ktime_divns(const ktime_t kt, s64 div)
310{ 310{
311 u64 dclc, inc, dns; 311 u64 dclc, inc, dns;
312 int sft = 0; 312 int sft = 0;
@@ -321,11 +321,28 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
321 dclc >>= sft; 321 dclc >>= sft;
322 do_div(dclc, (unsigned long) div); 322 do_div(dclc, (unsigned long) div);
323 323
324 return (unsigned long) dclc; 324 return dclc;
325} 325}
326#endif /* BITS_PER_LONG >= 64 */ 326#endif /* BITS_PER_LONG >= 64 */
327 327
328/* 328/*
329 * Add two ktime values and do a safety check for overflow:
330 */
331ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
332{
333 ktime_t res = ktime_add(lhs, rhs);
334
335 /*
336 * We use KTIME_SEC_MAX here, the maximum timeout which we can
337 * return to user space in a timespec:
338 */
339 if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
340 res = ktime_set(KTIME_SEC_MAX, 0);
341
342 return res;
343}
344
345/*
329 * Check, whether the timer is on the callback pending list 346 * Check, whether the timer is on the callback pending list
330 */ 347 */
331static inline int hrtimer_cb_pending(const struct hrtimer *timer) 348static inline int hrtimer_cb_pending(const struct hrtimer *timer)
@@ -425,6 +442,8 @@ static int hrtimer_reprogram(struct hrtimer *timer,
425 ktime_t expires = ktime_sub(timer->expires, base->offset); 442 ktime_t expires = ktime_sub(timer->expires, base->offset);
426 int res; 443 int res;
427 444
445 WARN_ON_ONCE(timer->expires.tv64 < 0);
446
428 /* 447 /*
429 * When the callback is running, we do not reprogram the clock event 448 * When the callback is running, we do not reprogram the clock event
430 * device. The timer callback is either running on a different CPU or 449 * device. The timer callback is either running on a different CPU or
@@ -435,6 +454,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
435 if (hrtimer_callback_running(timer)) 454 if (hrtimer_callback_running(timer))
436 return 0; 455 return 0;
437 456
457 /*
458 * CLOCK_REALTIME timer might be requested with an absolute
459 * expiry time which is less than base->offset. Nothing wrong
460 * about that, just avoid to call into the tick code, which
461 * has now objections against negative expiry values.
462 */
463 if (expires.tv64 < 0)
464 return -ETIME;
465
438 if (expires.tv64 >= expires_next->tv64) 466 if (expires.tv64 >= expires_next->tv64)
439 return 0; 467 return 0;
440 468
@@ -656,10 +684,9 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
656 * Forward the timer expiry so it will expire in the future. 684 * Forward the timer expiry so it will expire in the future.
657 * Returns the number of overruns. 685 * Returns the number of overruns.
658 */ 686 */
659unsigned long 687u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
660hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
661{ 688{
662 unsigned long orun = 1; 689 u64 orun = 1;
663 ktime_t delta; 690 ktime_t delta;
664 691
665 delta = ktime_sub(now, timer->expires); 692 delta = ktime_sub(now, timer->expires);
@@ -683,13 +710,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
683 */ 710 */
684 orun++; 711 orun++;
685 } 712 }
686 timer->expires = ktime_add(timer->expires, interval); 713 timer->expires = ktime_add_safe(timer->expires, interval);
687 /*
688 * Make sure, that the result did not wrap with a very large
689 * interval.
690 */
691 if (timer->expires.tv64 < 0)
692 timer->expires = ktime_set(KTIME_SEC_MAX, 0);
693 714
694 return orun; 715 return orun;
695} 716}
@@ -840,7 +861,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
840 new_base = switch_hrtimer_base(timer, base); 861 new_base = switch_hrtimer_base(timer, base);
841 862
842 if (mode == HRTIMER_MODE_REL) { 863 if (mode == HRTIMER_MODE_REL) {
843 tim = ktime_add(tim, new_base->get_time()); 864 tim = ktime_add_safe(tim, new_base->get_time());
844 /* 865 /*
845 * CONFIG_TIME_LOW_RES is a temporary way for architectures 866 * CONFIG_TIME_LOW_RES is a temporary way for architectures
846 * to signal that they simply return xtime in 867 * to signal that they simply return xtime in
@@ -849,16 +870,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
849 * timeouts. This will go away with the GTOD framework. 870 * timeouts. This will go away with the GTOD framework.
850 */ 871 */
851#ifdef CONFIG_TIME_LOW_RES 872#ifdef CONFIG_TIME_LOW_RES
852 tim = ktime_add(tim, base->resolution); 873 tim = ktime_add_safe(tim, base->resolution);
853#endif 874#endif
854 /*
855 * Careful here: User space might have asked for a
856 * very long sleep, so the add above might result in a
857 * negative number, which enqueues the timer in front
858 * of the queue.
859 */
860 if (tim.tv64 < 0)
861 tim.tv64 = KTIME_MAX;
862 } 875 }
863 timer->expires = tim; 876 timer->expires = tim;
864 877
@@ -1320,13 +1333,26 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1320 return t->task == NULL; 1333 return t->task == NULL;
1321} 1334}
1322 1335
1336static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
1337{
1338 struct timespec rmt;
1339 ktime_t rem;
1340
1341 rem = ktime_sub(timer->expires, timer->base->get_time());
1342 if (rem.tv64 <= 0)
1343 return 0;
1344 rmt = ktime_to_timespec(rem);
1345
1346 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
1347 return -EFAULT;
1348
1349 return 1;
1350}
1351
1323long __sched hrtimer_nanosleep_restart(struct restart_block *restart) 1352long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1324{ 1353{
1325 struct hrtimer_sleeper t; 1354 struct hrtimer_sleeper t;
1326 struct timespec *rmtp; 1355 struct timespec __user *rmtp;
1327 ktime_t time;
1328
1329 restart->fn = do_no_restart_syscall;
1330 1356
1331 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); 1357 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
1332 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; 1358 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
@@ -1334,26 +1360,22 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1334 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1360 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1335 return 0; 1361 return 0;
1336 1362
1337 rmtp = (struct timespec *)restart->arg1; 1363 rmtp = (struct timespec __user *)restart->arg1;
1338 if (rmtp) { 1364 if (rmtp) {
1339 time = ktime_sub(t.timer.expires, t.timer.base->get_time()); 1365 int ret = update_rmtp(&t.timer, rmtp);
1340 if (time.tv64 <= 0) 1366 if (ret <= 0)
1341 return 0; 1367 return ret;
1342 *rmtp = ktime_to_timespec(time);
1343 } 1368 }
1344 1369
1345 restart->fn = hrtimer_nanosleep_restart;
1346
1347 /* The other values in restart are already filled in */ 1370 /* The other values in restart are already filled in */
1348 return -ERESTART_RESTARTBLOCK; 1371 return -ERESTART_RESTARTBLOCK;
1349} 1372}
1350 1373
1351long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, 1374long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1352 const enum hrtimer_mode mode, const clockid_t clockid) 1375 const enum hrtimer_mode mode, const clockid_t clockid)
1353{ 1376{
1354 struct restart_block *restart; 1377 struct restart_block *restart;
1355 struct hrtimer_sleeper t; 1378 struct hrtimer_sleeper t;
1356 ktime_t rem;
1357 1379
1358 hrtimer_init(&t.timer, clockid, mode); 1380 hrtimer_init(&t.timer, clockid, mode);
1359 t.timer.expires = timespec_to_ktime(*rqtp); 1381 t.timer.expires = timespec_to_ktime(*rqtp);
@@ -1365,10 +1387,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp,
1365 return -ERESTARTNOHAND; 1387 return -ERESTARTNOHAND;
1366 1388
1367 if (rmtp) { 1389 if (rmtp) {
1368 rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); 1390 int ret = update_rmtp(&t.timer, rmtp);
1369 if (rem.tv64 <= 0) 1391 if (ret <= 0)
1370 return 0; 1392 return ret;
1371 *rmtp = ktime_to_timespec(rem);
1372 } 1393 }
1373 1394
1374 restart = &current_thread_info()->restart_block; 1395 restart = &current_thread_info()->restart_block;
@@ -1384,8 +1405,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp,
1384asmlinkage long 1405asmlinkage long
1385sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) 1406sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1386{ 1407{
1387 struct timespec tu, rmt; 1408 struct timespec tu;
1388 int ret;
1389 1409
1390 if (copy_from_user(&tu, rqtp, sizeof(tu))) 1410 if (copy_from_user(&tu, rqtp, sizeof(tu)))
1391 return -EFAULT; 1411 return -EFAULT;
@@ -1393,15 +1413,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1393 if (!timespec_valid(&tu)) 1413 if (!timespec_valid(&tu))
1394 return -EINVAL; 1414 return -EINVAL;
1395 1415
1396 ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL, 1416 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1397 CLOCK_MONOTONIC);
1398
1399 if (ret && rmtp) {
1400 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
1401 return -EFAULT;
1402 }
1403
1404 return ret;
1405} 1417}
1406 1418
1407/* 1419/*
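
The nanosleep rework above factors the "how much sleep is left" copy-out into a single update_rmtp() helper with a small tristate result. A minimal userspace sketch of that contract (plain struct timespec arithmetic; the negative value is only a placeholder for the -EFAULT copy failure):

#include <stdio.h>
#include <time.h>

static int update_rmtp(const struct timespec *expires,
		       const struct timespec *now, struct timespec *rmtp)
{
	long long rem = (expires->tv_sec - now->tv_sec) * 1000000000LL
			+ (expires->tv_nsec - now->tv_nsec);

	if (rem <= 0)
		return 0;		/* nothing left, the sleep is complete */
	if (!rmtp)
		return -1;		/* stand-in for the copy_to_user() failure */
	rmtp->tv_sec = rem / 1000000000LL;
	rmtp->tv_nsec = rem % 1000000000LL;
	return 1;			/* remainder written, restart the sleep */
}

int main(void)
{
	struct timespec exp = { 5, 0 }, now = { 2, 500000000 }, rmt;

	if (update_rmtp(&exp, &now, &rmt) == 1)
		printf("remaining: %ld.%09ld s\n", (long)rmt.tv_sec, rmt.tv_nsec);
	return 0;
}
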
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 44019ce30a14..cc54c6276356 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -286,7 +286,7 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
286 * Note: The caller is expected to handle the ack, clear, mask and 286 * Note: The caller is expected to handle the ack, clear, mask and
287 * unmask issues if necessary. 287 * unmask issues if necessary.
288 */ 288 */
289void fastcall 289void
290handle_simple_irq(unsigned int irq, struct irq_desc *desc) 290handle_simple_irq(unsigned int irq, struct irq_desc *desc)
291{ 291{
292 struct irqaction *action; 292 struct irqaction *action;
@@ -327,7 +327,7 @@ out_unlock:
327 * it after the associated handler has acknowledged the device, so the 327 * it after the associated handler has acknowledged the device, so the
328 * interrupt line is back to inactive. 328 * interrupt line is back to inactive.
329 */ 329 */
330void fastcall 330void
331handle_level_irq(unsigned int irq, struct irq_desc *desc) 331handle_level_irq(unsigned int irq, struct irq_desc *desc)
332{ 332{
333 unsigned int cpu = smp_processor_id(); 333 unsigned int cpu = smp_processor_id();
@@ -375,7 +375,7 @@ out_unlock:
375 * for modern forms of interrupt handlers, which handle the flow 375 * for modern forms of interrupt handlers, which handle the flow
376 * details in hardware, transparently. 376 * details in hardware, transparently.
377 */ 377 */
378void fastcall 378void
379handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 379handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
380{ 380{
381 unsigned int cpu = smp_processor_id(); 381 unsigned int cpu = smp_processor_id();
@@ -434,7 +434,7 @@ out:
434 * the handler was running. If all pending interrupts are handled, the 434 * the handler was running. If all pending interrupts are handled, the
435 * loop is left. 435 * loop is left.
436 */ 436 */
437void fastcall 437void
438handle_edge_irq(unsigned int irq, struct irq_desc *desc) 438handle_edge_irq(unsigned int irq, struct irq_desc *desc)
439{ 439{
440 const unsigned int cpu = smp_processor_id(); 440 const unsigned int cpu = smp_processor_id();
@@ -505,7 +505,7 @@ out_unlock:
505 * 505 *
506 * Per CPU interrupts on SMP machines without locking requirements 506 * Per CPU interrupts on SMP machines without locking requirements
507 */ 507 */
508void fastcall 508void
509handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 509handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
510{ 510{
511 irqreturn_t action_ret; 511 irqreturn_t action_ret;
@@ -589,3 +589,39 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
589 set_irq_chip(irq, chip); 589 set_irq_chip(irq, chip);
590 __set_irq_handler(irq, handle, 0, name); 590 __set_irq_handler(irq, handle, 0, name);
591} 591}
592
593void __init set_irq_noprobe(unsigned int irq)
594{
595 struct irq_desc *desc;
596 unsigned long flags;
597
598 if (irq >= NR_IRQS) {
599 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
600
601 return;
602 }
603
604 desc = irq_desc + irq;
605
606 spin_lock_irqsave(&desc->lock, flags);
607 desc->status |= IRQ_NOPROBE;
608 spin_unlock_irqrestore(&desc->lock, flags);
609}
610
611void __init set_irq_probe(unsigned int irq)
612{
613 struct irq_desc *desc;
614 unsigned long flags;
615
616 if (irq >= NR_IRQS) {
617 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
618
619 return;
620 }
621
622 desc = irq_desc + irq;
623
624 spin_lock_irqsave(&desc->lock, flags);
625 desc->status &= ~IRQ_NOPROBE;
626 spin_unlock_irqrestore(&desc->lock, flags);
627}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index dc335ad27525..5fa6198e9139 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -25,7 +25,7 @@
25 * 25 *
26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage. 26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
27 */ 27 */
28void fastcall 28void
29handle_bad_irq(unsigned int irq, struct irq_desc *desc) 29handle_bad_irq(unsigned int irq, struct irq_desc *desc)
30{ 30{
31 print_irq_desc(irq, desc); 31 print_irq_desc(irq, desc);
@@ -163,7 +163,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
163 * This is the original x86 implementation which is used for every 163 * This is the original x86 implementation which is used for every
164 * interrupt type. 164 * interrupt type.
165 */ 165 */
166fastcall unsigned int __do_IRQ(unsigned int irq) 166unsigned int __do_IRQ(unsigned int irq)
167{ 167{
168 struct irq_desc *desc = irq_desc + irq; 168 struct irq_desc *desc = irq_desc + irq;
169 struct irqaction *action; 169 struct irqaction *action;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 2fab344dbf56..ab982747d9bd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -132,7 +132,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
132 struct signal_struct *sig = 132 struct signal_struct *sig =
133 container_of(timer, struct signal_struct, real_timer); 133 container_of(timer, struct signal_struct, real_timer);
134 134
135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); 135 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
136 136
137 return HRTIMER_NORESTART; 137 return HRTIMER_NORESTART;
138} 138}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7dadc71ce516..f091d13def00 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -53,14 +53,6 @@ static inline int is_kernel_inittext(unsigned long addr)
53 return 0; 53 return 0;
54} 54}
55 55
56static inline int is_kernel_extratext(unsigned long addr)
57{
58 if (addr >= (unsigned long)_sextratext
59 && addr <= (unsigned long)_eextratext)
60 return 1;
61 return 0;
62}
63
64static inline int is_kernel_text(unsigned long addr) 56static inline int is_kernel_text(unsigned long addr)
65{ 57{
66 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) 58 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
@@ -80,8 +72,7 @@ static int is_ksym_addr(unsigned long addr)
80 if (all_var) 72 if (all_var)
81 return is_kernel(addr); 73 return is_kernel(addr);
82 74
83 return is_kernel_text(addr) || is_kernel_inittext(addr) || 75 return is_kernel_text(addr) || is_kernel_inittext(addr);
84 is_kernel_extratext(addr);
85} 76}
86 77
87/* expand a compressed symbol data into the resulting uncompressed string, 78/* expand a compressed symbol data into the resulting uncompressed string,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a26eec9eb04..06a0e2775651 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1361,8 +1361,8 @@ unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
1361 1361
1362static int __init crash_save_vmcoreinfo_init(void) 1362static int __init crash_save_vmcoreinfo_init(void)
1363{ 1363{
1364 vmcoreinfo_append_str("OSRELEASE=%s\n", init_uts_ns.name.release); 1364 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1365 vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE); 1365 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1366 1366
1367 VMCOREINFO_SYMBOL(init_uts_ns); 1367 VMCOREINFO_SYMBOL(init_uts_ns);
1368 VMCOREINFO_SYMBOL(node_online_map); 1368 VMCOREINFO_SYMBOL(node_online_map);
@@ -1376,15 +1376,15 @@ static int __init crash_save_vmcoreinfo_init(void)
1376#ifdef CONFIG_SPARSEMEM 1376#ifdef CONFIG_SPARSEMEM
1377 VMCOREINFO_SYMBOL(mem_section); 1377 VMCOREINFO_SYMBOL(mem_section);
1378 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 1378 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1379 VMCOREINFO_SIZE(mem_section); 1379 VMCOREINFO_STRUCT_SIZE(mem_section);
1380 VMCOREINFO_OFFSET(mem_section, section_mem_map); 1380 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1381#endif 1381#endif
1382 VMCOREINFO_SIZE(page); 1382 VMCOREINFO_STRUCT_SIZE(page);
1383 VMCOREINFO_SIZE(pglist_data); 1383 VMCOREINFO_STRUCT_SIZE(pglist_data);
1384 VMCOREINFO_SIZE(zone); 1384 VMCOREINFO_STRUCT_SIZE(zone);
1385 VMCOREINFO_SIZE(free_area); 1385 VMCOREINFO_STRUCT_SIZE(free_area);
1386 VMCOREINFO_SIZE(list_head); 1386 VMCOREINFO_STRUCT_SIZE(list_head);
1387 VMCOREINFO_TYPEDEF_SIZE(nodemask_t); 1387 VMCOREINFO_SIZE(nodemask_t);
1388 VMCOREINFO_OFFSET(page, flags); 1388 VMCOREINFO_OFFSET(page, flags);
1389 VMCOREINFO_OFFSET(page, _count); 1389 VMCOREINFO_OFFSET(page, _count);
1390 VMCOREINFO_OFFSET(page, mapping); 1390 VMCOREINFO_OFFSET(page, mapping);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb7df2a28bd7..22be3ff3f363 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -173,10 +173,7 @@ static int ____call_usermodehelper(void *data)
173 */ 173 */
174 set_user_nice(current, 0); 174 set_user_nice(current, 0);
175 175
176 retval = -EPERM; 176 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
177 if (current->fs->root)
178 retval = kernel_execve(sub_info->path,
179 sub_info->argv, sub_info->envp);
180 177
181 /* Exec failed? */ 178 /* Exec failed? */
182 sub_info->retval = retval; 179 sub_info->retval = retval;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d0493eafea3e..7a86e6432338 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -699,6 +699,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
699 struct kretprobe_instance, uflist); 699 struct kretprobe_instance, uflist);
700 ri->rp = rp; 700 ri->rp = rp;
701 ri->task = current; 701 ri->task = current;
702
703 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
704 spin_unlock_irqrestore(&kretprobe_lock, flags);
705 return 0;
706 }
707
702 arch_prepare_kretprobe(ri, regs); 708 arch_prepare_kretprobe(ri, regs);
703 709
704 /* XXX(hch): why is there no hlist_move_head? */ 710 /* XXX(hch): why is there no hlist_move_head? */
@@ -745,7 +751,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
745 INIT_HLIST_HEAD(&rp->used_instances); 751 INIT_HLIST_HEAD(&rp->used_instances);
746 INIT_HLIST_HEAD(&rp->free_instances); 752 INIT_HLIST_HEAD(&rp->free_instances);
747 for (i = 0; i < rp->maxactive; i++) { 753 for (i = 0; i < rp->maxactive; i++) {
748 inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); 754 inst = kmalloc(sizeof(struct kretprobe_instance) +
755 rp->data_size, GFP_KERNEL);
749 if (inst == NULL) { 756 if (inst == NULL) {
750 free_rp_inst(rp); 757 free_rp_inst(rp);
751 return -ENOMEM; 758 return -ENOMEM;
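
The two kprobes hunks above add per-return-instance data (data_size bytes reachable via ri->data) and an optional entry_handler that can veto arming the return probe for a given call. A sketch of how a module might combine them to time a function, modelled loosely on the in-tree kretprobe example; the probed symbol and all names are illustrative only:

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>

struct my_data {
	ktime_t entry_stamp;
};

static int my_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct my_data *data = (struct my_data *)ri->data;

	data->entry_stamp = ktime_get();
	return 0;	/* 0: arm the return probe; nonzero would skip this call */
}

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct my_data *data = (struct my_data *)ri->data;
	s64 delta = ktime_to_ns(ktime_sub(ktime_get(), data->entry_stamp));

	printk(KERN_INFO "probed function returned after %lld ns\n",
	       (long long)delta);
	return 0;
}

static struct kretprobe my_kretprobe = {
	.entry_handler	= my_entry_handler,
	.handler	= my_ret_handler,
	.data_size	= sizeof(struct my_data),
	.maxactive	= 20,
	.kp.symbol_name	= "do_fork",	/* illustrative target */
};

static int __init my_probe_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit my_probe_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");
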
diff --git a/kernel/latency.c b/kernel/latency.c
deleted file mode 100644
index e63fcacb61a7..000000000000
--- a/kernel/latency.c
+++ /dev/null
@@ -1,280 +0,0 @@
1/*
2 * latency.c: Explicit system-wide latency-expectation infrastructure
3 *
4 * The purpose of this infrastructure is to allow device drivers to set
5 * latency constraint they have and to collect and summarize these
6 * expectations globally. The cummulated result can then be used by
7 * power management and similar users to make decisions that have
8 * tradoffs with a latency component.
9 *
10 * An example user of this are the x86 C-states; each higher C state saves
11 * more power, but has a higher exit latency. For the idle loop power
12 * code to make a good decision which C-state to use, information about
13 * acceptable latencies is required.
14 *
15 * An example announcer of latency is an audio driver that knowns it
16 * will get an interrupt when the hardware has 200 usec of samples
17 * left in the DMA buffer; in that case the driver can set a latency
18 * constraint of, say, 150 usec.
19 *
20 * Multiple drivers can each announce their maximum accepted latency,
21 * to keep these appart, a string based identifier is used.
22 *
23 *
24 * (C) Copyright 2006 Intel Corporation
25 * Author: Arjan van de Ven <arjan@linux.intel.com>
26 *
27 * This program is free software; you can redistribute it and/or
28 * modify it under the terms of the GNU General Public License
29 * as published by the Free Software Foundation; version 2
30 * of the License.
31 */
32
33#include <linux/latency.h>
34#include <linux/list.h>
35#include <linux/spinlock.h>
36#include <linux/slab.h>
37#include <linux/module.h>
38#include <linux/notifier.h>
39#include <linux/jiffies.h>
40#include <asm/atomic.h>
41
42struct latency_info {
43 struct list_head list;
44 int usecs;
45 char *identifier;
46};
47
48/*
49 * locking rule: all modifications to current_max_latency and
50 * latency_list need to be done while holding the latency_lock.
51 * latency_lock needs to be taken _irqsave.
52 */
53static atomic_t current_max_latency;
54static DEFINE_SPINLOCK(latency_lock);
55
56static LIST_HEAD(latency_list);
57static BLOCKING_NOTIFIER_HEAD(latency_notifier);
58
59/*
60 * This function returns the maximum latency allowed, which
61 * happens to be the minimum of all maximum latencies on the
62 * list.
63 */
64static int __find_max_latency(void)
65{
66 int min = INFINITE_LATENCY;
67 struct latency_info *info;
68
69 list_for_each_entry(info, &latency_list, list) {
70 if (info->usecs < min)
71 min = info->usecs;
72 }
73 return min;
74}
75
76/**
77 * set_acceptable_latency - sets the maximum latency acceptable
78 * @identifier: string that identifies this driver
79 * @usecs: maximum acceptable latency for this driver
80 *
81 * This function informs the kernel that this device(driver)
82 * can accept at most usecs latency. This setting is used for
83 * power management and similar tradeoffs.
84 *
85 * This function sleeps and can only be called from process
86 * context.
87 * Calling this function with an existing identifier is valid
88 * and will cause the existing latency setting to be changed.
89 */
90void set_acceptable_latency(char *identifier, int usecs)
91{
92 struct latency_info *info, *iter;
93 unsigned long flags;
94 int found_old = 0;
95
96 info = kzalloc(sizeof(struct latency_info), GFP_KERNEL);
97 if (!info)
98 return;
99 info->usecs = usecs;
100 info->identifier = kstrdup(identifier, GFP_KERNEL);
101 if (!info->identifier)
102 goto free_info;
103
104 spin_lock_irqsave(&latency_lock, flags);
105 list_for_each_entry(iter, &latency_list, list) {
106 if (strcmp(iter->identifier, identifier)==0) {
107 found_old = 1;
108 iter->usecs = usecs;
109 break;
110 }
111 }
112 if (!found_old)
113 list_add(&info->list, &latency_list);
114
115 if (usecs < atomic_read(&current_max_latency))
116 atomic_set(&current_max_latency, usecs);
117
118 spin_unlock_irqrestore(&latency_lock, flags);
119
120 blocking_notifier_call_chain(&latency_notifier,
121 atomic_read(&current_max_latency), NULL);
122
123 /*
124 * if we inserted the new one, we're done; otherwise there was
125 * an existing one so we need to free the redundant data
126 */
127 if (!found_old)
128 return;
129
130 kfree(info->identifier);
131free_info:
132 kfree(info);
133}
134EXPORT_SYMBOL_GPL(set_acceptable_latency);
135
136/**
137 * modify_acceptable_latency - changes the maximum latency acceptable
138 * @identifier: string that identifies this driver
139 * @usecs: maximum acceptable latency for this driver
140 *
141 * This function informs the kernel that this device(driver)
142 * can accept at most usecs latency. This setting is used for
143 * power management and similar tradeoffs.
144 *
145 * This function does not sleep and can be called in any context.
146 * Trying to use a non-existing identifier silently gets ignored.
147 *
148 * Due to the atomic nature of this function, the modified latency
149 * value will only be used for future decisions; past decisions
150 * can still lead to longer latencies in the near future.
151 */
152void modify_acceptable_latency(char *identifier, int usecs)
153{
154 struct latency_info *iter;
155 unsigned long flags;
156
157 spin_lock_irqsave(&latency_lock, flags);
158 list_for_each_entry(iter, &latency_list, list) {
159 if (strcmp(iter->identifier, identifier) == 0) {
160 iter->usecs = usecs;
161 break;
162 }
163 }
164 if (usecs < atomic_read(&current_max_latency))
165 atomic_set(&current_max_latency, usecs);
166 spin_unlock_irqrestore(&latency_lock, flags);
167}
168EXPORT_SYMBOL_GPL(modify_acceptable_latency);
169
170/**
171 * remove_acceptable_latency - removes the maximum latency acceptable
172 * @identifier: string that identifies this driver
173 *
174 * This function removes a previously set maximum latency setting
175 * for the driver and frees up any resources associated with the
176 * bookkeeping needed for this.
177 *
178 * This function does not sleep and can be called in any context.
179 * Trying to use a non-existing identifier silently gets ignored.
180 */
181void remove_acceptable_latency(char *identifier)
182{
183 unsigned long flags;
184 int newmax = 0;
185 struct latency_info *iter, *temp;
186
187 spin_lock_irqsave(&latency_lock, flags);
188
189 list_for_each_entry_safe(iter, temp, &latency_list, list) {
190 if (strcmp(iter->identifier, identifier) == 0) {
191 list_del(&iter->list);
192 newmax = iter->usecs;
193 kfree(iter->identifier);
194 kfree(iter);
195 break;
196 }
197 }
198
199 /* If we just deleted the system wide value, we need to
200 * recalculate with a full search
201 */
202 if (newmax == atomic_read(&current_max_latency)) {
203 newmax = __find_max_latency();
204 atomic_set(&current_max_latency, newmax);
205 }
206 spin_unlock_irqrestore(&latency_lock, flags);
207}
208EXPORT_SYMBOL_GPL(remove_acceptable_latency);
209
210/**
211 * system_latency_constraint - queries the system wide latency maximum
212 *
213 * This function returns the system wide maximum latency in
214 * microseconds.
215 *
216 * This function does not sleep and can be called in any context.
217 */
218int system_latency_constraint(void)
219{
220 return atomic_read(&current_max_latency);
221}
222EXPORT_SYMBOL_GPL(system_latency_constraint);
223
224/**
225 * synchronize_acceptable_latency - recalculates all latency decisions
226 *
227 * This function will cause a callback to various kernel pieces that
228 * will make those pieces rethink their latency decisions. This implies
229 * that if there are overlong latencies in hardware state already, those
230 * latencies get taken right now. When this call completes no overlong
231 * latency decisions should be active anymore.
232 *
233 * Typical usecase of this is after a modify_acceptable_latency() call,
234 * which in itself is non-blocking and non-synchronizing.
235 *
236 * This function blocks and should not be called with locks held.
237 */
238
239void synchronize_acceptable_latency(void)
240{
241 blocking_notifier_call_chain(&latency_notifier,
242 atomic_read(&current_max_latency), NULL);
243}
244EXPORT_SYMBOL_GPL(synchronize_acceptable_latency);
245
246/*
247 * Latency notifier: this notifier gets called when a non-atomic new
248 * latency value gets set. The expectation nof the caller of the
249 * non-atomic set is that when the call returns, future latencies
250 * are within bounds, so the functions on the notifier list are
251 * expected to take the overlong latencies immediately, inside the
252 * callback, and not make a overlong latency decision anymore.
253 *
254 * The callback gets called when the new latency value is made
255 * active so system_latency_constraint() returns the new latency.
256 */
257int register_latency_notifier(struct notifier_block * nb)
258{
259 return blocking_notifier_chain_register(&latency_notifier, nb);
260}
261EXPORT_SYMBOL_GPL(register_latency_notifier);
262
263int unregister_latency_notifier(struct notifier_block * nb)
264{
265 return blocking_notifier_chain_unregister(&latency_notifier, nb);
266}
267EXPORT_SYMBOL_GPL(unregister_latency_notifier);
268
269static __init int latency_init(void)
270{
271 atomic_set(&current_max_latency, INFINITE_LATENCY);
272 /*
273 * we don't want by default to have longer latencies than 2 ticks,
274 * since that would cause lost ticks
275 */
276 set_acceptable_latency("kernel", 2*1000000/HZ);
277 return 0;
278}
279
280module_init(latency_init);
diff --git a/kernel/marker.c b/kernel/marker.c
index 5323cfaedbce..c4c2cd8b61f5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -27,35 +27,42 @@
27extern struct marker __start___markers[]; 27extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 28extern struct marker __stop___markers[];
29 29
30/* Set to 1 to enable marker debug output */
31const int marker_debug;
32
30/* 33/*
31 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 34 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
32 * and module markers, the hash table and deferred_sync. 35 * and module markers and the hash table.
33 */ 36 */
34static DEFINE_MUTEX(markers_mutex); 37static DEFINE_MUTEX(markers_mutex);
35 38
36/* 39/*
37 * Marker deferred synchronization.
38 * Upon marker probe_unregister, we delay call to synchronize_sched() to
39 * accelerate mass unregistration (only when there is no more reference to a
40 * given module do we call synchronize_sched()). However, we need to make sure
41 * every critical region has ended before we re-arm a marker that has been
42 * unregistered and then registered back with a different probe data.
43 */
44static int deferred_sync;
45
46/*
47 * Marker hash table, containing the active markers. 40 * Marker hash table, containing the active markers.
48 * Protected by module_mutex. 41 * Protected by module_mutex.
49 */ 42 */
50#define MARKER_HASH_BITS 6 43#define MARKER_HASH_BITS 6
51#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 44#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
52 45
46/*
47 * Note about RCU :
48 * It is used to make sure every handler has finished using its private data
49 * between two consecutive operation (add or remove) on a given marker. It is
50 * also used to delay the free of multiple probes array until a quiescent state
51 * is reached.
52 * marker entries modifications are protected by the markers_mutex.
53 */
53struct marker_entry { 54struct marker_entry {
54 struct hlist_node hlist; 55 struct hlist_node hlist;
55 char *format; 56 char *format;
56 marker_probe_func *probe; 57 void (*call)(const struct marker *mdata, /* Probe wrapper */
57 void *private; 58 void *call_private, const char *fmt, ...);
59 struct marker_probe_closure single;
60 struct marker_probe_closure *multi;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 61 int refcount; /* Number of times armed. 0 if disarmed. */
62 struct rcu_head rcu;
63 void *oldptr;
64 char rcu_pending:1;
65 char ptype:1;
59 char name[0]; /* Contains name'\0'format'\0' */ 66 char name[0]; /* Contains name'\0'format'\0' */
60}; 67};
61 68
@@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
63 70
64/** 71/**
65 * __mark_empty_function - Empty probe callback 72 * __mark_empty_function - Empty probe callback
66 * @mdata: pointer of type const struct marker 73 * @probe_private: probe private data
74 * @call_private: call site private data
67 * @fmt: format string 75 * @fmt: format string
68 * @...: variable argument list 76 * @...: variable argument list
69 * 77 *
@@ -72,13 +80,267 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
72 * though the function pointer change and the marker enabling are two distinct 80 * though the function pointer change and the marker enabling are two distinct
73 * operations that modifies the execution flow of preemptible code. 81 * operations that modifies the execution flow of preemptible code.
74 */ 82 */
75void __mark_empty_function(const struct marker *mdata, void *private, 83void __mark_empty_function(void *probe_private, void *call_private,
76 const char *fmt, ...) 84 const char *fmt, va_list *args)
77{ 85{
78} 86}
79EXPORT_SYMBOL_GPL(__mark_empty_function); 87EXPORT_SYMBOL_GPL(__mark_empty_function);
80 88
81/* 89/*
90 * marker_probe_cb Callback that prepares the variable argument list for probes.
91 * @mdata: pointer of type struct marker
92 * @call_private: caller site private data
93 * @fmt: format string
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private,
101 const char *fmt, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * disabling preemption to make sure the teardown of the callbacks can
108 * be done correctly when they are in modules and they insure RCU read
109 * coherency.
110 */
111 preempt_disable();
112 ptype = ACCESS_ONCE(mdata->ptype);
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = ACCESS_ONCE(mdata->single.func);
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, fmt);
123 func(mdata->single.probe_private, call_private, fmt, &args);
124 va_end(args);
125 } else {
126 struct marker_probe_closure *multi;
127 int i;
128 /*
129 * multi points to an array, therefore accessing the array
130 * depends on reading multi. However, even in this case,
131 * we must insure that the pointer is read _before_ the array
132 * data. Same as rcu_dereference, but we need a full smp_rmb()
133 * in the fast path, so put the explicit barrier here.
134 */
135 smp_read_barrier_depends();
136 multi = ACCESS_ONCE(mdata->multi);
137 for (i = 0; multi[i].func; i++) {
138 va_start(args, fmt);
139 multi[i].func(multi[i].probe_private, call_private, fmt,
140 &args);
141 va_end(args);
142 }
143 }
144 preempt_enable();
145}
146EXPORT_SYMBOL_GPL(marker_probe_cb);
147
148/*
149 * marker_probe_cb Callback that does not prepare the variable argument list.
150 * @mdata: pointer of type struct marker
151 * @call_private: caller site private data
152 * @fmt: format string
153 * @...: Variable argument list.
154 *
155 * Should be connected to markers "MARK_NOARGS".
156 */
157void marker_probe_cb_noarg(const struct marker *mdata,
158 void *call_private, const char *fmt, ...)
159{
160 va_list args; /* not initialized */
161 char ptype;
162
163 preempt_disable();
164 ptype = ACCESS_ONCE(mdata->ptype);
165 if (likely(!ptype)) {
166 marker_probe_func *func;
167 /* Must read the ptype before ptr. They are not data dependant,
168 * so we put an explicit smp_rmb() here. */
169 smp_rmb();
170 func = ACCESS_ONCE(mdata->single.func);
171 /* Must read the ptr before private data. They are not data
172 * dependant, so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func(mdata->single.probe_private, call_private, fmt, &args);
175 } else {
176 struct marker_probe_closure *multi;
177 int i;
178 /*
179 * multi points to an array, therefore accessing the array
180 * depends on reading multi. However, even in this case,
181 * we must insure that the pointer is read _before_ the array
182 * data. Same as rcu_dereference, but we need a full smp_rmb()
183 * in the fast path, so put the explicit barrier here.
184 */
185 smp_read_barrier_depends();
186 multi = ACCESS_ONCE(mdata->multi);
187 for (i = 0; multi[i].func; i++)
188 multi[i].func(multi[i].probe_private, call_private, fmt,
189 &args);
190 }
191 preempt_enable();
192}
193EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
194
195static void free_old_closure(struct rcu_head *head)
196{
197 struct marker_entry *entry = container_of(head,
198 struct marker_entry, rcu);
199 kfree(entry->oldptr);
200 /* Make sure we free the data before setting the pending flag to 0 */
201 smp_wmb();
202 entry->rcu_pending = 0;
203}
204
205static void debug_print_probes(struct marker_entry *entry)
206{
207 int i;
208
209 if (!marker_debug)
210 return;
211
212 if (!entry->ptype) {
213 printk(KERN_DEBUG "Single probe : %p %p\n",
214 entry->single.func,
215 entry->single.probe_private);
216 } else {
217 for (i = 0; entry->multi[i].func; i++)
218 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
219 entry->multi[i].func,
220 entry->multi[i].probe_private);
221 }
222}
223
224static struct marker_probe_closure *
225marker_entry_add_probe(struct marker_entry *entry,
226 marker_probe_func *probe, void *probe_private)
227{
228 int nr_probes = 0;
229 struct marker_probe_closure *old, *new;
230
231 WARN_ON(!probe);
232
233 debug_print_probes(entry);
234 old = entry->multi;
235 if (!entry->ptype) {
236 if (entry->single.func == probe &&
237 entry->single.probe_private == probe_private)
238 return ERR_PTR(-EBUSY);
239 if (entry->single.func == __mark_empty_function) {
240 /* 0 -> 1 probes */
241 entry->single.func = probe;
242 entry->single.probe_private = probe_private;
243 entry->refcount = 1;
244 entry->ptype = 0;
245 debug_print_probes(entry);
246 return NULL;
247 } else {
248 /* 1 -> 2 probes */
249 nr_probes = 1;
250 old = NULL;
251 }
252 } else {
253 /* (N -> N+1), (N != 0, 1) probes */
254 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
255 if (old[nr_probes].func == probe
256 && old[nr_probes].probe_private
257 == probe_private)
258 return ERR_PTR(-EBUSY);
259 }
260 /* + 2 : one for new probe, one for NULL func */
261 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
262 GFP_KERNEL);
263 if (new == NULL)
264 return ERR_PTR(-ENOMEM);
265 if (!old)
266 new[0] = entry->single;
267 else
268 memcpy(new, old,
269 nr_probes * sizeof(struct marker_probe_closure));
270 new[nr_probes].func = probe;
271 new[nr_probes].probe_private = probe_private;
272 entry->refcount = nr_probes + 1;
273 entry->multi = new;
274 entry->ptype = 1;
275 debug_print_probes(entry);
276 return old;
277}
278
279static struct marker_probe_closure *
280marker_entry_remove_probe(struct marker_entry *entry,
281 marker_probe_func *probe, void *probe_private)
282{
283 int nr_probes = 0, nr_del = 0, i;
284 struct marker_probe_closure *old, *new;
285
286 old = entry->multi;
287
288 debug_print_probes(entry);
289 if (!entry->ptype) {
290 /* 0 -> N is an error */
291 WARN_ON(entry->single.func == __mark_empty_function);
292 /* 1 -> 0 probes */
293 WARN_ON(probe && entry->single.func != probe);
294 WARN_ON(entry->single.probe_private != probe_private);
295 entry->single.func = __mark_empty_function;
296 entry->refcount = 0;
297 entry->ptype = 0;
298 debug_print_probes(entry);
299 return NULL;
300 } else {
301 /* (N -> M), (N > 1, M >= 0) probes */
302 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
303 if ((!probe || old[nr_probes].func == probe)
304 && old[nr_probes].probe_private
305 == probe_private)
306 nr_del++;
307 }
308 }
309
310 if (nr_probes - nr_del == 0) {
311 /* N -> 0, (N > 1) */
312 entry->single.func = __mark_empty_function;
313 entry->refcount = 0;
314 entry->ptype = 0;
315 } else if (nr_probes - nr_del == 1) {
316 /* N -> 1, (N > 1) */
317 for (i = 0; old[i].func; i++)
318 if ((probe && old[i].func != probe) ||
319 old[i].probe_private != probe_private)
320 entry->single = old[i];
321 entry->refcount = 1;
322 entry->ptype = 0;
323 } else {
324 int j = 0;
325 /* N -> M, (N > 1, M > 1) */
326 /* + 1 for NULL */
327 new = kzalloc((nr_probes - nr_del + 1)
328 * sizeof(struct marker_probe_closure), GFP_KERNEL);
329 if (new == NULL)
330 return ERR_PTR(-ENOMEM);
331 for (i = 0; old[i].func; i++)
332 if ((probe && old[i].func != probe) ||
333 old[i].probe_private != probe_private)
334 new[j++] = old[i];
335 entry->refcount = nr_probes - nr_del;
336 entry->ptype = 1;
337 entry->multi = new;
338 }
339 debug_print_probes(entry);
340 return old;
341}
342
343/*
82 * Get marker if the marker is present in the marker hash table. 344 * Get marker if the marker is present in the marker hash table.
83 * Must be called with markers_mutex held. 345 * Must be called with markers_mutex held.
84 * Returns NULL if not present. 346 * Returns NULL if not present.
@@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name)
102 * Add the marker to the marker hash table. Must be called with markers_mutex 364 * Add the marker to the marker hash table. Must be called with markers_mutex
103 * held. 365 * held.
104 */ 366 */
105static int add_marker(const char *name, const char *format, 367static struct marker_entry *add_marker(const char *name, const char *format)
106 marker_probe_func *probe, void *private)
107{ 368{
108 struct hlist_head *head; 369 struct hlist_head *head;
109 struct hlist_node *node; 370 struct hlist_node *node;
@@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format,
118 hlist_for_each_entry(e, node, head, hlist) { 379 hlist_for_each_entry(e, node, head, hlist) {
119 if (!strcmp(name, e->name)) { 380 if (!strcmp(name, e->name)) {
120 printk(KERN_NOTICE 381 printk(KERN_NOTICE
121 "Marker %s busy, probe %p already installed\n", 382 "Marker %s busy\n", name);
122 name, e->probe); 383 return ERR_PTR(-EBUSY); /* Already there */
123 return -EBUSY; /* Already there */
124 } 384 }
125 } 385 }
126 /* 386 /*
@@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format,
130 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 390 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
131 GFP_KERNEL); 391 GFP_KERNEL);
132 if (!e) 392 if (!e)
133 return -ENOMEM; 393 return ERR_PTR(-ENOMEM);
134 memcpy(&e->name[0], name, name_len); 394 memcpy(&e->name[0], name, name_len);
135 if (format) { 395 if (format) {
136 e->format = &e->name[name_len]; 396 e->format = &e->name[name_len];
137 memcpy(e->format, format, format_len); 397 memcpy(e->format, format, format_len);
398 if (strcmp(e->format, MARK_NOARGS) == 0)
399 e->call = marker_probe_cb_noarg;
400 else
401 e->call = marker_probe_cb;
138 trace_mark(core_marker_format, "name %s format %s", 402 trace_mark(core_marker_format, "name %s format %s",
139 e->name, e->format); 403 e->name, e->format);
140 } else 404 } else {
141 e->format = NULL; 405 e->format = NULL;
142 e->probe = probe; 406 e->call = marker_probe_cb;
143 e->private = private; 407 }
408 e->single.func = __mark_empty_function;
409 e->single.probe_private = NULL;
410 e->multi = NULL;
411 e->ptype = 0;
144 e->refcount = 0; 412 e->refcount = 0;
413 e->rcu_pending = 0;
145 hlist_add_head(&e->hlist, head); 414 hlist_add_head(&e->hlist, head);
146 return 0; 415 return e;
147} 416}
148 417
149/* 418/*
150 * Remove the marker from the marker hash table. Must be called with mutex_lock 419 * Remove the marker from the marker hash table. Must be called with mutex_lock
151 * held. 420 * held.
152 */ 421 */
153static void *remove_marker(const char *name) 422static int remove_marker(const char *name)
154{ 423{
155 struct hlist_head *head; 424 struct hlist_head *head;
156 struct hlist_node *node; 425 struct hlist_node *node;
157 struct marker_entry *e; 426 struct marker_entry *e;
158 int found = 0; 427 int found = 0;
159 size_t len = strlen(name) + 1; 428 size_t len = strlen(name) + 1;
160 void *private = NULL;
161 u32 hash = jhash(name, len-1, 0); 429 u32 hash = jhash(name, len-1, 0);
162 430
163 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; 431 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
@@ -167,12 +435,16 @@ static void *remove_marker(const char *name)
167 break; 435 break;
168 } 436 }
169 } 437 }
170 if (found) { 438 if (!found)
171 private = e->private; 439 return -ENOENT;
172 hlist_del(&e->hlist); 440 if (e->single.func != __mark_empty_function)
173 kfree(e); 441 return -EBUSY;
174 } 442 hlist_del(&e->hlist);
175 return private; 443 /* Make sure the call_rcu has been executed */
444 if (e->rcu_pending)
445 rcu_barrier();
446 kfree(e);
447 return 0;
176} 448}
177 449
178/* 450/*
@@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
184 size_t name_len = strlen((*entry)->name) + 1; 456 size_t name_len = strlen((*entry)->name) + 1;
185 size_t format_len = strlen(format) + 1; 457 size_t format_len = strlen(format) + 1;
186 458
459
187 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 460 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
188 GFP_KERNEL); 461 GFP_KERNEL);
189 if (!e) 462 if (!e)
@@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
191 memcpy(&e->name[0], (*entry)->name, name_len); 464 memcpy(&e->name[0], (*entry)->name, name_len);
192 e->format = &e->name[name_len]; 465 e->format = &e->name[name_len];
193 memcpy(e->format, format, format_len); 466 memcpy(e->format, format, format_len);
194 e->probe = (*entry)->probe; 467 if (strcmp(e->format, MARK_NOARGS) == 0)
195 e->private = (*entry)->private; 468 e->call = marker_probe_cb_noarg;
469 else
470 e->call = marker_probe_cb;
471 e->single = (*entry)->single;
472 e->multi = (*entry)->multi;
473 e->ptype = (*entry)->ptype;
196 e->refcount = (*entry)->refcount; 474 e->refcount = (*entry)->refcount;
475 e->rcu_pending = 0;
197 hlist_add_before(&e->hlist, &(*entry)->hlist); 476 hlist_add_before(&e->hlist, &(*entry)->hlist);
198 hlist_del(&(*entry)->hlist); 477 hlist_del(&(*entry)->hlist);
478 /* Make sure the call_rcu has been executed */
479 if ((*entry)->rcu_pending)
480 rcu_barrier();
199 kfree(*entry); 481 kfree(*entry);
200 *entry = e; 482 *entry = e;
201 trace_mark(core_marker_format, "name %s format %s", 483 trace_mark(core_marker_format, "name %s format %s",
@@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
206/* 488/*
207 * Sets the probe callback corresponding to one marker. 489 * Sets the probe callback corresponding to one marker.
208 */ 490 */
209static int set_marker(struct marker_entry **entry, struct marker *elem) 491static int set_marker(struct marker_entry **entry, struct marker *elem,
492 int active)
210{ 493{
211 int ret; 494 int ret;
212 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 495 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
@@ -226,9 +509,43 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
226 if (ret) 509 if (ret)
227 return ret; 510 return ret;
228 } 511 }
229 elem->call = (*entry)->probe; 512
230 elem->private = (*entry)->private; 513 /*
231 elem->state = 1; 514 * probe_cb setup (statically known) is done here. It is
515 * asynchronous with the rest of execution, therefore we only
516 * pass from a "safe" callback (with argument) to an "unsafe"
517 * callback (does not set arguments).
518 */
519 elem->call = (*entry)->call;
520 /*
521 * Sanity check :
522 * We only update the single probe private data when the ptr is
523 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
524 */
525 WARN_ON(elem->single.func != __mark_empty_function
526 && elem->single.probe_private
527 != (*entry)->single.probe_private &&
528 !elem->ptype);
529 elem->single.probe_private = (*entry)->single.probe_private;
530 /*
531 * Make sure the private data is valid when we update the
532 * single probe ptr.
533 */
534 smp_wmb();
535 elem->single.func = (*entry)->single.func;
536 /*
537 * We also make sure that the new probe callbacks array is consistent
538 * before setting a pointer to it.
539 */
540 rcu_assign_pointer(elem->multi, (*entry)->multi);
541 /*
542 * Update the function or multi probe array pointer before setting the
543 * ptype.
544 */
545 smp_wmb();
546 elem->ptype = (*entry)->ptype;
547 elem->state = active;
548
232 return 0; 549 return 0;
233} 550}
234 551
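
The barrier choreography in set_marker() above (mirrored by disable_marker() below) follows a common publish pattern: initialise the payload, order it with smp_wmb(), then expose the pointer or flag that the lockless fast path inspects first. A minimal sketch of that pattern with made-up names, not the marker structures themselves:

#include <linux/rcupdate.h>	/* rcu_assign_pointer() */
#include <asm/system.h>		/* smp_wmb() on kernels of this vintage */

struct payload {
	int value;
};

static struct payload *published;	/* dereferenced by a lockless reader */
static int enabled;			/* reader tests this before anything else */

static void publish(struct payload *p, int value)
{
	p->value = value;			/* 1. fill in the private data            */
	smp_wmb();				/* 2. data visible before the pointer     */
	rcu_assign_pointer(published, p);	/* 3. publish the pointer (ordered store) */
	smp_wmb();				/* 4. pointer visible before the flag     */
	enabled = 1;				/* 5. the reader tests this last write    */
}
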
@@ -240,8 +557,12 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
240 */ 557 */
241static void disable_marker(struct marker *elem) 558static void disable_marker(struct marker *elem)
242{ 559{
560 /* leave "call" as is. It is known statically. */
243 elem->state = 0; 561 elem->state = 0;
244 elem->call = __mark_empty_function; 562 elem->single.func = __mark_empty_function;
563 /* Update the function before setting the ptype */
564 smp_wmb();
565 elem->ptype = 0; /* single probe */
245 /* 566 /*
246 * Leave the private data and id there, because removal is racy and 567 * Leave the private data and id there, because removal is racy and
247 * should be done only after a synchronize_sched(). These are never used 568 * should be done only after a synchronize_sched(). These are never used
@@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem)
253 * marker_update_probe_range - Update a probe range 574 * marker_update_probe_range - Update a probe range
254 * @begin: beginning of the range 575 * @begin: beginning of the range
255 * @end: end of the range 576 * @end: end of the range
256 * @probe_module: module address of the probe being updated
257 * @refcount: number of references left to the given probe_module (out)
258 * 577 *
259 * Updates the probe callback corresponding to a range of markers. 578 * Updates the probe callback corresponding to a range of markers.
260 */ 579 */
261void marker_update_probe_range(struct marker *begin, 580void marker_update_probe_range(struct marker *begin,
262 struct marker *end, struct module *probe_module, 581 struct marker *end)
263 int *refcount)
264{ 582{
265 struct marker *iter; 583 struct marker *iter;
266 struct marker_entry *mark_entry; 584 struct marker_entry *mark_entry;
@@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin,
268 mutex_lock(&markers_mutex); 586 mutex_lock(&markers_mutex);
269 for (iter = begin; iter < end; iter++) { 587 for (iter = begin; iter < end; iter++) {
270 mark_entry = get_marker(iter->name); 588 mark_entry = get_marker(iter->name);
271 if (mark_entry && mark_entry->refcount) { 589 if (mark_entry) {
272 set_marker(&mark_entry, iter); 590 set_marker(&mark_entry, iter,
591 !!mark_entry->refcount);
273 /* 592 /*
274 * ignore error, continue 593 * ignore error, continue
275 */ 594 */
276 if (probe_module)
277 if (probe_module ==
278 __module_text_address((unsigned long)mark_entry->probe))
279 (*refcount)++;
280 } else { 595 } else {
281 disable_marker(iter); 596 disable_marker(iter);
282 } 597 }
@@ -289,20 +604,27 @@ void marker_update_probe_range(struct marker *begin,
289 * Issues a synchronize_sched() when no reference to the module passed 604 * Issues a synchronize_sched() when no reference to the module passed
290 * as parameter is found in the probes so the probe module can be 605 * as parameter is found in the probes so the probe module can be
291 * safely unloaded from now on. 606 * safely unloaded from now on.
607 *
608 * Internal callback only changed before the first probe is connected to it.
609 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
610 * transitions. All other transitions will leave the old private data valid.
611 * This makes the non-atomicity of the callback/private data updates valid.
612 *
613 * "special case" updates :
614 * 0 -> 1 callback
615 * 1 -> 0 callback
616 * 1 -> 2 callbacks
617 * 2 -> 1 callbacks
618 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
 619 * Side effect : marker_set_format may delete the marker entry (creating a
620 * replacement).
292 */ 621 */
293static void marker_update_probes(struct module *probe_module) 622static void marker_update_probes(void)
294{ 623{
295 int refcount = 0;
296
297 /* Core kernel markers */ 624 /* Core kernel markers */
298 marker_update_probe_range(__start___markers, 625 marker_update_probe_range(__start___markers, __stop___markers);
299 __stop___markers, probe_module, &refcount);
300 /* Markers in modules. */ 626 /* Markers in modules. */
301 module_update_markers(probe_module, &refcount); 627 module_update_markers();
302 if (probe_module && refcount == 0) {
303 synchronize_sched();
304 deferred_sync = 0;
305 }
306} 628}
307 629
308/** 630/**
@@ -310,33 +632,49 @@ static void marker_update_probes(struct module *probe_module)
310 * @name: marker name 632 * @name: marker name
311 * @format: format string 633 * @format: format string
312 * @probe: probe handler 634 * @probe: probe handler
313 * @private: probe private data 635 * @probe_private: probe private data
314 * 636 *
315 * private data must be a valid allocated memory address, or NULL. 637 * private data must be a valid allocated memory address, or NULL.
316 * Returns 0 if ok, error value on error. 638 * Returns 0 if ok, error value on error.
639 * The probe address must at least be aligned on the architecture pointer size.
317 */ 640 */
318int marker_probe_register(const char *name, const char *format, 641int marker_probe_register(const char *name, const char *format,
319 marker_probe_func *probe, void *private) 642 marker_probe_func *probe, void *probe_private)
320{ 643{
321 struct marker_entry *entry; 644 struct marker_entry *entry;
322 int ret = 0; 645 int ret = 0;
646 struct marker_probe_closure *old;
323 647
324 mutex_lock(&markers_mutex); 648 mutex_lock(&markers_mutex);
325 entry = get_marker(name); 649 entry = get_marker(name);
326 if (entry && entry->refcount) { 650 if (!entry) {
327 ret = -EBUSY; 651 entry = add_marker(name, format);
328 goto end; 652 if (IS_ERR(entry)) {
329 } 653 ret = PTR_ERR(entry);
330 if (deferred_sync) { 654 goto end;
331 synchronize_sched(); 655 }
332 deferred_sync = 0;
333 } 656 }
334 ret = add_marker(name, format, probe, private); 657 /*
335 if (ret) 658 * If we detect that a call_rcu is pending for this marker,
659 * make sure it's executed now.
660 */
661 if (entry->rcu_pending)
662 rcu_barrier();
663 old = marker_entry_add_probe(entry, probe, probe_private);
664 if (IS_ERR(old)) {
665 ret = PTR_ERR(old);
336 goto end; 666 goto end;
667 }
337 mutex_unlock(&markers_mutex); 668 mutex_unlock(&markers_mutex);
338 marker_update_probes(NULL); 669 marker_update_probes(); /* may update entry */
339 return ret; 670 mutex_lock(&markers_mutex);
671 entry = get_marker(name);
672 WARN_ON(!entry);
673 entry->oldptr = old;
674 entry->rcu_pending = 1;
675 /* write rcu_pending before calling the RCU callback */
676 smp_wmb();
677 call_rcu(&entry->rcu, free_old_closure);
340end: 678end:
341 mutex_unlock(&markers_mutex); 679 mutex_unlock(&markers_mutex);
342 return ret; 680 return ret;
@@ -346,171 +684,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register);
346/** 684/**
347 * marker_probe_unregister - Disconnect a probe from a marker 685 * marker_probe_unregister - Disconnect a probe from a marker
348 * @name: marker name 686 * @name: marker name
687 * @probe: probe function pointer
688 * @probe_private: probe private data
349 * 689 *
350 * Returns the private data given to marker_probe_register, or an ERR_PTR(). 690 * Returns the private data given to marker_probe_register, or an ERR_PTR().
691 * We do not need to call a synchronize_sched to make sure the probes have
692 * finished running before doing a module unload, because the module unload
 693 * itself uses stop_machine(), which ensures that every preempt-disabled section
 694 * has finished.
351 */ 695 */
352void *marker_probe_unregister(const char *name) 696int marker_probe_unregister(const char *name,
697 marker_probe_func *probe, void *probe_private)
353{ 698{
354 struct module *probe_module;
355 struct marker_entry *entry; 699 struct marker_entry *entry;
356 void *private; 700 struct marker_probe_closure *old;
701 int ret = 0;
357 702
358 mutex_lock(&markers_mutex); 703 mutex_lock(&markers_mutex);
359 entry = get_marker(name); 704 entry = get_marker(name);
360 if (!entry) { 705 if (!entry) {
361 private = ERR_PTR(-ENOENT); 706 ret = -ENOENT;
362 goto end; 707 goto end;
363 } 708 }
364 entry->refcount = 0; 709 if (entry->rcu_pending)
365 /* In what module is the probe handler ? */ 710 rcu_barrier();
366 probe_module = __module_text_address((unsigned long)entry->probe); 711 old = marker_entry_remove_probe(entry, probe, probe_private);
367 private = remove_marker(name);
368 deferred_sync = 1;
369 mutex_unlock(&markers_mutex); 712 mutex_unlock(&markers_mutex);
370 marker_update_probes(probe_module); 713 marker_update_probes(); /* may update entry */
371 return private; 714 mutex_lock(&markers_mutex);
715 entry = get_marker(name);
716 entry->oldptr = old;
717 entry->rcu_pending = 1;
718 /* write rcu_pending before calling the RCU callback */
719 smp_wmb();
720 call_rcu(&entry->rcu, free_old_closure);
721 remove_marker(name); /* Ignore busy error message */
372end: 722end:
373 mutex_unlock(&markers_mutex); 723 mutex_unlock(&markers_mutex);
374 return private; 724 return ret;
375} 725}
376EXPORT_SYMBOL_GPL(marker_probe_unregister); 726EXPORT_SYMBOL_GPL(marker_probe_unregister);
377 727
378/** 728static struct marker_entry *
379 * marker_probe_unregister_private_data - Disconnect a probe from a marker 729get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
380 * @private: probe private data
381 *
382 * Unregister a marker by providing the registered private data.
383 * Returns the private data given to marker_probe_register, or an ERR_PTR().
384 */
385void *marker_probe_unregister_private_data(void *private)
386{ 730{
387 struct module *probe_module;
388 struct hlist_head *head;
389 struct hlist_node *node;
390 struct marker_entry *entry; 731 struct marker_entry *entry;
391 int found = 0;
392 unsigned int i; 732 unsigned int i;
733 struct hlist_head *head;
734 struct hlist_node *node;
393 735
394 mutex_lock(&markers_mutex);
395 for (i = 0; i < MARKER_TABLE_SIZE; i++) { 736 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
396 head = &marker_table[i]; 737 head = &marker_table[i];
397 hlist_for_each_entry(entry, node, head, hlist) { 738 hlist_for_each_entry(entry, node, head, hlist) {
398 if (entry->private == private) { 739 if (!entry->ptype) {
399 found = 1; 740 if (entry->single.func == probe
400 goto iter_end; 741 && entry->single.probe_private
742 == probe_private)
743 return entry;
744 } else {
745 struct marker_probe_closure *closure;
746 closure = entry->multi;
747 for (i = 0; closure[i].func; i++) {
748 if (closure[i].func == probe &&
749 closure[i].probe_private
750 == probe_private)
751 return entry;
752 }
401 } 753 }
402 } 754 }
403 } 755 }
404iter_end: 756 return NULL;
405 if (!found) {
406 private = ERR_PTR(-ENOENT);
407 goto end;
408 }
409 entry->refcount = 0;
410 /* In what module is the probe handler ? */
411 probe_module = __module_text_address((unsigned long)entry->probe);
412 private = remove_marker(entry->name);
413 deferred_sync = 1;
414 mutex_unlock(&markers_mutex);
415 marker_update_probes(probe_module);
416 return private;
417end:
418 mutex_unlock(&markers_mutex);
419 return private;
420} 757}
421EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
422 758
423/** 759/**
424 * marker_arm - Arm a marker 760 * marker_probe_unregister_private_data - Disconnect a probe from a marker
425 * @name: marker name 761 * @probe: probe function
762 * @probe_private: probe private data
426 * 763 *
427 * Activate a marker. It keeps a reference count of the number of 764 * Unregister a probe by providing the registered private data.
428 * arming/disarming done. 765 * Only removes the first marker found in hash table.
429 * Returns 0 if ok, error value on error. 766 * Return 0 on success or error value.
767 * We do not need to call a synchronize_sched to make sure the probes have
768 * finished running before doing a module unload, because the module unload
 769 * itself uses stop_machine(), which ensures that every preempt-disabled section
 770 * has finished.
430 */ 771 */
431int marker_arm(const char *name) 772int marker_probe_unregister_private_data(marker_probe_func *probe,
773 void *probe_private)
432{ 774{
433 struct marker_entry *entry; 775 struct marker_entry *entry;
434 int ret = 0; 776 int ret = 0;
777 struct marker_probe_closure *old;
435 778
436 mutex_lock(&markers_mutex); 779 mutex_lock(&markers_mutex);
437 entry = get_marker(name); 780 entry = get_marker_from_private_data(probe, probe_private);
438 if (!entry) { 781 if (!entry) {
439 ret = -ENOENT; 782 ret = -ENOENT;
440 goto end; 783 goto end;
441 } 784 }
442 /* 785 if (entry->rcu_pending)
443 * Only need to update probes when refcount passes from 0 to 1. 786 rcu_barrier();
444 */ 787 old = marker_entry_remove_probe(entry, NULL, probe_private);
445 if (entry->refcount++)
446 goto end;
447end:
448 mutex_unlock(&markers_mutex); 788 mutex_unlock(&markers_mutex);
449 marker_update_probes(NULL); 789 marker_update_probes(); /* may update entry */
450 return ret;
451}
452EXPORT_SYMBOL_GPL(marker_arm);
453
454/**
455 * marker_disarm - Disarm a marker
456 * @name: marker name
457 *
458 * Disarm a marker. It keeps a reference count of the number of arming/disarming
459 * done.
460 * Returns 0 if ok, error value on error.
461 */
462int marker_disarm(const char *name)
463{
464 struct marker_entry *entry;
465 int ret = 0;
466
467 mutex_lock(&markers_mutex); 790 mutex_lock(&markers_mutex);
468 entry = get_marker(name); 791 entry = get_marker_from_private_data(probe, probe_private);
469 if (!entry) { 792 WARN_ON(!entry);
470 ret = -ENOENT; 793 entry->oldptr = old;
471 goto end; 794 entry->rcu_pending = 1;
472 } 795 /* write rcu_pending before calling the RCU callback */
473 /* 796 smp_wmb();
474 * Only permit decrement refcount if higher than 0. 797 call_rcu(&entry->rcu, free_old_closure);
475 * Do probe update only on 1 -> 0 transition. 798 remove_marker(entry->name); /* Ignore busy error message */
476 */
477 if (entry->refcount) {
478 if (--entry->refcount)
479 goto end;
480 } else {
481 ret = -EPERM;
482 goto end;
483 }
484end: 799end:
485 mutex_unlock(&markers_mutex); 800 mutex_unlock(&markers_mutex);
486 marker_update_probes(NULL);
487 return ret; 801 return ret;
488} 802}
489EXPORT_SYMBOL_GPL(marker_disarm); 803EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
490 804
491/** 805/**
492 * marker_get_private_data - Get a marker's probe private data 806 * marker_get_private_data - Get a marker's probe private data
493 * @name: marker name 807 * @name: marker name
808 * @probe: probe to match
809 * @num: get the nth matching probe's private data
494 * 810 *
811 * Returns the nth private data pointer (starting from 0) matching, or an
812 * ERR_PTR.
495 * Returns the private data pointer, or an ERR_PTR. 813 * Returns the private data pointer, or an ERR_PTR.
496 * The private data pointer should _only_ be dereferenced if the caller is the 814 * The private data pointer should _only_ be dereferenced if the caller is the
497 * owner of the data, or its content could vanish. This is mostly used to 815 * owner of the data, or its content could vanish. This is mostly used to
498 * confirm that a caller is the owner of a registered probe. 816 * confirm that a caller is the owner of a registered probe.
499 */ 817 */
500void *marker_get_private_data(const char *name) 818void *marker_get_private_data(const char *name, marker_probe_func *probe,
819 int num)
501{ 820{
502 struct hlist_head *head; 821 struct hlist_head *head;
503 struct hlist_node *node; 822 struct hlist_node *node;
504 struct marker_entry *e; 823 struct marker_entry *e;
505 size_t name_len = strlen(name) + 1; 824 size_t name_len = strlen(name) + 1;
506 u32 hash = jhash(name, name_len-1, 0); 825 u32 hash = jhash(name, name_len-1, 0);
507 int found = 0; 826 int i;
508 827
509 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; 828 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
510 hlist_for_each_entry(e, node, head, hlist) { 829 hlist_for_each_entry(e, node, head, hlist) {
511 if (!strcmp(name, e->name)) { 830 if (!strcmp(name, e->name)) {
512 found = 1; 831 if (!e->ptype) {
513 return e->private; 832 if (num == 0 && e->single.func == probe)
833 return e->single.probe_private;
834 else
835 break;
836 } else {
837 struct marker_probe_closure *closure;
838 int match = 0;
839 closure = e->multi;
840 for (i = 0; closure[i].func; i++) {
841 if (closure[i].func != probe)
842 continue;
843 if (match++ == num)
844 return closure[i].probe_private;
845 }
846 }
514 } 847 }
515 } 848 }
516 return ERR_PTR(-ENOENT); 849 return ERR_PTR(-ENOENT);
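
For reference, a registration round-trip against the reworked marker interface above might look like the following sketch. The marker name "subsys_event", its format string and the probe body are invented, and the (probe_private, call_private, fmt, va_list *) probe signature is assumed from the markers sample code of this era rather than shown in these hunks.

#include <linux/kernel.h>
#include <linux/marker.h>
#include <linux/module.h>

static void my_probe(void *probe_private, void *call_private,
		     const char *fmt, va_list *args)
{
	int value = va_arg(*args, int);		/* matches the "value %d" format */

	printk(KERN_INFO "subsys_event fired: %d\n", value);
}

static int __init my_probe_init(void)
{
	/* With marker_arm()/marker_disarm() gone, registering also arms. */
	return marker_probe_register("subsys_event", "value %d",
				     my_probe, NULL);
}

static void __exit my_probe_exit(void)
{
	/* The probe function and its private data identify which probe to drop. */
	marker_probe_unregister("subsys_event", my_probe, NULL);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");
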
diff --git a/kernel/module.c b/kernel/module.c
index bd60278ee703..92595bad3812 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -46,6 +46,7 @@
46#include <asm/semaphore.h> 46#include <asm/semaphore.h>
47#include <asm/cacheflush.h> 47#include <asm/cacheflush.h>
48#include <linux/license.h> 48#include <linux/license.h>
49#include <asm/sections.h>
49 50
50#if 0 51#if 0
51#define DEBUGP printk 52#define DEBUGP printk
@@ -290,7 +291,7 @@ static unsigned long __find_symbol(const char *name,
290 } 291 }
291 } 292 }
292 DEBUGP("Failed to find symbol %s\n", name); 293 DEBUGP("Failed to find symbol %s\n", name);
293 return 0; 294 return -ENOENT;
294} 295}
295 296
296/* Search for module by name: must hold module_mutex. */ 297/* Search for module by name: must hold module_mutex. */
@@ -343,9 +344,6 @@ static inline unsigned int block_size(int val)
343 return val; 344 return val;
344} 345}
345 346
346/* Created by linker magic */
347extern char __per_cpu_start[], __per_cpu_end[];
348
349static void *percpu_modalloc(unsigned long size, unsigned long align, 347static void *percpu_modalloc(unsigned long size, unsigned long align,
350 const char *name) 348 const char *name)
351{ 349{
@@ -783,7 +781,7 @@ void __symbol_put(const char *symbol)
783 const unsigned long *crc; 781 const unsigned long *crc;
784 782
785 preempt_disable(); 783 preempt_disable();
786 if (!__find_symbol(symbol, &owner, &crc, 1)) 784 if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1)))
787 BUG(); 785 BUG();
788 module_put(owner); 786 module_put(owner);
789 preempt_enable(); 787 preempt_enable();
@@ -929,7 +927,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
929 const unsigned long *crc; 927 const unsigned long *crc;
930 struct module *owner; 928 struct module *owner;
931 929
932 if (!__find_symbol("struct_module", &owner, &crc, 1)) 930 if (IS_ERR_VALUE(__find_symbol("struct_module",
931 &owner, &crc, 1)))
933 BUG(); 932 BUG();
934 return check_version(sechdrs, versindex, "struct_module", mod, 933 return check_version(sechdrs, versindex, "struct_module", mod,
935 crc); 934 crc);
@@ -978,12 +977,12 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
978 977
979 ret = __find_symbol(name, &owner, &crc, 978 ret = __find_symbol(name, &owner, &crc,
980 !(mod->taints & TAINT_PROPRIETARY_MODULE)); 979 !(mod->taints & TAINT_PROPRIETARY_MODULE));
981 if (ret) { 980 if (!IS_ERR_VALUE(ret)) {
982 /* use_module can fail due to OOM, 981 /* use_module can fail due to OOM,
983 or module initialization or unloading */ 982 or module initialization or unloading */
984 if (!check_version(sechdrs, versindex, name, mod, crc) || 983 if (!check_version(sechdrs, versindex, name, mod, crc) ||
985 !use_module(mod, owner)) 984 !use_module(mod, owner))
986 ret = 0; 985 ret = -EINVAL;
987 } 986 }
988 return ret; 987 return ret;
989} 988}
@@ -1371,7 +1370,9 @@ void *__symbol_get(const char *symbol)
1371 1370
1372 preempt_disable(); 1371 preempt_disable();
1373 value = __find_symbol(symbol, &owner, &crc, 1); 1372 value = __find_symbol(symbol, &owner, &crc, 1);
1374 if (value && strong_try_module_get(owner) != 0) 1373 if (IS_ERR_VALUE(value))
1374 value = 0;
1375 else if (strong_try_module_get(owner))
1375 value = 0; 1376 value = 0;
1376 preempt_enable(); 1377 preempt_enable();
1377 1378
@@ -1391,14 +1392,16 @@ static int verify_export_symbols(struct module *mod)
1391 const unsigned long *crc; 1392 const unsigned long *crc;
1392 1393
1393 for (i = 0; i < mod->num_syms; i++) 1394 for (i = 0; i < mod->num_syms; i++)
1394 if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { 1395 if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name,
1396 &owner, &crc, 1))) {
1395 name = mod->syms[i].name; 1397 name = mod->syms[i].name;
1396 ret = -ENOEXEC; 1398 ret = -ENOEXEC;
1397 goto dup; 1399 goto dup;
1398 } 1400 }
1399 1401
1400 for (i = 0; i < mod->num_gpl_syms; i++) 1402 for (i = 0; i < mod->num_gpl_syms; i++)
1401 if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { 1403 if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name,
1404 &owner, &crc, 1))) {
1402 name = mod->gpl_syms[i].name; 1405 name = mod->gpl_syms[i].name;
1403 ret = -ENOEXEC; 1406 ret = -ENOEXEC;
1404 goto dup; 1407 goto dup;
@@ -1448,7 +1451,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1448 strtab + sym[i].st_name, mod); 1451 strtab + sym[i].st_name, mod);
1449 1452
1450 /* Ok if resolved. */ 1453 /* Ok if resolved. */
1451 if (sym[i].st_value != 0) 1454 if (!IS_ERR_VALUE(sym[i].st_value))
1452 break; 1455 break;
1453 /* Ok if weak. */ 1456 /* Ok if weak. */
1454 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1457 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
@@ -2035,7 +2038,7 @@ static struct module *load_module(void __user *umod,
2035#ifdef CONFIG_MARKERS 2038#ifdef CONFIG_MARKERS
2036 if (!mod->taints) 2039 if (!mod->taints)
2037 marker_update_probe_range(mod->markers, 2040 marker_update_probe_range(mod->markers,
2038 mod->markers + mod->num_markers, NULL, NULL); 2041 mod->markers + mod->num_markers);
2039#endif 2042#endif
2040 err = module_finalize(hdr, sechdrs, mod); 2043 err = module_finalize(hdr, sechdrs, mod);
2041 if (err < 0) 2044 if (err < 0)
@@ -2250,7 +2253,7 @@ static const char *get_ksymbol(struct module *mod,
2250 2253
2251/* For kallsyms to ask for address resolution. NULL means not found. Careful 2254/* For kallsyms to ask for address resolution. NULL means not found. Careful
2252 * not to lock to avoid deadlock on oopses, simply disable preemption. */ 2255 * not to lock to avoid deadlock on oopses, simply disable preemption. */
2253char *module_address_lookup(unsigned long addr, 2256const char *module_address_lookup(unsigned long addr,
2254 unsigned long *size, 2257 unsigned long *size,
2255 unsigned long *offset, 2258 unsigned long *offset,
2256 char **modname, 2259 char **modname,
@@ -2275,7 +2278,7 @@ char *module_address_lookup(unsigned long addr,
2275 ret = namebuf; 2278 ret = namebuf;
2276 } 2279 }
2277 preempt_enable(); 2280 preempt_enable();
2278 return (char *)ret; 2281 return ret;
2279} 2282}
2280 2283
2281int lookup_module_symbol_name(unsigned long addr, char *symname) 2284int lookup_module_symbol_name(unsigned long addr, char *symname)
@@ -2561,7 +2564,7 @@ EXPORT_SYMBOL(struct_module);
2561#endif 2564#endif
2562 2565
2563#ifdef CONFIG_MARKERS 2566#ifdef CONFIG_MARKERS
2564void module_update_markers(struct module *probe_module, int *refcount) 2567void module_update_markers(void)
2565{ 2568{
2566 struct module *mod; 2569 struct module *mod;
2567 2570
@@ -2569,8 +2572,7 @@ void module_update_markers(struct module *probe_module, int *refcount)
2569 list_for_each_entry(mod, &modules, list) 2572 list_for_each_entry(mod, &modules, list)
2570 if (!mod->taints) 2573 if (!mod->taints)
2571 marker_update_probe_range(mod->markers, 2574 marker_update_probe_range(mod->markers,
2572 mod->markers + mod->num_markers, 2575 mod->markers + mod->num_markers);
2573 probe_module, refcount);
2574 mutex_unlock(&module_mutex); 2576 mutex_unlock(&module_mutex);
2575} 2577}
2576#endif 2578#endif
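
The module.c hunks above change __find_symbol() from returning 0 on failure to returning a negative errno packed into the unsigned long, which callers now test with IS_ERR_VALUE(). A small sketch of that convention, with lookup_thing() as a made-up stand-in:

#include <linux/err.h>		/* IS_ERR_VALUE() */
#include <linux/errno.h>

/* Mimics the new convention: address on success, -errno on failure. */
static unsigned long lookup_thing(int exists)
{
	if (!exists)
		return -ENOENT;
	return 0xc0000000UL;	/* pretend this is a symbol address */
}

static int use_thing(void)
{
	unsigned long addr = lookup_thing(1);

	if (IS_ERR_VALUE(addr))		/* only the topmost -MAX_ERRNO..-1 range matches */
		return (int)addr;	/* propagate -ENOENT and friends */
	/* ... use addr ... */
	return 0;
}

This avoids overloading 0, which can otherwise be a legitimate symbol value, and lets callers forward a meaningful error code.
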
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index d17436cdea1b..3aaa06c561de 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -107,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name,
107 * use of the mutex is forbidden. The mutex must not be locked when 107 * use of the mutex is forbidden. The mutex must not be locked when
108 * this function is called. 108 * this function is called.
109 */ 109 */
110void fastcall mutex_destroy(struct mutex *lock) 110void mutex_destroy(struct mutex *lock)
111{ 111{
112 DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); 112 DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
113 lock->magic = NULL; 113 lock->magic = NULL;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d9ec9b666250..d046a345d365 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -58,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init);
58 * We also put the fastpath first in the kernel image, to make sure the 58 * We also put the fastpath first in the kernel image, to make sure the
59 * branch is predicted by the CPU as default-untaken. 59 * branch is predicted by the CPU as default-untaken.
60 */ 60 */
61static void fastcall noinline __sched 61static void noinline __sched
62__mutex_lock_slowpath(atomic_t *lock_count); 62__mutex_lock_slowpath(atomic_t *lock_count);
63 63
64/*** 64/***
@@ -82,7 +82,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
82 * 82 *
83 * This function is similar to (but not equivalent to) down(). 83 * This function is similar to (but not equivalent to) down().
84 */ 84 */
85void inline fastcall __sched mutex_lock(struct mutex *lock) 85void inline __sched mutex_lock(struct mutex *lock)
86{ 86{
87 might_sleep(); 87 might_sleep();
88 /* 88 /*
@@ -95,8 +95,7 @@ void inline fastcall __sched mutex_lock(struct mutex *lock)
95EXPORT_SYMBOL(mutex_lock); 95EXPORT_SYMBOL(mutex_lock);
96#endif 96#endif
97 97
98static void fastcall noinline __sched 98static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
99__mutex_unlock_slowpath(atomic_t *lock_count);
100 99
101/*** 100/***
102 * mutex_unlock - release the mutex 101 * mutex_unlock - release the mutex
@@ -109,7 +108,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count);
109 * 108 *
110 * This function is similar to (but not equivalent to) up(). 109 * This function is similar to (but not equivalent to) up().
111 */ 110 */
112void fastcall __sched mutex_unlock(struct mutex *lock) 111void __sched mutex_unlock(struct mutex *lock)
113{ 112{
114 /* 113 /*
115 * The unlocking fastpath is the 0->1 transition from 'locked' 114 * The unlocking fastpath is the 0->1 transition from 'locked'
@@ -234,7 +233,7 @@ EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
234/* 233/*
235 * Release the lock, slowpath: 234 * Release the lock, slowpath:
236 */ 235 */
237static fastcall inline void 236static inline void
238__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) 237__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
239{ 238{
240 struct mutex *lock = container_of(lock_count, struct mutex, count); 239 struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -271,7 +270,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
271/* 270/*
272 * Release the lock, slowpath: 271 * Release the lock, slowpath:
273 */ 272 */
274static fastcall noinline void 273static noinline void
275__mutex_unlock_slowpath(atomic_t *lock_count) 274__mutex_unlock_slowpath(atomic_t *lock_count)
276{ 275{
277 __mutex_unlock_common_slowpath(lock_count, 1); 276 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -282,10 +281,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
282 * Here come the less common (and hence less performance-critical) APIs: 281 * Here come the less common (and hence less performance-critical) APIs:
283 * mutex_lock_interruptible() and mutex_trylock(). 282 * mutex_lock_interruptible() and mutex_trylock().
284 */ 283 */
285static int fastcall noinline __sched 284static noinline int __sched
286__mutex_lock_killable_slowpath(atomic_t *lock_count); 285__mutex_lock_killable_slowpath(atomic_t *lock_count);
287 286
288static noinline int fastcall __sched 287static noinline int __sched
289__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 288__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
290 289
291/*** 290/***
@@ -299,7 +298,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
299 * 298 *
300 * This function is similar to (but not equivalent to) down_interruptible(). 299 * This function is similar to (but not equivalent to) down_interruptible().
301 */ 300 */
302int fastcall __sched mutex_lock_interruptible(struct mutex *lock) 301int __sched mutex_lock_interruptible(struct mutex *lock)
303{ 302{
304 might_sleep(); 303 might_sleep();
305 return __mutex_fastpath_lock_retval 304 return __mutex_fastpath_lock_retval
@@ -308,7 +307,7 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
308 307
309EXPORT_SYMBOL(mutex_lock_interruptible); 308EXPORT_SYMBOL(mutex_lock_interruptible);
310 309
311int fastcall __sched mutex_lock_killable(struct mutex *lock) 310int __sched mutex_lock_killable(struct mutex *lock)
312{ 311{
313 might_sleep(); 312 might_sleep();
314 return __mutex_fastpath_lock_retval 313 return __mutex_fastpath_lock_retval
@@ -316,7 +315,7 @@ int fastcall __sched mutex_lock_killable(struct mutex *lock)
316} 315}
317EXPORT_SYMBOL(mutex_lock_killable); 316EXPORT_SYMBOL(mutex_lock_killable);
318 317
319static void fastcall noinline __sched 318static noinline void __sched
320__mutex_lock_slowpath(atomic_t *lock_count) 319__mutex_lock_slowpath(atomic_t *lock_count)
321{ 320{
322 struct mutex *lock = container_of(lock_count, struct mutex, count); 321 struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -324,7 +323,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
324 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); 323 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
325} 324}
326 325
327static int fastcall noinline __sched 326static noinline int __sched
328__mutex_lock_killable_slowpath(atomic_t *lock_count) 327__mutex_lock_killable_slowpath(atomic_t *lock_count)
329{ 328{
330 struct mutex *lock = container_of(lock_count, struct mutex, count); 329 struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -332,7 +331,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
332 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); 331 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
333} 332}
334 333
335static noinline int fastcall __sched 334static noinline int __sched
336__mutex_lock_interruptible_slowpath(atomic_t *lock_count) 335__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
337{ 336{
338 struct mutex *lock = container_of(lock_count, struct mutex, count); 337 struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -381,7 +380,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
381 * This function must not be used in interrupt context. The 380 * This function must not be used in interrupt context. The
382 * mutex must be released by the same task that acquired it. 381 * mutex must be released by the same task that acquired it.
383 */ 382 */
384int fastcall __sched mutex_trylock(struct mutex *lock) 383int __sched mutex_trylock(struct mutex *lock)
385{ 384{
386 return __mutex_fastpath_trylock(&lock->count, 385 return __mutex_fastpath_trylock(&lock->count,
387 __mutex_trylock_slowpath); 386 __mutex_trylock_slowpath);
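
Beyond dropping the obsolete fastcall annotation, the mutex hunks above show the sleeping-lock entry points side by side, including mutex_lock_killable(). Both it and mutex_lock_interruptible() return 0 on success and -EINTR when a signal (a fatal one, in the killable case) cuts the wait short, so the return value must be checked; a small hypothetical sketch:

#include <linux/errno.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_lock);		/* hypothetical lock */

static int do_protected_work(void)
{
	/* Sleep until the lock is ours, but give up if a signal arrives. */
	if (mutex_lock_interruptible(&my_lock))
		return -ERESTARTSYS;	/* typical reaction on a syscall path */

	/* ... critical section ... */

	mutex_unlock(&my_lock);
	return 0;
}
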
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4253f472f060..643360d1bb14 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -4,6 +4,7 @@
4#include <linux/notifier.h> 4#include <linux/notifier.h>
5#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
7#include <linux/reboot.h>
7 8
8/* 9/*
9 * Notifier list for kernel code which wants to be called 10 * Notifier list for kernel code which wants to be called
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 79f871bc0ef4..f5d332cf8c63 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -21,6 +21,7 @@
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h>
24 25
25static struct kmem_cache *nsproxy_cachep; 26static struct kmem_cache *nsproxy_cachep;
26 27
diff --git a/kernel/panic.c b/kernel/panic.c
index d9e90cfe3298..24af9f8bac99 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -161,7 +161,7 @@ const char *print_tainted(void)
161{ 161{
162 static char buf[20]; 162 static char buf[20];
163 if (tainted) { 163 if (tainted) {
164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", 164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c",
165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -169,7 +169,8 @@ const char *print_tainted(void)
169 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 169 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
170 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 170 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
171 tainted & TAINT_USER ? 'U' : ' ', 171 tainted & TAINT_USER ? 'U' : ' ',
172 tainted & TAINT_DIE ? 'D' : ' '); 172 tainted & TAINT_DIE ? 'D' : ' ',
173 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ');
173 } 174 }
174 else 175 else
175 snprintf(buf, sizeof(buf), "Not tainted"); 176 snprintf(buf, sizeof(buf), "Not tainted");
diff --git a/kernel/params.c b/kernel/params.c
index 42fe5e6126c0..afc46a23eb6d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -180,12 +180,12 @@ int parse_args(const char *name,
180#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 180#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
181 int param_set_##name(const char *val, struct kernel_param *kp) \ 181 int param_set_##name(const char *val, struct kernel_param *kp) \
182 { \ 182 { \
183 char *endp; \
184 tmptype l; \ 183 tmptype l; \
184 int ret; \
185 \ 185 \
186 if (!val) return -EINVAL; \ 186 if (!val) return -EINVAL; \
187 l = strtolfn(val, &endp, 0); \ 187 ret = strtolfn(val, 0, &l); \
188 if (endp == val || ((type)l != l)) \ 188 if (ret == -EINVAL || ((type)l != l)) \
189 return -EINVAL; \ 189 return -EINVAL; \
190 *((type *)kp->arg) = l; \ 190 *((type *)kp->arg) = l; \
191 return 0; \ 191 return 0; \
@@ -195,13 +195,13 @@ int parse_args(const char *name,
195 return sprintf(buffer, format, *((type *)kp->arg)); \ 195 return sprintf(buffer, format, *((type *)kp->arg)); \
196 } 196 }
197 197
198STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul); 198STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
199STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol); 199STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
200STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul); 200STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
201STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol); 201STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
202STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul); 202STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
203STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol); 203STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
204STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul); 204STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
205 205
206int param_set_charp(const char *val, struct kernel_param *kp) 206int param_set_charp(const char *val, struct kernel_param *kp)
207{ 207{
@@ -272,7 +272,7 @@ static int param_array(const char *name,
272 unsigned int min, unsigned int max, 272 unsigned int min, unsigned int max,
273 void *elem, int elemsize, 273 void *elem, int elemsize,
274 int (*set)(const char *, struct kernel_param *kp), 274 int (*set)(const char *, struct kernel_param *kp),
275 int *num) 275 unsigned int *num)
276{ 276{
277 int ret; 277 int ret;
278 struct kernel_param kp; 278 struct kernel_param kp;
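
params.c now parses module parameters with strict_strtoul()/strict_strtol(), which report failure through the return value and reject trailing garbage instead of silently stopping at it. A sketch of the same call pattern the macro above generates (parse_count() and its caller are invented for illustration):

#include <linux/errno.h>
#include <linux/kernel.h>	/* strict_strtoul() */

static int parse_count(const char *val, unsigned int *out)
{
	unsigned long l;
	int ret;

	ret = strict_strtoul(val, 0, &l);	/* base 0: decimal, 0x.. or 0.. */
	if (ret || (unsigned int)l != l)	/* parse error, or value overflows the target type */
		return -EINVAL;

	*out = l;
	return 0;
}

/* parse_count("42", &n) succeeds; parse_count("42abc", &n) now fails. */
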
diff --git a/kernel/pid.c b/kernel/pid.c
index f815455431bf..477691576b33 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,7 +41,6 @@
41static struct hlist_head *pid_hash; 41static struct hlist_head *pid_hash;
42static int pidhash_shift; 42static int pidhash_shift;
43struct pid init_struct_pid = INIT_STRUCT_PID; 43struct pid init_struct_pid = INIT_STRUCT_PID;
44static struct kmem_cache *pid_ns_cachep;
45 44
46int pid_max = PID_MAX_DEFAULT; 45int pid_max = PID_MAX_DEFAULT;
47 46
@@ -112,7 +111,7 @@ EXPORT_SYMBOL(is_container_init);
112 111
113static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
114 113
115static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) 114static void free_pidmap(struct pid_namespace *pid_ns, int pid)
116{ 115{
117 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; 116 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
118 int offset = pid & BITS_PER_PAGE_MASK; 117 int offset = pid & BITS_PER_PAGE_MASK;
@@ -181,7 +180,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
181 return -1; 180 return -1;
182} 181}
183 182
184static int next_pidmap(struct pid_namespace *pid_ns, int last) 183int next_pidmap(struct pid_namespace *pid_ns, int last)
185{ 184{
186 int offset; 185 int offset;
187 struct pidmap *map, *end; 186 struct pidmap *map, *end;
@@ -199,7 +198,7 @@ static int next_pidmap(struct pid_namespace *pid_ns, int last)
199 return -1; 198 return -1;
200} 199}
201 200
202fastcall void put_pid(struct pid *pid) 201void put_pid(struct pid *pid)
203{ 202{
204 struct pid_namespace *ns; 203 struct pid_namespace *ns;
205 204
@@ -221,7 +220,7 @@ static void delayed_put_pid(struct rcu_head *rhp)
221 put_pid(pid); 220 put_pid(pid);
222} 221}
223 222
224fastcall void free_pid(struct pid *pid) 223void free_pid(struct pid *pid)
225{ 224{
226 /* We can be called with write_lock_irq(&tasklist_lock) held */ 225 /* We can be called with write_lock_irq(&tasklist_lock) held */
227 int i; 226 int i;
@@ -287,7 +286,7 @@ out_free:
287 goto out; 286 goto out;
288} 287}
289 288
290struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns) 289struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
291{ 290{
292 struct hlist_node *elem; 291 struct hlist_node *elem;
293 struct upid *pnr; 292 struct upid *pnr;
@@ -317,7 +316,7 @@ EXPORT_SYMBOL_GPL(find_pid);
317/* 316/*
318 * attach_pid() must be called with the tasklist_lock write-held. 317 * attach_pid() must be called with the tasklist_lock write-held.
319 */ 318 */
320int fastcall attach_pid(struct task_struct *task, enum pid_type type, 319int attach_pid(struct task_struct *task, enum pid_type type,
321 struct pid *pid) 320 struct pid *pid)
322{ 321{
323 struct pid_link *link; 322 struct pid_link *link;
@@ -329,7 +328,7 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type,
329 return 0; 328 return 0;
330} 329}
331 330
332void fastcall detach_pid(struct task_struct *task, enum pid_type type) 331void detach_pid(struct task_struct *task, enum pid_type type)
333{ 332{
334 struct pid_link *link; 333 struct pid_link *link;
335 struct pid *pid; 334 struct pid *pid;
@@ -349,7 +348,7 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type)
349} 348}
350 349
351/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 350/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
352void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, 351void transfer_pid(struct task_struct *old, struct task_struct *new,
353 enum pid_type type) 352 enum pid_type type)
354{ 353{
355 new->pids[type].pid = old->pids[type].pid; 354 new->pids[type].pid = old->pids[type].pid;
@@ -357,7 +356,7 @@ void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
357 old->pids[type].pid = NULL; 356 old->pids[type].pid = NULL;
358} 357}
359 358
360struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) 359struct task_struct *pid_task(struct pid *pid, enum pid_type type)
361{ 360{
362 struct task_struct *result = NULL; 361 struct task_struct *result = NULL;
363 if (pid) { 362 if (pid) {
@@ -368,6 +367,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
368 } 367 }
369 return result; 368 return result;
370} 369}
370EXPORT_SYMBOL(pid_task);
371 371
372/* 372/*
373 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 373 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
@@ -408,7 +408,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
408 return pid; 408 return pid;
409} 409}
410 410
411struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) 411struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
412{ 412{
413 struct task_struct *result; 413 struct task_struct *result;
414 rcu_read_lock(); 414 rcu_read_lock();
@@ -443,6 +443,12 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
443 return nr; 443 return nr;
444} 444}
445 445
446pid_t pid_vnr(struct pid *pid)
447{
448 return pid_nr_ns(pid, current->nsproxy->pid_ns);
449}
450EXPORT_SYMBOL_GPL(pid_vnr);
451
446pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 452pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
447{ 453{
448 return pid_nr_ns(task_pid(tsk), ns); 454 return pid_nr_ns(task_pid(tsk), ns);
@@ -487,180 +493,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
487} 493}
488EXPORT_SYMBOL_GPL(find_get_pid); 494EXPORT_SYMBOL_GPL(find_get_pid);
489 495
490struct pid_cache {
491 int nr_ids;
492 char name[16];
493 struct kmem_cache *cachep;
494 struct list_head list;
495};
496
497static LIST_HEAD(pid_caches_lh);
498static DEFINE_MUTEX(pid_caches_mutex);
499
500/*
501 * creates the kmem cache to allocate pids from.
502 * @nr_ids: the number of numerical ids this pid will have to carry
503 */
504
505static struct kmem_cache *create_pid_cachep(int nr_ids)
506{
507 struct pid_cache *pcache;
508 struct kmem_cache *cachep;
509
510 mutex_lock(&pid_caches_mutex);
511 list_for_each_entry (pcache, &pid_caches_lh, list)
512 if (pcache->nr_ids == nr_ids)
513 goto out;
514
515 pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
516 if (pcache == NULL)
517 goto err_alloc;
518
519 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
520 cachep = kmem_cache_create(pcache->name,
521 sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
522 0, SLAB_HWCACHE_ALIGN, NULL);
523 if (cachep == NULL)
524 goto err_cachep;
525
526 pcache->nr_ids = nr_ids;
527 pcache->cachep = cachep;
528 list_add(&pcache->list, &pid_caches_lh);
529out:
530 mutex_unlock(&pid_caches_mutex);
531 return pcache->cachep;
532
533err_cachep:
534 kfree(pcache);
535err_alloc:
536 mutex_unlock(&pid_caches_mutex);
537 return NULL;
538}
539
540#ifdef CONFIG_PID_NS
541static struct pid_namespace *create_pid_namespace(int level)
542{
543 struct pid_namespace *ns;
544 int i;
545
546 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
547 if (ns == NULL)
548 goto out;
549
550 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
551 if (!ns->pidmap[0].page)
552 goto out_free;
553
554 ns->pid_cachep = create_pid_cachep(level + 1);
555 if (ns->pid_cachep == NULL)
556 goto out_free_map;
557
558 kref_init(&ns->kref);
559 ns->last_pid = 0;
560 ns->child_reaper = NULL;
561 ns->level = level;
562
563 set_bit(0, ns->pidmap[0].page);
564 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
565
566 for (i = 1; i < PIDMAP_ENTRIES; i++) {
567 ns->pidmap[i].page = 0;
568 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
569 }
570
571 return ns;
572
573out_free_map:
574 kfree(ns->pidmap[0].page);
575out_free:
576 kmem_cache_free(pid_ns_cachep, ns);
577out:
578 return ERR_PTR(-ENOMEM);
579}
580
581static void destroy_pid_namespace(struct pid_namespace *ns)
582{
583 int i;
584
585 for (i = 0; i < PIDMAP_ENTRIES; i++)
586 kfree(ns->pidmap[i].page);
587 kmem_cache_free(pid_ns_cachep, ns);
588}
589
590struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
591{
592 struct pid_namespace *new_ns;
593
594 BUG_ON(!old_ns);
595 new_ns = get_pid_ns(old_ns);
596 if (!(flags & CLONE_NEWPID))
597 goto out;
598
599 new_ns = ERR_PTR(-EINVAL);
600 if (flags & CLONE_THREAD)
601 goto out_put;
602
603 new_ns = create_pid_namespace(old_ns->level + 1);
604 if (!IS_ERR(new_ns))
605 new_ns->parent = get_pid_ns(old_ns);
606
607out_put:
608 put_pid_ns(old_ns);
609out:
610 return new_ns;
611}
612
613void free_pid_ns(struct kref *kref)
614{
615 struct pid_namespace *ns, *parent;
616
617 ns = container_of(kref, struct pid_namespace, kref);
618
619 parent = ns->parent;
620 destroy_pid_namespace(ns);
621
622 if (parent != NULL)
623 put_pid_ns(parent);
624}
625#endif /* CONFIG_PID_NS */
626
627void zap_pid_ns_processes(struct pid_namespace *pid_ns)
628{
629 int nr;
630 int rc;
631
632 /*
633 * The last thread in the cgroup-init thread group is terminating.
634 * Find remaining pid_ts in the namespace, signal and wait for them
635 * to exit.
636 *
637 * Note: This signals each threads in the namespace - even those that
638 * belong to the same thread group, To avoid this, we would have
639 * to walk the entire tasklist looking a processes in this
640 * namespace, but that could be unnecessarily expensive if the
641 * pid namespace has just a few processes. Or we need to
642 * maintain a tasklist for each pid namespace.
643 *
644 */
645 read_lock(&tasklist_lock);
646 nr = next_pidmap(pid_ns, 1);
647 while (nr > 0) {
648 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
649 nr = next_pidmap(pid_ns, nr);
650 }
651 read_unlock(&tasklist_lock);
652
653 do {
654 clear_thread_flag(TIF_SIGPENDING);
655 rc = sys_wait4(-1, NULL, __WALL, NULL);
656 } while (rc != -ECHILD);
657
658
659 /* Child reaper for the pid namespace is going away */
660 pid_ns->child_reaper = NULL;
661 return;
662}
663
664/* 496/*
665 * The pid hash table is scaled according to the amount of memory in the 497 * The pid hash table is scaled according to the amount of memory in the
666 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 498 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -693,9 +525,6 @@ void __init pidmap_init(void)
693 set_bit(0, init_pid_ns.pidmap[0].page); 525 set_bit(0, init_pid_ns.pidmap[0].page);
694 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 526 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
695 527
696 init_pid_ns.pid_cachep = create_pid_cachep(1); 528 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
697 if (init_pid_ns.pid_cachep == NULL) 529 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
698 panic("Can't create pid_1 cachep\n");
699
700 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
701} 530}
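
Among the pid.c changes above, pid_vnr() is added as shorthand for pid_nr_ns() against the caller's own namespace, and pid_task() gains an export. A hedged usage sketch (report_task() is invented; the caller is assumed to keep tsk valid, e.g. under rcu_read_lock()):

#include <linux/kernel.h>
#include <linux/pid.h>
#include <linux/sched.h>

static void report_task(struct task_struct *tsk)
{
	struct pid *pid = task_pid(tsk);

	/* Global pid number versus the number seen from current's pid namespace. */
	printk(KERN_INFO "global %d, in current ns %d\n",
	       pid_nr(pid), pid_vnr(pid));
}
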
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
new file mode 100644
index 000000000000..6d792b66d854
--- /dev/null
+++ b/kernel/pid_namespace.c
@@ -0,0 +1,197 @@
1/*
2 * Pid namespaces
3 *
4 * Authors:
5 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
6 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
7 * Many thanks to Oleg Nesterov for comments and help
8 *
9 */
10
11#include <linux/pid.h>
12#include <linux/pid_namespace.h>
13#include <linux/syscalls.h>
14#include <linux/err.h>
15
16#define BITS_PER_PAGE (PAGE_SIZE*8)
17
18struct pid_cache {
19 int nr_ids;
20 char name[16];
21 struct kmem_cache *cachep;
22 struct list_head list;
23};
24
25static LIST_HEAD(pid_caches_lh);
26static DEFINE_MUTEX(pid_caches_mutex);
27static struct kmem_cache *pid_ns_cachep;
28
29/*
30 * creates the kmem cache to allocate pids from.
31 * @nr_ids: the number of numerical ids this pid will have to carry
32 */
33
34static struct kmem_cache *create_pid_cachep(int nr_ids)
35{
36 struct pid_cache *pcache;
37 struct kmem_cache *cachep;
38
39 mutex_lock(&pid_caches_mutex);
40 list_for_each_entry(pcache, &pid_caches_lh, list)
41 if (pcache->nr_ids == nr_ids)
42 goto out;
43
44 pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
45 if (pcache == NULL)
46 goto err_alloc;
47
48 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
49 cachep = kmem_cache_create(pcache->name,
50 sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
51 0, SLAB_HWCACHE_ALIGN, NULL);
52 if (cachep == NULL)
53 goto err_cachep;
54
55 pcache->nr_ids = nr_ids;
56 pcache->cachep = cachep;
57 list_add(&pcache->list, &pid_caches_lh);
58out:
59 mutex_unlock(&pid_caches_mutex);
60 return pcache->cachep;
61
62err_cachep:
63 kfree(pcache);
64err_alloc:
65 mutex_unlock(&pid_caches_mutex);
66 return NULL;
67}
68
69static struct pid_namespace *create_pid_namespace(int level)
70{
71 struct pid_namespace *ns;
72 int i;
73
74 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
75 if (ns == NULL)
76 goto out;
77
78 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
79 if (!ns->pidmap[0].page)
80 goto out_free;
81
82 ns->pid_cachep = create_pid_cachep(level + 1);
83 if (ns->pid_cachep == NULL)
84 goto out_free_map;
85
86 kref_init(&ns->kref);
87 ns->last_pid = 0;
88 ns->child_reaper = NULL;
89 ns->level = level;
90
91 set_bit(0, ns->pidmap[0].page);
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93
94 for (i = 1; i < PIDMAP_ENTRIES; i++) {
95 ns->pidmap[i].page = 0;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 }
98
99 return ns;
100
101out_free_map:
102 kfree(ns->pidmap[0].page);
103out_free:
104 kmem_cache_free(pid_ns_cachep, ns);
105out:
106 return ERR_PTR(-ENOMEM);
107}
108
109static void destroy_pid_namespace(struct pid_namespace *ns)
110{
111 int i;
112
113 for (i = 0; i < PIDMAP_ENTRIES; i++)
114 kfree(ns->pidmap[i].page);
115 kmem_cache_free(pid_ns_cachep, ns);
116}
117
118struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
119{
120 struct pid_namespace *new_ns;
121
122 BUG_ON(!old_ns);
123 new_ns = get_pid_ns(old_ns);
124 if (!(flags & CLONE_NEWPID))
125 goto out;
126
127 new_ns = ERR_PTR(-EINVAL);
128 if (flags & CLONE_THREAD)
129 goto out_put;
130
131 new_ns = create_pid_namespace(old_ns->level + 1);
132 if (!IS_ERR(new_ns))
133 new_ns->parent = get_pid_ns(old_ns);
134
135out_put:
136 put_pid_ns(old_ns);
137out:
138 return new_ns;
139}
140
141void free_pid_ns(struct kref *kref)
142{
143 struct pid_namespace *ns, *parent;
144
145 ns = container_of(kref, struct pid_namespace, kref);
146
147 parent = ns->parent;
148 destroy_pid_namespace(ns);
149
150 if (parent != NULL)
151 put_pid_ns(parent);
152}
153
154void zap_pid_ns_processes(struct pid_namespace *pid_ns)
155{
156 int nr;
157 int rc;
158
159 /*
160 * The last thread in the cgroup-init thread group is terminating.
161 * Find remaining pid_ts in the namespace, signal and wait for them
162 * to exit.
163 *
 164 * Note: This signals each thread in the namespace - even those that
 165 * belong to the same thread group. To avoid this, we would have
 166 * to walk the entire tasklist looking for processes in this
167 * namespace, but that could be unnecessarily expensive if the
168 * pid namespace has just a few processes. Or we need to
169 * maintain a tasklist for each pid namespace.
170 *
171 */
172 read_lock(&tasklist_lock);
173 nr = next_pidmap(pid_ns, 1);
174 while (nr > 0) {
175 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
176 nr = next_pidmap(pid_ns, nr);
177 }
178 read_unlock(&tasklist_lock);
179
180 do {
181 clear_thread_flag(TIF_SIGPENDING);
182 rc = sys_wait4(-1, NULL, __WALL, NULL);
183 } while (rc != -ECHILD);
184
185
186 /* Child reaper for the pid namespace is going away */
187 pid_ns->child_reaper = NULL;
188 return;
189}
190
191static __init int pid_namespaces_init(void)
192{
193 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
194 return 0;
195}
196
197__initcall(pid_namespaces_init);
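
copy_pid_ns() above only builds a child namespace when CLONE_NEWPID is set and rejects the CLONE_NEWPID|CLONE_THREAD combination. From user space the whole mechanism is driven by that clone flag; a hypothetical, privilege-requiring sketch (the flag value is taken from <linux/sched.h> in case the libc headers predate it):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef CLONE_NEWPID
#define CLONE_NEWPID 0x20000000	/* new pid namespace, from <linux/sched.h> */
#endif

static char child_stack[16 * 1024];

static int child(void *arg)
{
	/* Inside the fresh namespace this prints 1. */
	printf("child sees itself as pid %d\n", (int)getpid());
	return 0;
}

int main(void)
{
	/* CLONE_NEWPID is what routes clone() through copy_pid_ns() above. */
	pid_t pid = clone(child, child_stack + sizeof(child_stack),
			  CLONE_NEWPID | SIGCHLD, NULL);

	if (pid < 0) {
		perror("clone");
		return EXIT_FAILURE;
	}
	waitpid(pid, NULL, 0);
	return 0;
}
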
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
new file mode 100644
index 000000000000..0afe32be4c85
--- /dev/null
+++ b/kernel/pm_qos_params.c
@@ -0,0 +1,425 @@
1/*
2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of:
4 *
5 * Dependents on a QoS value : register requirements
6 * Watchers of QoS value : get notified when target QoS value changes
7 *
8 * This QoS design is best effort based. Dependents register their QoS needs.
9 * Watchers register to keep track of the current QoS needs of the system.
10 *
11 * There are 3 basic classes of QoS parameter: latency, timeout, throughput
12 * each have defined units:
13 * latency: usec
14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec)
16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers
18 *
19 * User mode requirements on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 *
27 * mark gross mgross@linux.intel.com
28 */
29
30#include <linux/pm_qos_params.h>
31#include <linux/sched.h>
32#include <linux/spinlock.h>
33#include <linux/slab.h>
34#include <linux/time.h>
35#include <linux/fs.h>
36#include <linux/device.h>
37#include <linux/miscdevice.h>
38#include <linux/string.h>
39#include <linux/platform_device.h>
40#include <linux/init.h>
41
42#include <linux/uaccess.h>
43
44/*
45 * locking rule: all changes to target_value or requirements or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all
48 */
49struct requirement_list {
50 struct list_head list;
51 union {
52 s32 value;
53 s32 usec;
54 s32 kbps;
55 };
56 char *name;
57};
58
59static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2);
61
62struct pm_qos_object {
63 struct requirement_list requirements;
64 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev;
66 char *name;
67 s32 default_value;
68 s32 target_value;
69 s32 (*comparitor)(s32, s32);
70};
71
72static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)},
76 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC,
79 .target_value = 2000 * USEC_PER_SEC,
80 .comparitor = min_compare
81};
82
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)},
86 .notifiers = &network_lat_notifier,
87 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC,
89 .target_value = 2000 * USEC_PER_SEC,
90 .comparitor = min_compare
91};
92
93
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements =
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput",
100 .default_value = 0,
101 .target_value = 0,
102 .comparitor = max_compare
103};
104
105
106static struct pm_qos_object *pm_qos_array[] = {
107 &null_pm_qos,
108 &cpu_dma_pm_qos,
109 &network_lat_pm_qos,
110 &network_throughput_pm_qos
111};
112
113static DEFINE_SPINLOCK(pm_qos_lock);
114
115static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
116 size_t count, loff_t *f_pos);
117static int pm_qos_power_open(struct inode *inode, struct file *filp);
118static int pm_qos_power_release(struct inode *inode, struct file *filp);
119
120static const struct file_operations pm_qos_power_fops = {
121 .write = pm_qos_power_write,
122 .open = pm_qos_power_open,
123 .release = pm_qos_power_release,
124};
125
126/* static helper functions */
127static s32 max_compare(s32 v1, s32 v2)
128{
129 return max(v1, v2);
130}
131
132static s32 min_compare(s32 v1, s32 v2)
133{
134 return min(v1, v2);
135}
136
137
138static void update_target(int target)
139{
140 s32 extreme_value;
141 struct requirement_list *node;
142 unsigned long flags;
143 int call_notifier = 0;
144
145 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value;
147 list_for_each_entry(node,
148 &pm_qos_array[target]->requirements.list, list) {
149 extreme_value = pm_qos_array[target]->comparitor(
150 extreme_value, node->value);
151 }
152 if (pm_qos_array[target]->target_value != extreme_value) {
153 call_notifier = 1;
154 pm_qos_array[target]->target_value = extreme_value;
155		pr_debug("new target for qos %d is %d\n", target,
156 pm_qos_array[target]->target_value);
157 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags);
159
160 if (call_notifier)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers,
162 (unsigned long) extreme_value, NULL);
163}
164
165static int register_pm_qos_misc(struct pm_qos_object *qos)
166{
167 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
168 qos->pm_qos_power_miscdev.name = qos->name;
169 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
170
171 return misc_register(&qos->pm_qos_power_miscdev);
172}
173
174static int find_pm_qos_object_by_minor(int minor)
175{
176 int pm_qos_class;
177
178 for (pm_qos_class = 0;
179 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
180 if (minor ==
181 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
182 return pm_qos_class;
183 }
184 return -1;
185}
186
187/**
188 * pm_qos_requirement - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested
190 *
191 * This function returns the current target value in an atomic manner.
192 */
193int pm_qos_requirement(int pm_qos_class)
194{
195 int ret_val;
196 unsigned long flags;
197
198 spin_lock_irqsave(&pm_qos_lock, flags);
199 ret_val = pm_qos_array[pm_qos_class]->target_value;
200 spin_unlock_irqrestore(&pm_qos_lock, flags);
201
202 return ret_val;
203}
204EXPORT_SYMBOL_GPL(pm_qos_requirement);
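
On the read side, a consumer can poll the aggregate before committing to something latency-sensitive. A hedged sketch, assuming the same PM_QOS_CPU_DMA_LATENCY class constant; the caller and its exit-latency parameter are hypothetical, not from this patch.

/* Illustrative check a cpuidle-style consumer could make before entering a
 * deep sleep state whose wakeup latency is exit_latency_usec. */
static int mydrv_state_is_allowed(s32 exit_latency_usec)
{
	return exit_latency_usec <= pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);
}
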
205
206/**
207 * pm_qos_add_requirement - inserts new qos request into the list
208 * @pm_qos_class: identifies which list of qos requests to use
209 * @name: identifies the request
210 * @value: defines the qos request
211 *
212 * This function inserts a new entry in the pm_qos_class list of requested qos
213 * performance characteristics. It recomputes the aggregate QoS expectations for
214 * the pm_qos_class of parameters.
215 */
216int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
217{
218 struct requirement_list *dep;
219 unsigned long flags;
220
221 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL);
222 if (dep) {
223 if (value == PM_QOS_DEFAULT_VALUE)
224 dep->value = pm_qos_array[pm_qos_class]->default_value;
225 else
226 dep->value = value;
227 dep->name = kstrdup(name, GFP_KERNEL);
228 if (!dep->name)
229 goto cleanup;
230
231 spin_lock_irqsave(&pm_qos_lock, flags);
232 list_add(&dep->list,
233 &pm_qos_array[pm_qos_class]->requirements.list);
234 spin_unlock_irqrestore(&pm_qos_lock, flags);
235 update_target(pm_qos_class);
236
237 return 0;
238 }
239
240cleanup:
241 kfree(dep);
242 return -ENOMEM;
243}
244EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
245
246/**
247 * pm_qos_update_requirement - modifies an existing qos request
248 * @pm_qos_class: identifies which list of qos requests to use
249 * @name: identifies the request
250 * @value: defines the qos request
251 *
252 * Updates an existing qos requirement for the pm_qos_class of parameters along
253 * with updating the target pm_qos_class value.
254 *
255 * If the named request isn't in the list then no change is made.
256 */
257int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
258{
259 unsigned long flags;
260 struct requirement_list *node;
261 int pending_update = 0;
262
263 spin_lock_irqsave(&pm_qos_lock, flags);
264 list_for_each_entry(node,
265 &pm_qos_array[pm_qos_class]->requirements.list, list) {
266 if (strcmp(node->name, name) == 0) {
267 if (new_value == PM_QOS_DEFAULT_VALUE)
268 node->value =
269 pm_qos_array[pm_qos_class]->default_value;
270 else
271 node->value = new_value;
272 pending_update = 1;
273 break;
274 }
275 }
276 spin_unlock_irqrestore(&pm_qos_lock, flags);
277 if (pending_update)
278 update_target(pm_qos_class);
279
280 return 0;
281}
282EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
283
284/**
285 * pm_qos_remove_requirement - modifies an existing qos request
286 * @pm_qos_class: identifies which list of qos requests to use
287 * @name: identifies the request
288 *
289 * Will remove the named qos request from the pm_qos_class list of parameters and
290 * recompute the current target value for the pm_qos_class.
291 */
292void pm_qos_remove_requirement(int pm_qos_class, char *name)
293{
294 unsigned long flags;
295 struct requirement_list *node;
296 int pending_update = 0;
297
298 spin_lock_irqsave(&pm_qos_lock, flags);
299 list_for_each_entry(node,
300 &pm_qos_array[pm_qos_class]->requirements.list, list) {
301 if (strcmp(node->name, name) == 0) {
302 kfree(node->name);
303 list_del(&node->list);
304 kfree(node);
305 pending_update = 1;
306 break;
307 }
308 }
309 spin_unlock_irqrestore(&pm_qos_lock, flags);
310 if (pending_update)
311 update_target(pm_qos_class);
312}
313EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
314
315/**
316 * pm_qos_add_notifier - sets notification entry for changes to target value
317 * @pm_qos_class: identifies which qos target changes should be notified.
318 * @notifier: notifier block managed by caller.
319 *
320 * Will register the notifier into a notification chain that gets called
321 * upon changes to the pm_qos_class target value.
322 */
323int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
324{
325 int retval;
326
327 retval = blocking_notifier_chain_register(
328 pm_qos_array[pm_qos_class]->notifiers, notifier);
329
330 return retval;
331}
332EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
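
A watcher registers a standard notifier_block; the recomputed aggregate arrives as the unsigned long argument of the callback. Sketch under the same assumptions as above; the callback name and what it does with the value are hypothetical.

static int mydrv_lat_notify(struct notifier_block *nb, unsigned long new_target,
			    void *data)
{
	/* new_target is the recomputed cpu_dma_latency aggregate, in usec. */
	pr_debug("cpu_dma_latency target is now %lu usec\n", new_target);
	return NOTIFY_OK;
}

static struct notifier_block mydrv_lat_nb = {
	.notifier_call = mydrv_lat_notify,
};

/* In driver init (illustrative):
 *	pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, &mydrv_lat_nb);
 * with the matching pm_qos_remove_notifier() call on teardown.
 */
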
333
334/**
335 * pm_qos_remove_notifier - deletes notification entry from chain.
336 * @pm_qos_class: identifies which qos target changes are notified.
337 * @notifier: notifier block to be removed.
338 *
339 * Will remove the notifier from the notification chain that gets called
340 * upon changes to the pm_qos_class target value.
341 */
342int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343{
344 int retval;
345
346 retval = blocking_notifier_chain_unregister(
347 pm_qos_array[pm_qos_class]->notifiers, notifier);
348
349 return retval;
350}
351EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
352
353#define PID_NAME_LEN sizeof("process_1234567890")
354static char name[PID_NAME_LEN];
355
356static int pm_qos_power_open(struct inode *inode, struct file *filp)
357{
358 int ret;
359 long pm_qos_class;
360
361 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
362 if (pm_qos_class >= 0) {
363 filp->private_data = (void *)pm_qos_class;
364 sprintf(name, "process_%d", current->pid);
365 ret = pm_qos_add_requirement(pm_qos_class, name,
366 PM_QOS_DEFAULT_VALUE);
367 if (ret >= 0)
368 return 0;
369 }
370
371 return -EPERM;
372}
373
374static int pm_qos_power_release(struct inode *inode, struct file *filp)
375{
376 int pm_qos_class;
377
378 pm_qos_class = (long)filp->private_data;
379 sprintf(name, "process_%d", current->pid);
380 pm_qos_remove_requirement(pm_qos_class, name);
381
382 return 0;
383}
384
385static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
386 size_t count, loff_t *f_pos)
387{
388 s32 value;
389 int pm_qos_class;
390
391 pm_qos_class = (long)filp->private_data;
392 if (count != sizeof(s32))
393 return -EINVAL;
394 if (copy_from_user(&value, buf, sizeof(s32)))
395 return -EFAULT;
396 sprintf(name, "process_%d", current->pid);
397 pm_qos_update_requirement(pm_qos_class, name, value);
398
399 return sizeof(s32);
400}
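
From user space, the misc device nodes registered below take a raw binary s32 and keep the requirement alive for as long as the file descriptor stays open, exactly as the open/write/release handlers above implement. A hedged user-space sketch; the node name /dev/cpu_dma_latency is assumed to follow from the miscdevice name, and error handling is trimmed.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	int32_t target = 50;			/* request <= 50 usec DMA latency */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, &target, sizeof(target));	/* must be exactly sizeof(s32) */
	pause();				/* requirement held while fd is open */
	close(fd);				/* release drops the requirement */
	return 0;
}
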
401
402
403static int __init pm_qos_power_init(void)
404{
405 int ret = 0;
406
407 ret = register_pm_qos_misc(&cpu_dma_pm_qos);
408 if (ret < 0) {
409 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n");
410 return ret;
411 }
412 ret = register_pm_qos_misc(&network_lat_pm_qos);
413 if (ret < 0) {
414 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n");
415 return ret;
416 }
417 ret = register_pm_qos_misc(&network_throughput_pm_qos);
418 if (ret < 0)
419 printk(KERN_ERR
420 "pm_qos_param: network_throughput setup failed\n");
421
422 return ret;
423}
424
425late_initcall(pm_qos_power_init);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 0b7c82ac467e..2eae91f954ca 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -20,7 +20,7 @@ static int check_clock(const clockid_t which_clock)
20 return 0; 20 return 0;
21 21
22 read_lock(&tasklist_lock); 22 read_lock(&tasklist_lock);
23 p = find_task_by_pid(pid); 23 p = find_task_by_vpid(pid);
24 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 24 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
25 same_thread_group(p, current) : thread_group_leader(p))) { 25 same_thread_group(p, current) : thread_group_leader(p))) {
26 error = -EINVAL; 26 error = -EINVAL;
@@ -305,7 +305,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
305 */ 305 */
306 struct task_struct *p; 306 struct task_struct *p;
307 rcu_read_lock(); 307 rcu_read_lock();
308 p = find_task_by_pid(pid); 308 p = find_task_by_vpid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
311 if (same_thread_group(p, current)) { 311 if (same_thread_group(p, current)) {
@@ -354,7 +354,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
354 if (pid == 0) { 354 if (pid == 0) {
355 p = current; 355 p = current;
356 } else { 356 } else {
357 p = find_task_by_pid(pid); 357 p = find_task_by_vpid(pid);
358 if (p && !same_thread_group(p, current)) 358 if (p && !same_thread_group(p, current))
359 p = NULL; 359 p = NULL;
360 } 360 }
@@ -362,7 +362,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
362 if (pid == 0) { 362 if (pid == 0) {
363 p = current->group_leader; 363 p = current->group_leader;
364 } else { 364 } else {
365 p = find_task_by_pid(pid); 365 p = find_task_by_vpid(pid);
366 if (p && !thread_group_leader(p)) 366 if (p && !thread_group_leader(p))
367 p = NULL; 367 p = NULL;
368 } 368 }
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 36d563fd9e3b..a9b04203a66d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,8 +256,9 @@ static void schedule_next_timer(struct k_itimer *timr)
256 if (timr->it.real.interval.tv64 == 0) 256 if (timr->it.real.interval.tv64 == 0)
257 return; 257 return;
258 258
259 timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), 259 timr->it_overrun += (unsigned int) hrtimer_forward(timer,
260 timr->it.real.interval); 260 timer->base->get_time(),
261 timr->it.real.interval);
261 262
262 timr->it_overrun_last = timr->it_overrun; 263 timr->it_overrun_last = timr->it_overrun;
263 timr->it_overrun = -1; 264 timr->it_overrun = -1;
@@ -386,7 +387,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
386 now = ktime_add(now, kj); 387 now = ktime_add(now, kj);
387 } 388 }
388#endif 389#endif
389 timr->it_overrun += 390 timr->it_overrun += (unsigned int)
390 hrtimer_forward(timer, now, 391 hrtimer_forward(timer, now,
391 timr->it.real.interval); 392 timr->it.real.interval);
392 ret = HRTIMER_RESTART; 393 ret = HRTIMER_RESTART;
@@ -403,7 +404,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
403 struct task_struct *rtn = current->group_leader; 404 struct task_struct *rtn = current->group_leader;
404 405
405 if ((event->sigev_notify & SIGEV_THREAD_ID ) && 406 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
406 (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || 407 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
407 !same_thread_group(rtn, current) || 408 !same_thread_group(rtn, current) ||
408 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) 409 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
409 return NULL; 410 return NULL;
@@ -662,7 +663,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
662 */ 663 */
663 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || 664 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
664 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) 665 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
665 timr->it_overrun += hrtimer_forward(timer, now, iv); 666 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
666 667
667 remaining = ktime_sub(timer->expires, now); 668 remaining = ktime_sub(timer->expires, now);
668 /* Return 0 only, when the timer is expired and not pending */ 669 /* Return 0 only, when the timer is expired and not pending */
@@ -766,9 +767,11 @@ common_timer_set(struct k_itimer *timr, int flags,
766 /* SIGEV_NONE timers are not queued ! See common_timer_get */ 767 /* SIGEV_NONE timers are not queued ! See common_timer_get */
767 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 768 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
768 /* Setup correct expiry time for relative timers */ 769 /* Setup correct expiry time for relative timers */
769 if (mode == HRTIMER_MODE_REL) 770 if (mode == HRTIMER_MODE_REL) {
770 timer->expires = ktime_add(timer->expires, 771 timer->expires =
771 timer->base->get_time()); 772 ktime_add_safe(timer->expires,
773 timer->base->get_time());
774 }
772 return 0; 775 return 0;
773 } 776 }
774 777
@@ -981,20 +984,9 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
981static int common_nsleep(const clockid_t which_clock, int flags, 984static int common_nsleep(const clockid_t which_clock, int flags,
982 struct timespec *tsave, struct timespec __user *rmtp) 985 struct timespec *tsave, struct timespec __user *rmtp)
983{ 986{
984 struct timespec rmt; 987 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
985 int ret; 988 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
986 989 which_clock);
987 ret = hrtimer_nanosleep(tsave, rmtp ? &rmt : NULL,
988 flags & TIMER_ABSTIME ?
989 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
990 which_clock);
991
992 if (ret && rmtp) {
993 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
994 return -EFAULT;
995 }
996
997 return ret;
998} 990}
999 991
1000asmlinkage long 992asmlinkage long
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ef9b802738a5..79833170bb9c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -74,8 +74,8 @@ config PM_TRACE_RTC
74 RTC across reboots, so that you can debug a machine that just hangs 74 RTC across reboots, so that you can debug a machine that just hangs
75 during suspend (or more commonly, during resume). 75 during suspend (or more commonly, during resume).
76 76
77 To use this debugging feature you should attempt to suspend the machine, 77 To use this debugging feature you should attempt to suspend the
78 then reboot it, then run 78 machine, reboot it and then run
79 79
80 dmesg -s 1000000 | grep 'hash matches' 80 dmesg -s 1000000 | grep 'hash matches'
81 81
@@ -123,7 +123,10 @@ config HIBERNATION
123 called "hibernation" in user interfaces. STD checkpoints the 123 called "hibernation" in user interfaces. STD checkpoints the
124 system and powers it off; and restores that checkpoint on reboot. 124 system and powers it off; and restores that checkpoint on reboot.
125 125
126 You can suspend your machine with 'echo disk > /sys/power/state'. 126 You can suspend your machine with 'echo disk > /sys/power/state'
127 after placing resume=/dev/swappartition on the kernel command line
128 in your bootloader's configuration file.
129
127 Alternatively, you can use the additional userland tools available 130 Alternatively, you can use the additional userland tools available
128 from <http://suspend.sf.net>. 131 from <http://suspend.sf.net>.
129 132
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d09da0895174..859a8e59773a 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -26,7 +26,7 @@
26 26
27 27
28static int noresume = 0; 28static int noresume = 0;
29char resume_file[256] = CONFIG_PM_STD_PARTITION; 29static char resume_file[256] = CONFIG_PM_STD_PARTITION;
30dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block; 31sector_t swsusp_resume_block;
32 32
@@ -185,7 +185,7 @@ static void platform_restore_cleanup(int platform_mode)
185 * reappears in this routine after a restore. 185 * reappears in this routine after a restore.
186 */ 186 */
187 187
188int create_image(int platform_mode) 188static int create_image(int platform_mode)
189{ 189{
190 int error; 190 int error;
191 191
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f6a5df934f8d..95250d7c8d91 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1203,7 +1203,7 @@ asmlinkage int swsusp_save(void)
1203 1203
1204 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1204 printk(KERN_INFO "PM: Creating hibernation image: \n");
1205 1205
1206 drain_local_pages(); 1206 drain_local_pages(NULL);
1207 nr_pages = count_data_pages(); 1207 nr_pages = count_data_pages();
1208 nr_highmem = count_highmem_pages(); 1208 nr_highmem = count_highmem_pages();
1209 printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); 1209 printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
@@ -1221,7 +1221,7 @@ asmlinkage int swsusp_save(void)
1221 /* During allocating of suspend pagedir, new cold pages may appear. 1221 /* During allocating of suspend pagedir, new cold pages may appear.
1222 * Kill them. 1222 * Kill them.
1223 */ 1223 */
1224 drain_local_pages(); 1224 drain_local_pages(NULL);
1225 copy_data_pages(&copy_bm, &orig_bm); 1225 copy_data_pages(&copy_bm, &orig_bm);
1226 1226
1227 /* 1227 /*
diff --git a/kernel/printk.c b/kernel/printk.c
index 29ae1e99cde0..bee36100f110 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,7 +32,6 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/jiffies.h>
36 35
37#include <asm/uaccess.h> 36#include <asm/uaccess.h>
38 37
@@ -93,16 +92,16 @@ static int console_locked, console_suspended;
93 */ 92 */
94static DEFINE_SPINLOCK(logbuf_lock); 93static DEFINE_SPINLOCK(logbuf_lock);
95 94
96#define LOG_BUF_MASK (log_buf_len-1) 95#define LOG_BUF_MASK (log_buf_len-1)
97#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) 96#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
98 97
99/* 98/*
100 * The indices into log_buf are not constrained to log_buf_len - they 99 * The indices into log_buf are not constrained to log_buf_len - they
101 * must be masked before subscripting 100 * must be masked before subscripting
102 */ 101 */
103static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */ 102static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
104static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */ 103static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
105static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */ 104static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
106 105
107/* 106/*
108 * Array of consoles built from command line options (console=) 107 * Array of consoles built from command line options (console=)
@@ -128,17 +127,17 @@ static int console_may_schedule;
128static char __log_buf[__LOG_BUF_LEN]; 127static char __log_buf[__LOG_BUF_LEN];
129static char *log_buf = __log_buf; 128static char *log_buf = __log_buf;
130static int log_buf_len = __LOG_BUF_LEN; 129static int log_buf_len = __LOG_BUF_LEN;
131static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ 130static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
132 131
133static int __init log_buf_len_setup(char *str) 132static int __init log_buf_len_setup(char *str)
134{ 133{
135 unsigned long size = memparse(str, &str); 134 unsigned size = memparse(str, &str);
136 unsigned long flags; 135 unsigned long flags;
137 136
138 if (size) 137 if (size)
139 size = roundup_pow_of_two(size); 138 size = roundup_pow_of_two(size);
140 if (size > log_buf_len) { 139 if (size > log_buf_len) {
141 unsigned long start, dest_idx, offset; 140 unsigned start, dest_idx, offset;
142 char *new_log_buf; 141 char *new_log_buf;
143 142
144 new_log_buf = alloc_bootmem(size); 143 new_log_buf = alloc_bootmem(size);
@@ -295,7 +294,7 @@ int log_buf_read(int idx)
295 */ 294 */
296int do_syslog(int type, char __user *buf, int len) 295int do_syslog(int type, char __user *buf, int len)
297{ 296{
298 unsigned long i, j, limit, count; 297 unsigned i, j, limit, count;
299 int do_clear = 0; 298 int do_clear = 0;
300 char c; 299 char c;
301 int error = 0; 300 int error = 0;
@@ -436,7 +435,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len)
436/* 435/*
437 * Call the console drivers on a range of log_buf 436 * Call the console drivers on a range of log_buf
438 */ 437 */
439static void __call_console_drivers(unsigned long start, unsigned long end) 438static void __call_console_drivers(unsigned start, unsigned end)
440{ 439{
441 struct console *con; 440 struct console *con;
442 441
@@ -463,8 +462,8 @@ early_param("ignore_loglevel", ignore_loglevel_setup);
463/* 462/*
464 * Write out chars from start to end - 1 inclusive 463 * Write out chars from start to end - 1 inclusive
465 */ 464 */
466static void _call_console_drivers(unsigned long start, 465static void _call_console_drivers(unsigned start,
467 unsigned long end, int msg_log_level) 466 unsigned end, int msg_log_level)
468{ 467{
469 if ((msg_log_level < console_loglevel || ignore_loglevel) && 468 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
470 console_drivers && start != end) { 469 console_drivers && start != end) {
@@ -484,12 +483,12 @@ static void _call_console_drivers(unsigned long start,
484 * log_buf[start] to log_buf[end - 1]. 483 * log_buf[start] to log_buf[end - 1].
485 * The console_sem must be held. 484 * The console_sem must be held.
486 */ 485 */
487static void call_console_drivers(unsigned long start, unsigned long end) 486static void call_console_drivers(unsigned start, unsigned end)
488{ 487{
489 unsigned long cur_index, start_print; 488 unsigned cur_index, start_print;
490 static int msg_level = -1; 489 static int msg_level = -1;
491 490
492 BUG_ON(((long)(start - end)) > 0); 491 BUG_ON(((int)(start - end)) > 0);
493 492
494 cur_index = start; 493 cur_index = start;
495 start_print = start; 494 start_print = start;
@@ -567,19 +566,6 @@ static int printk_time = 0;
567#endif 566#endif
568module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 567module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
569 568
570static int __init printk_time_setup(char *str)
571{
572 if (*str)
573 return 0;
574 printk_time = 1;
575 printk(KERN_NOTICE "The 'time' option is deprecated and "
576 "is scheduled for removal in early 2008\n");
577 printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n");
578 return 1;
579}
580
581__setup("time", printk_time_setup);
582
583/* Check if we have any console registered that can be called early in boot. */ 569/* Check if we have any console registered that can be called early in boot. */
584static int have_callable_console(void) 570static int have_callable_console(void)
585{ 571{
@@ -790,7 +776,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len)
790 return -ENOSYS; 776 return -ENOSYS;
791} 777}
792 778
793static void call_console_drivers(unsigned long start, unsigned long end) 779static void call_console_drivers(unsigned start, unsigned end)
794{ 780{
795} 781}
796 782
@@ -983,8 +969,8 @@ void wake_up_klogd(void)
983void release_console_sem(void) 969void release_console_sem(void)
984{ 970{
985 unsigned long flags; 971 unsigned long flags;
986 unsigned long _con_start, _log_end; 972 unsigned _con_start, _log_end;
987 unsigned long wake_klogd = 0; 973 unsigned wake_klogd = 0;
988 974
989 if (console_suspended) { 975 if (console_suspended) {
990 up(&secondary_console_sem); 976 up(&secondary_console_sem);
@@ -1265,6 +1251,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1265 return; 1251 return;
1266} 1252}
1267 1253
1254#if defined CONFIG_PRINTK
1268/* 1255/*
1269 * printk rate limiting, lifted from the networking subsystem. 1256 * printk rate limiting, lifted from the networking subsystem.
1270 * 1257 *
@@ -1275,7 +1262,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1275int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1262int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1276{ 1263{
1277 static DEFINE_SPINLOCK(ratelimit_lock); 1264 static DEFINE_SPINLOCK(ratelimit_lock);
1278 static unsigned long toks = 10 * 5 * HZ; 1265 static unsigned toks = 10 * 5 * HZ;
1279 static unsigned long last_msg; 1266 static unsigned long last_msg;
1280 static int missed; 1267 static int missed;
1281 unsigned long flags; 1268 unsigned long flags;
@@ -1334,3 +1321,4 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1334 return false; 1321 return false;
1335} 1322}
1336EXPORT_SYMBOL(printk_timed_ratelimit); 1323EXPORT_SYMBOL(printk_timed_ratelimit);
1324#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index e64c2da11c0f..3b7a1b055122 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -20,7 +20,6 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/cpumask.h> 21#include <linux/cpumask.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/profile.h>
24#include <linux/highmem.h> 23#include <linux/highmem.h>
25#include <linux/mutex.h> 24#include <linux/mutex.h>
26#include <asm/sections.h> 25#include <asm/sections.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b0d4ab4dfd3d..fdb34e86f923 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -20,6 +20,7 @@
20#include <linux/signal.h> 20#include <linux/signal.h>
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h>
23 24
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
@@ -53,7 +54,7 @@ void ptrace_untrace(struct task_struct *child)
53 spin_lock(&child->sighand->siglock); 54 spin_lock(&child->sighand->siglock);
54 if (task_is_traced(child)) { 55 if (task_is_traced(child)) {
55 if (child->signal->flags & SIGNAL_STOP_STOPPED) { 56 if (child->signal->flags & SIGNAL_STOP_STOPPED) {
56 child->state = TASK_STOPPED; 57 __set_task_state(child, TASK_STOPPED);
57 } else { 58 } else {
58 signal_wake_up(child, 1); 59 signal_wake_up(child, 1);
59 } 60 }
@@ -98,23 +99,23 @@ int ptrace_check_attach(struct task_struct *child, int kill)
98 * be changed by us so it's not changing right after this. 99 * be changed by us so it's not changing right after this.
99 */ 100 */
100 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
101 if ((child->ptrace & PT_PTRACED) && child->parent == current && 102 if ((child->ptrace & PT_PTRACED) && child->parent == current) {
102 (!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
103 && child->signal != NULL) {
104 ret = 0; 103 ret = 0;
104 /*
105 * child->sighand can't be NULL, release_task()
106 * does ptrace_unlink() before __exit_signal().
107 */
105 spin_lock_irq(&child->sighand->siglock); 108 spin_lock_irq(&child->sighand->siglock);
106 if (task_is_stopped(child)) { 109 if (task_is_stopped(child))
107 child->state = TASK_TRACED; 110 child->state = TASK_TRACED;
108 } else if (!task_is_traced(child) && !kill) { 111 else if (!task_is_traced(child) && !kill)
109 ret = -ESRCH; 112 ret = -ESRCH;
110 }
111 spin_unlock_irq(&child->sighand->siglock); 113 spin_unlock_irq(&child->sighand->siglock);
112 } 114 }
113 read_unlock(&tasklist_lock); 115 read_unlock(&tasklist_lock);
114 116
115 if (!ret && !kill) { 117 if (!ret && !kill)
116 wait_task_inactive(child); 118 wait_task_inactive(child);
117 }
118 119
119 /* All systems go.. */ 120 /* All systems go.. */
120 return ret; 121 return ret;
@@ -201,8 +202,7 @@ repeat:
201 goto bad; 202 goto bad;
202 203
203 /* Go */ 204 /* Go */
204 task->ptrace |= PT_PTRACED | ((task->real_parent != current) 205 task->ptrace |= PT_PTRACED;
205 ? PT_ATTACHED : 0);
206 if (capable(CAP_SYS_PTRACE)) 206 if (capable(CAP_SYS_PTRACE))
207 task->ptrace |= PT_PTRACE_CAP; 207 task->ptrace |= PT_PTRACE_CAP;
208 208
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 760dfc233a00..c09605f8d16c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count;
56static DEFINE_MUTEX(rcu_barrier_mutex); 56static DEFINE_MUTEX(rcu_barrier_mutex);
57static struct completion rcu_barrier_completion; 57static struct completion rcu_barrier_completion;
58 58
59/* Because of FASTCALL declaration of complete, we use this wrapper */ 59/*
60 * Awaken the corresponding synchronize_rcu() instance now that a
61 * grace period has elapsed.
62 */
60static void wakeme_after_rcu(struct rcu_head *head) 63static void wakeme_after_rcu(struct rcu_head *head)
61{ 64{
62 struct rcu_synchronize *rcu; 65 struct rcu_synchronize *rcu;
diff --git a/kernel/relay.c b/kernel/relay.c
index 7c0373322f18..d080b9d161a7 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -37,37 +37,31 @@ static void relay_file_mmap_close(struct vm_area_struct *vma)
37} 37}
38 38
39/* 39/*
40 * nopage() vm_op implementation for relay file mapping. 40 * fault() vm_op implementation for relay file mapping.
41 */ 41 */
42static struct page *relay_buf_nopage(struct vm_area_struct *vma, 42static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
43 unsigned long address,
44 int *type)
45{ 43{
46 struct page *page; 44 struct page *page;
47 struct rchan_buf *buf = vma->vm_private_data; 45 struct rchan_buf *buf = vma->vm_private_data;
48 unsigned long offset = address - vma->vm_start; 46 pgoff_t pgoff = vmf->pgoff;
49 47
50 if (address > vma->vm_end)
51 return NOPAGE_SIGBUS; /* Disallow mremap */
52 if (!buf) 48 if (!buf)
53 return NOPAGE_OOM; 49 return VM_FAULT_OOM;
54 50
55 page = vmalloc_to_page(buf->start + offset); 51 page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT));
56 if (!page) 52 if (!page)
57 return NOPAGE_OOM; 53 return VM_FAULT_SIGBUS;
58 get_page(page); 54 get_page(page);
55 vmf->page = page;
59 56
60 if (type) 57 return 0;
61 *type = VM_FAULT_MINOR;
62
63 return page;
64} 58}
65 59
66/* 60/*
67 * vm_ops for relay file mappings. 61 * vm_ops for relay file mappings.
68 */ 62 */
69static struct vm_operations_struct relay_file_mmap_ops = { 63static struct vm_operations_struct relay_file_mmap_ops = {
70 .nopage = relay_buf_nopage, 64 .fault = relay_buf_fault,
71 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
72}; 66};
73 67
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
new file mode 100644
index 000000000000..16cbec2d5d60
--- /dev/null
+++ b/kernel/res_counter.c
@@ -0,0 +1,134 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15
16void res_counter_init(struct res_counter *counter)
17{
18 spin_lock_init(&counter->lock);
19 counter->limit = (unsigned long long)LLONG_MAX;
20}
21
22int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
23{
24 if (counter->usage + val > counter->limit) {
25 counter->failcnt++;
26 return -ENOMEM;
27 }
28
29 counter->usage += val;
30 return 0;
31}
32
33int res_counter_charge(struct res_counter *counter, unsigned long val)
34{
35 int ret;
36 unsigned long flags;
37
38 spin_lock_irqsave(&counter->lock, flags);
39 ret = res_counter_charge_locked(counter, val);
40 spin_unlock_irqrestore(&counter->lock, flags);
41 return ret;
42}
43
44void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
45{
46 if (WARN_ON(counter->usage < val))
47 val = counter->usage;
48
49 counter->usage -= val;
50}
51
52void res_counter_uncharge(struct res_counter *counter, unsigned long val)
53{
54 unsigned long flags;
55
56 spin_lock_irqsave(&counter->lock, flags);
57 res_counter_uncharge_locked(counter, val);
58 spin_unlock_irqrestore(&counter->lock, flags);
59}
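
The charge/uncharge pair above is the whole accounting interface a controller needs. A minimal sketch of a controller using it; the wrapper structure, function names and the 4096-byte charge size are illustrative, not part of this patch.

#include <linux/res_counter.h>

/* Illustrative controller-side usage of the res_counter API defined above. */
struct mycg_state {
	struct res_counter res;
};

static void mycg_init(struct mycg_state *st)
{
	res_counter_init(&st->res);	/* limit starts at LLONG_MAX (unlimited) */
}

static int mycg_account_page(struct mycg_state *st)
{
	/* Fails with -ENOMEM and bumps failcnt once usage would exceed limit. */
	return res_counter_charge(&st->res, 4096);
}

static void mycg_unaccount_page(struct mycg_state *st)
{
	res_counter_uncharge(&st->res, 4096);
}
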
60
61
62static inline unsigned long long *
63res_counter_member(struct res_counter *counter, int member)
64{
65 switch (member) {
66 case RES_USAGE:
67 return &counter->usage;
68 case RES_LIMIT:
69 return &counter->limit;
70 case RES_FAILCNT:
71 return &counter->failcnt;
72 };
73
74 BUG();
75 return NULL;
76}
77
78ssize_t res_counter_read(struct res_counter *counter, int member,
79 const char __user *userbuf, size_t nbytes, loff_t *pos,
80 int (*read_strategy)(unsigned long long val, char *st_buf))
81{
82 unsigned long long *val;
83 char buf[64], *s;
84
85 s = buf;
86 val = res_counter_member(counter, member);
87 if (read_strategy)
88 s += read_strategy(*val, s);
89 else
90 s += sprintf(s, "%llu\n", *val);
91 return simple_read_from_buffer((void __user *)userbuf, nbytes,
92 pos, buf, s - buf);
93}
94
95ssize_t res_counter_write(struct res_counter *counter, int member,
96 const char __user *userbuf, size_t nbytes, loff_t *pos,
97 int (*write_strategy)(char *st_buf, unsigned long long *val))
98{
99 int ret;
100 char *buf, *end;
101 unsigned long flags;
102 unsigned long long tmp, *val;
103
104 buf = kmalloc(nbytes + 1, GFP_KERNEL);
105 ret = -ENOMEM;
106 if (buf == NULL)
107 goto out;
108
109 buf[nbytes] = '\0';
110 ret = -EFAULT;
111 if (copy_from_user(buf, userbuf, nbytes))
112 goto out_free;
113
114 ret = -EINVAL;
115
116 if (write_strategy) {
117 if (write_strategy(buf, &tmp)) {
118 goto out_free;
119 }
120 } else {
121 tmp = simple_strtoull(buf, &end, 10);
122 if (*end != '\0')
123 goto out_free;
124 }
125 spin_lock_irqsave(&counter->lock, flags);
126 val = res_counter_member(counter, member);
127 *val = tmp;
128 spin_unlock_irqrestore(&counter->lock, flags);
129 ret = nbytes;
130out_free:
131 kfree(buf);
132out:
133 return ret;
134}
diff --git a/kernel/resource.c b/kernel/resource.c
index 2eb553d9b517..82aea814d409 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -228,7 +228,7 @@ int release_resource(struct resource *old)
228 228
229EXPORT_SYMBOL(release_resource); 229EXPORT_SYMBOL(release_resource);
230 230
231#ifdef CONFIG_MEMORY_HOTPLUG 231#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
232/* 232/*
233 * Finds the lowest memory resource that exists within [res->start..res->end) 233 * Finds the lowest memory resource that exists within [res->start..res->end)
234 * the caller must specify res->start, res->end, res->flags. 234 * the caller must specify res->start, res->end, res->flags.
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 56d73cb8826d..5fcb4fe645e2 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -130,7 +130,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
130 130
131 task = rt_mutex_owner(act_waiter->lock); 131 task = rt_mutex_owner(act_waiter->lock);
132 if (task && task != current) { 132 if (task && task != current) {
133 act_waiter->deadlock_task_pid = task->pid; 133 act_waiter->deadlock_task_pid = get_pid(task_pid(task));
134 act_waiter->deadlock_lock = lock; 134 act_waiter->deadlock_lock = lock;
135 } 135 }
136} 136}
@@ -142,9 +142,12 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
142 if (!waiter->deadlock_lock || !rt_trace_on) 142 if (!waiter->deadlock_lock || !rt_trace_on)
143 return; 143 return;
144 144
145 task = find_task_by_pid(waiter->deadlock_task_pid); 145 rcu_read_lock();
146 if (!task) 146 task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
147 if (!task) {
148 rcu_read_unlock();
147 return; 149 return;
150 }
148 151
149 TRACE_OFF_NOLOCK(); 152 TRACE_OFF_NOLOCK();
150 153
@@ -173,6 +176,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
173 current->comm, task_pid_nr(current)); 176 current->comm, task_pid_nr(current));
174 dump_stack(); 177 dump_stack();
175 debug_show_all_locks(); 178 debug_show_all_locks();
179 rcu_read_unlock();
176 180
177 printk("[ turning off deadlock detection." 181 printk("[ turning off deadlock detection."
178 "Please report this trace. ]\n\n"); 182 "Please report this trace. ]\n\n");
@@ -203,10 +207,12 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
203 memset(waiter, 0x11, sizeof(*waiter)); 207 memset(waiter, 0x11, sizeof(*waiter));
204 plist_node_init(&waiter->list_entry, MAX_PRIO); 208 plist_node_init(&waiter->list_entry, MAX_PRIO);
205 plist_node_init(&waiter->pi_list_entry, MAX_PRIO); 209 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
210 waiter->deadlock_task_pid = NULL;
206} 211}
207 212
208void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 213void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
209{ 214{
215 put_pid(waiter->deadlock_task_pid);
210 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
211 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
212 TRACE_WARN_ON(waiter->task); 218 TRACE_WARN_ON(waiter->task);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 0deef71ff8d2..6522ae5b14a2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
630 set_current_state(state); 630 set_current_state(state);
631 631
632 /* Setup the timer, when timeout != NULL */ 632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) 633 if (unlikely(timeout)) {
634 hrtimer_start(&timeout->timer, timeout->timer.expires, 634 hrtimer_start(&timeout->timer, timeout->timer.expires,
635 HRTIMER_MODE_ABS); 635 HRTIMER_MODE_ABS);
636 if (!hrtimer_active(&timeout->timer))
637 timeout->task = NULL;
638 }
636 639
637 for (;;) { 640 for (;;) {
638 /* Try to acquire the lock: */ 641 /* Try to acquire the lock: */
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 2d3b83593ca3..e124bf5800ea 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -51,7 +51,7 @@ struct rt_mutex_waiter {
51 struct rt_mutex *lock; 51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES 52#ifdef CONFIG_DEBUG_RT_MUTEXES
53 unsigned long ip; 53 unsigned long ip;
54 pid_t deadlock_task_pid; 54 struct pid *deadlock_task_pid;
55 struct rt_mutex *deadlock_lock; 55 struct rt_mutex *deadlock_lock;
56#endif 56#endif
57}; 57};
diff --git a/kernel/sched.c b/kernel/sched.c
index 9474b23c28bf..f28f19e65b59 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -155,7 +155,7 @@ struct rt_prio_array {
155 struct list_head queue[MAX_RT_PRIO]; 155 struct list_head queue[MAX_RT_PRIO];
156}; 156};
157 157
158#ifdef CONFIG_FAIR_GROUP_SCHED 158#ifdef CONFIG_GROUP_SCHED
159 159
160#include <linux/cgroup.h> 160#include <linux/cgroup.h>
161 161
@@ -165,19 +165,16 @@ static LIST_HEAD(task_groups);
165 165
166/* task group related information */ 166/* task group related information */
167struct task_group { 167struct task_group {
168#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_CGROUP_SCHED
169 struct cgroup_subsys_state css; 169 struct cgroup_subsys_state css;
170#endif 170#endif
171
172#ifdef CONFIG_FAIR_GROUP_SCHED
171 /* schedulable entities of this group on each cpu */ 173 /* schedulable entities of this group on each cpu */
172 struct sched_entity **se; 174 struct sched_entity **se;
173 /* runqueue "owned" by this group on each cpu */ 175 /* runqueue "owned" by this group on each cpu */
174 struct cfs_rq **cfs_rq; 176 struct cfs_rq **cfs_rq;
175 177
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 /* 178 /*
182 * shares assigned to a task group governs how much of cpu bandwidth 179 * shares assigned to a task group governs how much of cpu bandwidth
183 * is allocated to the group. The more shares a group has, the more is 180 * is allocated to the group. The more shares a group has, the more is
@@ -213,33 +210,46 @@ struct task_group {
213 * 210 *
214 */ 211 */
215 unsigned long shares; 212 unsigned long shares;
213#endif
214
215#ifdef CONFIG_RT_GROUP_SCHED
216 struct sched_rt_entity **rt_se;
217 struct rt_rq **rt_rq;
218
219 u64 rt_runtime;
220#endif
216 221
217 struct rcu_head rcu; 222 struct rcu_head rcu;
218 struct list_head list; 223 struct list_head list;
219}; 224};
220 225
226#ifdef CONFIG_FAIR_GROUP_SCHED
221/* Default task group's sched entity on each cpu */ 227/* Default task group's sched entity on each cpu */
222static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 228static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
223/* Default task group's cfs_rq on each cpu */ 229/* Default task group's cfs_rq on each cpu */
224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 230static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
225 231
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
229static struct sched_entity *init_sched_entity_p[NR_CPUS]; 232static struct sched_entity *init_sched_entity_p[NR_CPUS];
230static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 233static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
234#endif
235
236#ifdef CONFIG_RT_GROUP_SCHED
237static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
238static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
231 239
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; 240static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS]; 241static struct rt_rq *init_rt_rq_p[NR_CPUS];
242#endif
234 243
235/* task_group_mutex serializes add/remove of task groups and also changes to 244/* task_group_lock serializes add/remove of task groups and also changes to
236 * a task group's cpu shares. 245 * a task group's cpu shares.
237 */ 246 */
238static DEFINE_MUTEX(task_group_mutex); 247static DEFINE_SPINLOCK(task_group_lock);
239 248
240/* doms_cur_mutex serializes access to doms_cur[] array */ 249/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex); 250static DEFINE_MUTEX(doms_cur_mutex);
242 251
252#ifdef CONFIG_FAIR_GROUP_SCHED
243#ifdef CONFIG_SMP 253#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */ 254/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task; 255static struct task_struct *lb_monitor_task;
@@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused);
248 258
249static void set_se_shares(struct sched_entity *se, unsigned long shares); 259static void set_se_shares(struct sched_entity *se, unsigned long shares);
250 260
261#ifdef CONFIG_USER_SCHED
262# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
263#else
264# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
265#endif
266
267#define MIN_GROUP_SHARES 2
268
269static int init_task_group_load = INIT_TASK_GROUP_LOAD;
270#endif
271
251/* Default task group. 272/* Default task group.
252 * Every task in system belong to this group at bootup. 273 * Every task in system belong to this group at bootup.
253 */ 274 */
254struct task_group init_task_group = { 275struct task_group init_task_group = {
276#ifdef CONFIG_FAIR_GROUP_SCHED
255 .se = init_sched_entity_p, 277 .se = init_sched_entity_p,
256 .cfs_rq = init_cfs_rq_p, 278 .cfs_rq = init_cfs_rq_p,
279#endif
257 280
281#ifdef CONFIG_RT_GROUP_SCHED
258 .rt_se = init_sched_rt_entity_p, 282 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p, 283 .rt_rq = init_rt_rq_p,
260};
261
262#ifdef CONFIG_FAIR_USER_SCHED
263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
264#else
265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
266#endif 284#endif
267 285};
268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
271 286
272/* return group to which a task belongs */ 287/* return group to which a task belongs */
273static inline struct task_group *task_group(struct task_struct *p) 288static inline struct task_group *task_group(struct task_struct *p)
274{ 289{
275 struct task_group *tg; 290 struct task_group *tg;
276 291
277#ifdef CONFIG_FAIR_USER_SCHED 292#ifdef CONFIG_USER_SCHED
278 tg = p->user->tg; 293 tg = p->user->tg;
279#elif defined(CONFIG_FAIR_CGROUP_SCHED) 294#elif defined(CONFIG_CGROUP_SCHED)
280 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 295 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
281 struct task_group, css); 296 struct task_group, css);
282#else 297#else
@@ -288,21 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p)
288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 303/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
289static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 304static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
290{ 305{
306#ifdef CONFIG_FAIR_GROUP_SCHED
291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 307 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
292 p->se.parent = task_group(p)->se[cpu]; 308 p->se.parent = task_group(p)->se[cpu];
309#endif
293 310
311#ifdef CONFIG_RT_GROUP_SCHED
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 312 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu]; 313 p->rt.parent = task_group(p)->rt_se[cpu];
296} 314#endif
297
298static inline void lock_task_group_list(void)
299{
300 mutex_lock(&task_group_mutex);
301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306} 315}
307 316
308static inline void lock_doms_cur(void) 317static inline void lock_doms_cur(void)
@@ -318,12 +327,10 @@ static inline void unlock_doms_cur(void)
318#else 327#else
319 328
320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 329static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { } 330static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { } 331static inline void unlock_doms_cur(void) { }
325 332
326#endif /* CONFIG_FAIR_GROUP_SCHED */ 333#endif /* CONFIG_GROUP_SCHED */
327 334
328/* CFS-related fields in a runqueue */ 335/* CFS-related fields in a runqueue */
329struct cfs_rq { 336struct cfs_rq {
@@ -363,7 +370,7 @@ struct cfs_rq {
363struct rt_rq { 370struct rt_rq {
364 struct rt_prio_array active; 371 struct rt_prio_array active;
365 unsigned long rt_nr_running; 372 unsigned long rt_nr_running;
366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 373#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */ 374 int highest_prio; /* highest queued rt task prio */
368#endif 375#endif
369#ifdef CONFIG_SMP 376#ifdef CONFIG_SMP
@@ -373,7 +380,9 @@ struct rt_rq {
373 int rt_throttled; 380 int rt_throttled;
374 u64 rt_time; 381 u64 rt_time;
375 382
376#ifdef CONFIG_FAIR_GROUP_SCHED 383#ifdef CONFIG_RT_GROUP_SCHED
384 unsigned long rt_nr_boosted;
385
377 struct rq *rq; 386 struct rq *rq;
378 struct list_head leaf_rt_rq_list; 387 struct list_head leaf_rt_rq_list;
379 struct task_group *tg; 388 struct task_group *tg;
@@ -447,6 +456,8 @@ struct rq {
447#ifdef CONFIG_FAIR_GROUP_SCHED 456#ifdef CONFIG_FAIR_GROUP_SCHED
448 /* list of leaf cfs_rq on this cpu: */ 457 /* list of leaf cfs_rq on this cpu: */
449 struct list_head leaf_cfs_rq_list; 458 struct list_head leaf_cfs_rq_list;
459#endif
460#ifdef CONFIG_RT_GROUP_SCHED
450 struct list_head leaf_rt_rq_list; 461 struct list_head leaf_rt_rq_list;
451#endif 462#endif
452 463
@@ -652,19 +663,21 @@ const_debug unsigned int sysctl_sched_features =
652const_debug unsigned int sysctl_sched_nr_migrate = 32; 663const_debug unsigned int sysctl_sched_nr_migrate = 32;
653 664
654/* 665/*
655 * period over which we measure -rt task cpu usage in ms. 666 * period over which we measure -rt task cpu usage in us.
656 * default: 1s 667 * default: 1s
657 */ 668 */
658const_debug unsigned int sysctl_sched_rt_period = 1000; 669unsigned int sysctl_sched_rt_period = 1000000;
659 670
660#define SCHED_RT_FRAC_SHIFT 16 671/*
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) 672 * part of the period that we allow rt tasks to run in us.
673 * default: 0.95s
674 */
675int sysctl_sched_rt_runtime = 950000;
662 676
663/* 677/*
664 * ratio of time -rt tasks may consume. 678 * single value that denotes runtime == period, ie unlimited time.
665 * default: 95%
666 */ 679 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259; 680#define RUNTIME_INF ((u64)~0ULL)
668 681
669/* 682/*
670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 683 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -1893,13 +1906,13 @@ out:
1893 return success; 1906 return success;
1894} 1907}
1895 1908
1896int fastcall wake_up_process(struct task_struct *p) 1909int wake_up_process(struct task_struct *p)
1897{ 1910{
1898 return try_to_wake_up(p, TASK_ALL, 0); 1911 return try_to_wake_up(p, TASK_ALL, 0);
1899} 1912}
1900EXPORT_SYMBOL(wake_up_process); 1913EXPORT_SYMBOL(wake_up_process);
1901 1914
1902int fastcall wake_up_state(struct task_struct *p, unsigned int state) 1915int wake_up_state(struct task_struct *p, unsigned int state)
1903{ 1916{
1904 return try_to_wake_up(p, state, 0); 1917 return try_to_wake_up(p, state, 0);
1905} 1918}
@@ -1986,7 +1999,7 @@ void sched_fork(struct task_struct *p, int clone_flags)
1986 * that must be done for every newly created context, then puts the task 1999 * that must be done for every newly created context, then puts the task
1987 * on the runqueue and wakes it. 2000 * on the runqueue and wakes it.
1988 */ 2001 */
1989void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2002void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1990{ 2003{
1991 unsigned long flags; 2004 unsigned long flags;
1992 struct rq *rq; 2005 struct rq *rq;
@@ -3753,7 +3766,7 @@ void scheduler_tick(void)
3753 3766
3754#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 3767#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3755 3768
3756void fastcall add_preempt_count(int val) 3769void add_preempt_count(int val)
3757{ 3770{
3758 /* 3771 /*
3759 * Underflow? 3772 * Underflow?
@@ -3769,7 +3782,7 @@ void fastcall add_preempt_count(int val)
3769} 3782}
3770EXPORT_SYMBOL(add_preempt_count); 3783EXPORT_SYMBOL(add_preempt_count);
3771 3784
3772void fastcall sub_preempt_count(int val) 3785void sub_preempt_count(int val)
3773{ 3786{
3774 /* 3787 /*
3775 * Underflow? 3788 * Underflow?
@@ -4067,7 +4080,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4067 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4080 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4068 * @key: is directly passed to the wakeup function 4081 * @key: is directly passed to the wakeup function
4069 */ 4082 */
4070void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 4083void __wake_up(wait_queue_head_t *q, unsigned int mode,
4071 int nr_exclusive, void *key) 4084 int nr_exclusive, void *key)
4072{ 4085{
4073 unsigned long flags; 4086 unsigned long flags;
@@ -4081,7 +4094,7 @@ EXPORT_SYMBOL(__wake_up);
4081/* 4094/*
4082 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 4095 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4083 */ 4096 */
4084void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 4097void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4085{ 4098{
4086 __wake_up_common(q, mode, 1, 0, NULL); 4099 __wake_up_common(q, mode, 1, 0, NULL);
4087} 4100}
@@ -4099,7 +4112,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4099 * 4112 *
4100 * On UP it can prevent extra preemption. 4113 * On UP it can prevent extra preemption.
4101 */ 4114 */
4102void fastcall 4115void
4103__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 4116__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4104{ 4117{
4105 unsigned long flags; 4118 unsigned long flags;
@@ -4571,6 +4584,15 @@ recheck:
4571 return -EPERM; 4584 return -EPERM;
4572 } 4585 }
4573 4586
4587#ifdef CONFIG_RT_GROUP_SCHED
4588 /*
4589 * Do not allow realtime tasks into groups that have no runtime
4590 * assigned.
4591 */
4592 if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
4593 return -EPERM;
4594#endif
4595
4574 retval = security_task_setscheduler(p, policy, param); 4596 retval = security_task_setscheduler(p, policy, param);
4575 if (retval) 4597 if (retval)
4576 return retval; 4598 return retval;
@@ -7112,7 +7134,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7112 /* delimiter for bitsearch: */ 7134 /* delimiter for bitsearch: */
7113 __set_bit(MAX_RT_PRIO, array->bitmap); 7135 __set_bit(MAX_RT_PRIO, array->bitmap);
7114 7136
7115#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 7137#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7116 rt_rq->highest_prio = MAX_RT_PRIO; 7138 rt_rq->highest_prio = MAX_RT_PRIO;
7117#endif 7139#endif
7118#ifdef CONFIG_SMP 7140#ifdef CONFIG_SMP
@@ -7123,7 +7145,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7123 rt_rq->rt_time = 0; 7145 rt_rq->rt_time = 0;
7124 rt_rq->rt_throttled = 0; 7146 rt_rq->rt_throttled = 0;
7125 7147
7126#ifdef CONFIG_FAIR_GROUP_SCHED 7148#ifdef CONFIG_RT_GROUP_SCHED
7149 rt_rq->rt_nr_boosted = 0;
7127 rt_rq->rq = rq; 7150 rt_rq->rq = rq;
7128#endif 7151#endif
7129} 7152}
@@ -7146,7 +7169,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7146 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 7169 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7147 se->parent = NULL; 7170 se->parent = NULL;
7148} 7171}
7172#endif
7149 7173
7174#ifdef CONFIG_RT_GROUP_SCHED
7150static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, 7175static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7151 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 7176 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7152 int cpu, int add) 7177 int cpu, int add)
@@ -7175,7 +7200,7 @@ void __init sched_init(void)
7175 init_defrootdomain(); 7200 init_defrootdomain();
7176#endif 7201#endif
7177 7202
7178#ifdef CONFIG_FAIR_GROUP_SCHED 7203#ifdef CONFIG_GROUP_SCHED
7179 list_add(&init_task_group.list, &task_groups); 7204 list_add(&init_task_group.list, &task_groups);
7180#endif 7205#endif
7181 7206
@@ -7196,7 +7221,10 @@ void __init sched_init(void)
7196 &per_cpu(init_cfs_rq, i), 7221 &per_cpu(init_cfs_rq, i),
7197 &per_cpu(init_sched_entity, i), i, 1); 7222 &per_cpu(init_sched_entity, i), i, 1);
7198 7223
7199 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ 7224#endif
7225#ifdef CONFIG_RT_GROUP_SCHED
7226 init_task_group.rt_runtime =
7227 sysctl_sched_rt_runtime * NSEC_PER_USEC;
7200 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7228 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7201 init_tg_rt_entry(rq, &init_task_group, 7229 init_tg_rt_entry(rq, &init_task_group,
7202 &per_cpu(init_rt_rq, i), 7230 &per_cpu(init_rt_rq, i),
@@ -7303,7 +7331,7 @@ void normalize_rt_tasks(void)
7303 unsigned long flags; 7331 unsigned long flags;
7304 struct rq *rq; 7332 struct rq *rq;
7305 7333
7306 read_lock_irq(&tasklist_lock); 7334 read_lock_irqsave(&tasklist_lock, flags);
7307 do_each_thread(g, p) { 7335 do_each_thread(g, p) {
7308 /* 7336 /*
7309 * Only normalize user tasks: 7337 * Only normalize user tasks:
@@ -7329,16 +7357,16 @@ void normalize_rt_tasks(void)
7329 continue; 7357 continue;
7330 } 7358 }
7331 7359
7332 spin_lock_irqsave(&p->pi_lock, flags); 7360 spin_lock(&p->pi_lock);
7333 rq = __task_rq_lock(p); 7361 rq = __task_rq_lock(p);
7334 7362
7335 normalize_task(rq, p); 7363 normalize_task(rq, p);
7336 7364
7337 __task_rq_unlock(rq); 7365 __task_rq_unlock(rq);
7338 spin_unlock_irqrestore(&p->pi_lock, flags); 7366 spin_unlock(&p->pi_lock);
7339 } while_each_thread(g, p); 7367 } while_each_thread(g, p);
7340 7368
7341 read_unlock_irq(&tasklist_lock); 7369 read_unlock_irqrestore(&tasklist_lock, flags);
7342} 7370}
7343 7371
7344#endif /* CONFIG_MAGIC_SYSRQ */ 7372#endif /* CONFIG_MAGIC_SYSRQ */
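The hunk above moves interrupt disabling in normalize_rt_tasks() from the inner per-task pi_lock out to the tasklist_lock: read_lock_irqsave() saves the flags once for the whole walk, so the inner lock can become a plain spin_lock(). The little program below is only an illustration of that nesting; every "lock" in it is a user-space stub invented for the example, not kernel code.

/* Illustration only: fake locks model the new nesting -- interrupts are
 * "disabled" once by the outer read_lock_irqsave(), so the inner per-task
 * lock needs no flag saving of its own. */
#include <stdio.h>

static void read_lock_irqsave(unsigned long *flags)
{
	*flags = 1;	/* pretend we saved the IRQ state here */
	puts("tasklist_lock taken, interrupts disabled once");
}

static void read_unlock_irqrestore(unsigned long flags)
{
	(void)flags;
	puts("tasklist_lock released, interrupts restored");
}

static void spin_lock(const char *name)   { printf("  %s taken (plain spin_lock)\n", name); }
static void spin_unlock(const char *name) { printf("  %s released\n", name); }

int main(void)
{
	unsigned long flags;
	int task;

	read_lock_irqsave(&flags);		/* was read_lock_irq() */
	for (task = 0; task < 3; task++) {
		spin_lock("p->pi_lock");	/* was spin_lock_irqsave() */
		printf("  normalize task %d\n", task);
		spin_unlock("p->pi_lock");	/* was spin_unlock_irqrestore() */
	}
	read_unlock_irqrestore(flags);
	return 0;
}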
@@ -7387,9 +7415,9 @@ void set_curr_task(int cpu, struct task_struct *p)
7387 7415
7388#endif 7416#endif
7389 7417
7390#ifdef CONFIG_FAIR_GROUP_SCHED 7418#ifdef CONFIG_GROUP_SCHED
7391 7419
7392#ifdef CONFIG_SMP 7420#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7393/* 7421/*
7394 * distribute shares of all task groups among their schedulable entities, 7422 * distribute shares of all task groups among their schedulable entities,
7395 * to reflect load distribution across cpus. 7423 * to reflect load distribution across cpus.
@@ -7540,7 +7568,8 @@ static int load_balance_monitor(void *unused)
7540} 7568}
7541#endif /* CONFIG_SMP */ 7569#endif /* CONFIG_SMP */
7542 7570
7543static void free_sched_group(struct task_group *tg) 7571#ifdef CONFIG_FAIR_GROUP_SCHED
7572static void free_fair_sched_group(struct task_group *tg)
7544{ 7573{
7545 int i; 7574 int i;
7546 7575
@@ -7549,49 +7578,27 @@ static void free_sched_group(struct task_group *tg)
7549 kfree(tg->cfs_rq[i]); 7578 kfree(tg->cfs_rq[i]);
7550 if (tg->se) 7579 if (tg->se)
7551 kfree(tg->se[i]); 7580 kfree(tg->se[i]);
7552 if (tg->rt_rq)
7553 kfree(tg->rt_rq[i]);
7554 if (tg->rt_se)
7555 kfree(tg->rt_se[i]);
7556 } 7581 }
7557 7582
7558 kfree(tg->cfs_rq); 7583 kfree(tg->cfs_rq);
7559 kfree(tg->se); 7584 kfree(tg->se);
7560 kfree(tg->rt_rq);
7561 kfree(tg->rt_se);
7562 kfree(tg);
7563} 7585}
7564 7586
7565/* allocate runqueue etc for a new task group */ 7587static int alloc_fair_sched_group(struct task_group *tg)
7566struct task_group *sched_create_group(void)
7567{ 7588{
7568 struct task_group *tg;
7569 struct cfs_rq *cfs_rq; 7589 struct cfs_rq *cfs_rq;
7570 struct sched_entity *se; 7590 struct sched_entity *se;
7571 struct rt_rq *rt_rq;
7572 struct sched_rt_entity *rt_se;
7573 struct rq *rq; 7591 struct rq *rq;
7574 int i; 7592 int i;
7575 7593
7576 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7577 if (!tg)
7578 return ERR_PTR(-ENOMEM);
7579
7580 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); 7594 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
7581 if (!tg->cfs_rq) 7595 if (!tg->cfs_rq)
7582 goto err; 7596 goto err;
7583 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7597 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
7584 if (!tg->se) 7598 if (!tg->se)
7585 goto err; 7599 goto err;
7586 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7587 if (!tg->rt_rq)
7588 goto err;
7589 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7590 if (!tg->rt_se)
7591 goto err;
7592 7600
7593 tg->shares = NICE_0_LOAD; 7601 tg->shares = NICE_0_LOAD;
7594 tg->rt_ratio = 0; /* XXX */
7595 7602
7596 for_each_possible_cpu(i) { 7603 for_each_possible_cpu(i) {
7597 rq = cpu_rq(i); 7604 rq = cpu_rq(i);
@@ -7606,6 +7613,79 @@ struct task_group *sched_create_group(void)
7606 if (!se) 7613 if (!se)
7607 goto err; 7614 goto err;
7608 7615
7616 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7617 }
7618
7619 return 1;
7620
7621 err:
7622 return 0;
7623}
7624
7625static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7626{
7627 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
7628 &cpu_rq(cpu)->leaf_cfs_rq_list);
7629}
7630
7631static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7632{
7633 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
7634}
7635#else
7636static inline void free_fair_sched_group(struct task_group *tg)
7637{
7638}
7639
7640static inline int alloc_fair_sched_group(struct task_group *tg)
7641{
7642 return 1;
7643}
7644
7645static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7646{
7647}
7648
7649static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7650{
7651}
7652#endif
7653
7654#ifdef CONFIG_RT_GROUP_SCHED
7655static void free_rt_sched_group(struct task_group *tg)
7656{
7657 int i;
7658
7659 for_each_possible_cpu(i) {
7660 if (tg->rt_rq)
7661 kfree(tg->rt_rq[i]);
7662 if (tg->rt_se)
7663 kfree(tg->rt_se[i]);
7664 }
7665
7666 kfree(tg->rt_rq);
7667 kfree(tg->rt_se);
7668}
7669
7670static int alloc_rt_sched_group(struct task_group *tg)
7671{
7672 struct rt_rq *rt_rq;
7673 struct sched_rt_entity *rt_se;
7674 struct rq *rq;
7675 int i;
7676
7677 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7678 if (!tg->rt_rq)
7679 goto err;
7680 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7681 if (!tg->rt_se)
7682 goto err;
7683
7684 tg->rt_runtime = 0;
7685
7686 for_each_possible_cpu(i) {
7687 rq = cpu_rq(i);
7688
7609 rt_rq = kmalloc_node(sizeof(struct rt_rq), 7689 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7610 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 7690 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7611 if (!rt_rq) 7691 if (!rt_rq)
@@ -7616,20 +7696,75 @@ struct task_group *sched_create_group(void)
7616 if (!rt_se) 7696 if (!rt_se)
7617 goto err; 7697 goto err;
7618 7698
7619 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7620 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); 7699 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7621 } 7700 }
7622 7701
7623 lock_task_group_list(); 7702 return 1;
7703
7704 err:
7705 return 0;
7706}
7707
7708static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7709{
7710 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
7711 &cpu_rq(cpu)->leaf_rt_rq_list);
7712}
7713
7714static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7715{
7716 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
7717}
7718#else
7719static inline void free_rt_sched_group(struct task_group *tg)
7720{
7721}
7722
7723static inline int alloc_rt_sched_group(struct task_group *tg)
7724{
7725 return 1;
7726}
7727
7728static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7729{
7730}
7731
7732static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7733{
7734}
7735#endif
7736
7737static void free_sched_group(struct task_group *tg)
7738{
7739 free_fair_sched_group(tg);
7740 free_rt_sched_group(tg);
7741 kfree(tg);
7742}
7743
7744/* allocate runqueue etc for a new task group */
7745struct task_group *sched_create_group(void)
7746{
7747 struct task_group *tg;
7748 unsigned long flags;
7749 int i;
7750
7751 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7752 if (!tg)
7753 return ERR_PTR(-ENOMEM);
7754
7755 if (!alloc_fair_sched_group(tg))
7756 goto err;
7757
7758 if (!alloc_rt_sched_group(tg))
7759 goto err;
7760
7761 spin_lock_irqsave(&task_group_lock, flags);
7624 for_each_possible_cpu(i) { 7762 for_each_possible_cpu(i) {
7625 rq = cpu_rq(i); 7763 register_fair_sched_group(tg, i);
7626 cfs_rq = tg->cfs_rq[i]; 7764 register_rt_sched_group(tg, i);
7627 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7628 rt_rq = tg->rt_rq[i];
7629 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7630 } 7765 }
7631 list_add_rcu(&tg->list, &task_groups); 7766 list_add_rcu(&tg->list, &task_groups);
7632 unlock_task_group_list(); 7767 spin_unlock_irqrestore(&task_group_lock, flags);
7633 7768
7634 return tg; 7769 return tg;
7635 7770
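sched_create_group() is split above into class-specific helpers: all allocation happens with no lock held, and only the cheap list insertions run under the new task_group_lock spinlock. A minimal user-space sketch of that allocate-then-publish shape, with invented names (struct group, group_lock, create_group), might look like this:

/* Sketch of the allocate-first, publish-under-lock pattern used by the
 * reworked sched_create_group(). All names here are made up. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
	int id;
	struct group *next;
};

static struct group *group_list;	/* protected by group_lock */
static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

static struct group *create_group(int id)
{
	/* Step 1: allocate and initialise with no lock held (may block). */
	struct group *g = calloc(1, sizeof(*g));
	if (!g)
		return NULL;
	g->id = id;

	/* Step 2: publish under the lock -- only cheap pointer updates here. */
	pthread_mutex_lock(&group_lock);
	g->next = group_list;
	group_list = g;
	pthread_mutex_unlock(&group_lock);

	return g;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		create_group(i);
	for (struct group *g = group_list; g; g = g->next)
		printf("group %d\n", g->id);
	return 0;
}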
@@ -7648,21 +7783,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7648/* Destroy runqueue etc associated with a task group */ 7783/* Destroy runqueue etc associated with a task group */
7649void sched_destroy_group(struct task_group *tg) 7784void sched_destroy_group(struct task_group *tg)
7650{ 7785{
7651 struct cfs_rq *cfs_rq = NULL; 7786 unsigned long flags;
7652 struct rt_rq *rt_rq = NULL;
7653 int i; 7787 int i;
7654 7788
7655 lock_task_group_list(); 7789 spin_lock_irqsave(&task_group_lock, flags);
7656 for_each_possible_cpu(i) { 7790 for_each_possible_cpu(i) {
7657 cfs_rq = tg->cfs_rq[i]; 7791 unregister_fair_sched_group(tg, i);
7658 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7792 unregister_rt_sched_group(tg, i);
7659 rt_rq = tg->rt_rq[i];
7660 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7661 } 7793 }
7662 list_del_rcu(&tg->list); 7794 list_del_rcu(&tg->list);
7663 unlock_task_group_list(); 7795 spin_unlock_irqrestore(&task_group_lock, flags);
7664
7665 BUG_ON(!cfs_rq);
7666 7796
7667 /* wait for possible concurrent references to cfs_rqs complete */ 7797 /* wait for possible concurrent references to cfs_rqs complete */
7668 call_rcu(&tg->rcu, free_sched_group_rcu); 7798 call_rcu(&tg->rcu, free_sched_group_rcu);
@@ -7703,6 +7833,7 @@ void sched_move_task(struct task_struct *tsk)
7703 task_rq_unlock(rq, &flags); 7833 task_rq_unlock(rq, &flags);
7704} 7834}
7705 7835
7836#ifdef CONFIG_FAIR_GROUP_SCHED
7706/* rq->lock to be locked by caller */ 7837/* rq->lock to be locked by caller */
7707static void set_se_shares(struct sched_entity *se, unsigned long shares) 7838static void set_se_shares(struct sched_entity *se, unsigned long shares)
7708{ 7839{
@@ -7728,13 +7859,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
7728 } 7859 }
7729} 7860}
7730 7861
7862static DEFINE_MUTEX(shares_mutex);
7863
7731int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7864int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7732{ 7865{
7733 int i; 7866 int i;
7734 struct cfs_rq *cfs_rq; 7867 unsigned long flags;
7735 struct rq *rq;
7736 7868
7737 lock_task_group_list(); 7869 mutex_lock(&shares_mutex);
7738 if (tg->shares == shares) 7870 if (tg->shares == shares)
7739 goto done; 7871 goto done;
7740 7872
@@ -7746,10 +7878,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7746 * load_balance_fair) from referring to this group first, 7878 * load_balance_fair) from referring to this group first,
7747 * by taking it off the rq->leaf_cfs_rq_list on each cpu. 7879 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7748 */ 7880 */
7749 for_each_possible_cpu(i) { 7881 spin_lock_irqsave(&task_group_lock, flags);
7750 cfs_rq = tg->cfs_rq[i]; 7882 for_each_possible_cpu(i)
7751 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7883 unregister_fair_sched_group(tg, i);
7752 } 7884 spin_unlock_irqrestore(&task_group_lock, flags);
7753 7885
7754 /* wait for any ongoing reference to this group to finish */ 7886 /* wait for any ongoing reference to this group to finish */
7755 synchronize_sched(); 7887 synchronize_sched();
@@ -7769,13 +7901,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7769 * Enable load balance activity on this group, by inserting it back on 7901 * Enable load balance activity on this group, by inserting it back on
7770 * each cpu's rq->leaf_cfs_rq_list. 7902 * each cpu's rq->leaf_cfs_rq_list.
7771 */ 7903 */
7772 for_each_possible_cpu(i) { 7904 spin_lock_irqsave(&task_group_lock, flags);
7773 rq = cpu_rq(i); 7905 for_each_possible_cpu(i)
7774 cfs_rq = tg->cfs_rq[i]; 7906 register_fair_sched_group(tg, i);
7775 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7907 spin_unlock_irqrestore(&task_group_lock, flags);
7776 }
7777done: 7908done:
7778 unlock_task_group_list(); 7909 mutex_unlock(&shares_mutex);
7779 return 0; 7910 return 0;
7780} 7911}
7781 7912
@@ -7783,35 +7914,84 @@ unsigned long sched_group_shares(struct task_group *tg)
7783{ 7914{
7784 return tg->shares; 7915 return tg->shares;
7785} 7916}
7917#endif
7786 7918
7919#ifdef CONFIG_RT_GROUP_SCHED
7787/* 7920/*
7788 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio 7921 * Ensure that the real time constraints are schedulable.
7789 */ 7922 */
7790int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) 7923static DEFINE_MUTEX(rt_constraints_mutex);
7924
7925static unsigned long to_ratio(u64 period, u64 runtime)
7926{
7927 if (runtime == RUNTIME_INF)
7928 return 1ULL << 16;
7929
7930 runtime *= (1ULL << 16);
7931 div64_64(runtime, period);
7932 return runtime;
7933}
7934
7935static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7791{ 7936{
7792 struct task_group *tgi; 7937 struct task_group *tgi;
7793 unsigned long total = 0; 7938 unsigned long total = 0;
7939 unsigned long global_ratio =
7940 to_ratio(sysctl_sched_rt_period,
7941 sysctl_sched_rt_runtime < 0 ?
7942 RUNTIME_INF : sysctl_sched_rt_runtime);
7794 7943
7795 rcu_read_lock(); 7944 rcu_read_lock();
7796 list_for_each_entry_rcu(tgi, &task_groups, list) 7945 list_for_each_entry_rcu(tgi, &task_groups, list) {
7797 total += tgi->rt_ratio; 7946 if (tgi == tg)
7798 rcu_read_unlock(); 7947 continue;
7799 7948
7800 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) 7949 total += to_ratio(period, tgi->rt_runtime);
7801 return -EINVAL; 7950 }
7951 rcu_read_unlock();
7802 7952
7803 tg->rt_ratio = rt_ratio; 7953 return total + to_ratio(period, runtime) < global_ratio;
7804 return 0;
7805} 7954}
7806 7955
7807unsigned long sched_group_rt_ratio(struct task_group *tg) 7956int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7808{ 7957{
7809 return tg->rt_ratio; 7958 u64 rt_runtime, rt_period;
7959 int err = 0;
7960
7961 rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
7962 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7963 if (rt_runtime_us == -1)
7964 rt_runtime = rt_period;
7965
7966 mutex_lock(&rt_constraints_mutex);
7967 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
7968 err = -EINVAL;
7969 goto unlock;
7970 }
7971 if (rt_runtime_us == -1)
7972 rt_runtime = RUNTIME_INF;
7973 tg->rt_runtime = rt_runtime;
7974 unlock:
7975 mutex_unlock(&rt_constraints_mutex);
7976
7977 return err;
7810} 7978}
7811 7979
7812#endif /* CONFIG_FAIR_GROUP_SCHED */ 7980long sched_group_rt_runtime(struct task_group *tg)
7981{
7982 u64 rt_runtime_us;
7983
7984 if (tg->rt_runtime == RUNTIME_INF)
7985 return -1;
7986
7987 rt_runtime_us = tg->rt_runtime;
7988 do_div(rt_runtime_us, NSEC_PER_USEC);
7989 return rt_runtime_us;
7990}
7991#endif
7992#endif /* CONFIG_GROUP_SCHED */
7813 7993
7814#ifdef CONFIG_FAIR_CGROUP_SCHED 7994#ifdef CONFIG_CGROUP_SCHED
7815 7995
7816/* return corresponding task_group object of a cgroup */ 7996/* return corresponding task_group object of a cgroup */
7817static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7997static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
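The new __rt_schedulable() expresses each group's bandwidth as a 16-bit fixed-point fraction of its period and rejects a setting whose sum would pass the global limit derived from sysctl_sched_rt_period/sysctl_sched_rt_runtime. The stand-alone program below recomputes that check with plain integer math; the period, runtimes and group list are made up for the example, and it returns the quotient explicitly rather than mirroring the kernel helper line for line.

/* Fixed-point admission check in the spirit of to_ratio()/__rt_schedulable().
 * Example values are invented; RUNTIME_INF mirrors the "-1 means unlimited"
 * convention of rt_runtime_us. */
#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF	((uint64_t)~0ULL)

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == RUNTIME_INF)
		return 1ULL << 16;		/* 100% in 16-bit fixed point */
	return (runtime << 16) / period;	/* fraction of the period */
}

int main(void)
{
	/* Global limit: 950000us of RT time per 1000000us period (95%). */
	uint64_t period = 1000000, global_runtime = 950000;
	uint64_t global = to_ratio(period, global_runtime);

	/* Existing groups' runtimes, plus the one we want to admit. */
	uint64_t groups[] = { 200000, 300000 };
	uint64_t new_runtime = 400000;

	uint64_t total = 0;
	for (unsigned i = 0; i < sizeof(groups) / sizeof(groups[0]); i++)
		total += to_ratio(period, groups[i]);

	if (total + to_ratio(period, new_runtime) < global)
		puts("admit: combined RT bandwidth stays under the global cap");
	else
		puts("reject: would exceed the global RT bandwidth");
	return 0;
}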
@@ -7857,9 +8037,15 @@ static int
7857cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8037cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7858 struct task_struct *tsk) 8038 struct task_struct *tsk)
7859{ 8039{
8040#ifdef CONFIG_RT_GROUP_SCHED
8041 /* Don't accept realtime tasks when there is no way for them to run */
8042 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
8043 return -EINVAL;
8044#else
7860 /* We don't support RT-tasks being in separate groups */ 8045 /* We don't support RT-tasks being in separate groups */
7861 if (tsk->sched_class != &fair_sched_class) 8046 if (tsk->sched_class != &fair_sched_class)
7862 return -EINVAL; 8047 return -EINVAL;
8048#endif
7863 8049
7864 return 0; 8050 return 0;
7865} 8051}
@@ -7871,6 +8057,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7871 sched_move_task(tsk); 8057 sched_move_task(tsk);
7872} 8058}
7873 8059
8060#ifdef CONFIG_FAIR_GROUP_SCHED
7874static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8061static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7875 u64 shareval) 8062 u64 shareval)
7876{ 8063{
@@ -7883,31 +8070,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7883 8070
7884 return (u64) tg->shares; 8071 return (u64) tg->shares;
7885} 8072}
8073#endif
7886 8074
7887static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8075#ifdef CONFIG_RT_GROUP_SCHED
7888 u64 rt_ratio_val) 8076static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8077 struct file *file,
8078 const char __user *userbuf,
8079 size_t nbytes, loff_t *unused_ppos)
7889{ 8080{
7890 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); 8081 char buffer[64];
8082 int retval = 0;
8083 s64 val;
8084 char *end;
8085
8086 if (!nbytes)
8087 return -EINVAL;
8088 if (nbytes >= sizeof(buffer))
8089 return -E2BIG;
8090 if (copy_from_user(buffer, userbuf, nbytes))
8091 return -EFAULT;
8092
8093 buffer[nbytes] = 0; /* nul-terminate */
8094
8095 /* strip newline if necessary */
8096 if (nbytes && (buffer[nbytes-1] == '\n'))
8097 buffer[nbytes-1] = 0;
8098 val = simple_strtoll(buffer, &end, 0);
8099 if (*end)
8100 return -EINVAL;
8101
8102 /* Pass to subsystem */
8103 retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8104 if (!retval)
8105 retval = nbytes;
8106 return retval;
7891} 8107}
7892 8108
7893static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) 8109static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
8110 struct file *file,
8111 char __user *buf, size_t nbytes,
8112 loff_t *ppos)
7894{ 8113{
7895 struct task_group *tg = cgroup_tg(cgrp); 8114 char tmp[64];
8115 long val = sched_group_rt_runtime(cgroup_tg(cgrp));
8116 int len = sprintf(tmp, "%ld\n", val);
7896 8117
7897 return (u64) tg->rt_ratio; 8118 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
7898} 8119}
8120#endif
7899 8121
7900static struct cftype cpu_files[] = { 8122static struct cftype cpu_files[] = {
8123#ifdef CONFIG_FAIR_GROUP_SCHED
7901 { 8124 {
7902 .name = "shares", 8125 .name = "shares",
7903 .read_uint = cpu_shares_read_uint, 8126 .read_uint = cpu_shares_read_uint,
7904 .write_uint = cpu_shares_write_uint, 8127 .write_uint = cpu_shares_write_uint,
7905 }, 8128 },
8129#endif
8130#ifdef CONFIG_RT_GROUP_SCHED
7906 { 8131 {
7907 .name = "rt_ratio", 8132 .name = "rt_runtime_us",
7908 .read_uint = cpu_rt_ratio_read_uint, 8133 .read = cpu_rt_runtime_read,
7909 .write_uint = cpu_rt_ratio_write_uint, 8134 .write = cpu_rt_runtime_write,
7910 }, 8135 },
8136#endif
7911}; 8137};
7912 8138
7913static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8139static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
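With CONFIG_RT_GROUP_SCHED the cpu controller now exposes rt_runtime_us in place of rt_ratio, and the write handler parses the value with simple_strtoll so that -1 (unlimited) can be written. Assuming the cpu cgroup is mounted at /dev/cgroup and a child group already exists (both are assumptions for the example, not something this patch provides), a user-space caller could set the budget roughly like this:

/* Rough user-space sketch: set an RT runtime budget for one cgroup.
 * The mount point and group name are assumed for illustration. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/dev/cgroup/rtgroup/cpu.rt_runtime_us";
	const char *val  = "500000\n";	/* 500ms of RT time per period */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}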
@@ -7926,7 +8152,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7926 .early_init = 1, 8152 .early_init = 1,
7927}; 8153};
7928 8154
7929#endif /* CONFIG_FAIR_CGROUP_SCHED */ 8155#endif /* CONFIG_CGROUP_SCHED */
7930 8156
7931#ifdef CONFIG_CGROUP_CPUACCT 8157#ifdef CONFIG_CGROUP_CPUACCT
7932 8158
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 274b40d7bef2..f54792b175b2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
55 return !list_empty(&rt_se->run_list); 55 return !list_empty(&rt_se->run_list);
56} 56}
57 57
58#ifdef CONFIG_FAIR_GROUP_SCHED 58#ifdef CONFIG_RT_GROUP_SCHED
59 59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) 60static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
61{ 61{
62 if (!rt_rq->tg) 62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC; 63 return RUNTIME_INF;
64 64
65 return rt_rq->tg->rt_ratio; 65 return rt_rq->tg->rt_runtime;
66} 66}
67 67
68#define for_each_leaf_rt_rq(rt_rq, rq) \ 68#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91 91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) 92static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
93{ 93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se; 94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95 95
@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
102 } 102 }
103} 103}
104 104
105static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) 105static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
106{ 106{
107 struct sched_rt_entity *rt_se = rt_rq->rt_se; 107 struct sched_rt_entity *rt_se = rt_rq->rt_se;
108 108
@@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
110 dequeue_rt_entity(rt_se); 110 dequeue_rt_entity(rt_se);
111} 111}
112 112
113static inline int rt_rq_throttled(struct rt_rq *rt_rq)
114{
115 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
116}
117
118static int rt_se_boosted(struct sched_rt_entity *rt_se)
119{
120 struct rt_rq *rt_rq = group_rt_rq(rt_se);
121 struct task_struct *p;
122
123 if (rt_rq)
124 return !!rt_rq->rt_nr_boosted;
125
126 p = rt_task_of(rt_se);
127 return p->prio != p->normal_prio;
128}
129
113#else 130#else
114 131
115static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) 132static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
116{ 133{
117 return sysctl_sched_rt_ratio; 134 if (sysctl_sched_rt_runtime == -1)
135 return RUNTIME_INF;
136
137 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
118} 138}
119 139
120#define for_each_leaf_rt_rq(rt_rq, rq) \ 140#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
141 return NULL; 161 return NULL;
142} 162}
143 163
144static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) 164static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
145{ 165{
146} 166}
147 167
148static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) 168static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
149{ 169{
150} 170}
151 171
172static inline int rt_rq_throttled(struct rt_rq *rt_rq)
173{
174 return rt_rq->rt_throttled;
175}
152#endif 176#endif
153 177
154static inline int rt_se_prio(struct sched_rt_entity *rt_se) 178static inline int rt_se_prio(struct sched_rt_entity *rt_se)
155{ 179{
156#ifdef CONFIG_FAIR_GROUP_SCHED 180#ifdef CONFIG_RT_GROUP_SCHED
157 struct rt_rq *rt_rq = group_rt_rq(rt_se); 181 struct rt_rq *rt_rq = group_rt_rq(rt_se);
158 182
159 if (rt_rq) 183 if (rt_rq)
@@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
163 return rt_task_of(rt_se)->prio; 187 return rt_task_of(rt_se)->prio;
164} 188}
165 189
166static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) 190static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
167{ 191{
168 unsigned int rt_ratio = sched_rt_ratio(rt_rq); 192 u64 runtime = sched_rt_runtime(rt_rq);
169 u64 period, ratio;
170 193
171 if (rt_ratio == SCHED_RT_FRAC) 194 if (runtime == RUNTIME_INF)
172 return 0; 195 return 0;
173 196
174 if (rt_rq->rt_throttled) 197 if (rt_rq->rt_throttled)
175 return 1; 198 return rt_rq_throttled(rt_rq);
176
177 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
178 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
179 199
180 if (rt_rq->rt_time > ratio) { 200 if (rt_rq->rt_time > runtime) {
181 struct rq *rq = rq_of_rt_rq(rt_rq); 201 struct rq *rq = rq_of_rt_rq(rt_rq);
182 202
183 rq->rt_throttled = 1; 203 rq->rt_throttled = 1;
184 rt_rq->rt_throttled = 1; 204 rt_rq->rt_throttled = 1;
185 205
186 sched_rt_ratio_dequeue(rt_rq); 206 if (rt_rq_throttled(rt_rq)) {
187 return 1; 207 sched_rt_rq_dequeue(rt_rq);
208 return 1;
209 }
188 } 210 }
189 211
190 return 0; 212 return 0;
@@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq)
196 u64 period; 218 u64 period;
197 219
198 while (rq->clock > rq->rt_period_expire) { 220 while (rq->clock > rq->rt_period_expire) {
199 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; 221 period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
200 rq->rt_period_expire += period; 222 rq->rt_period_expire += period;
201 223
202 for_each_leaf_rt_rq(rt_rq, rq) { 224 for_each_leaf_rt_rq(rt_rq, rq) {
203 unsigned long rt_ratio = sched_rt_ratio(rt_rq); 225 u64 runtime = sched_rt_runtime(rt_rq);
204 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
205 226
206 rt_rq->rt_time -= min(rt_rq->rt_time, ratio); 227 rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
207 if (rt_rq->rt_throttled) { 228 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
208 rt_rq->rt_throttled = 0; 229 rt_rq->rt_throttled = 0;
209 sched_rt_ratio_enqueue(rt_rq); 230 sched_rt_rq_enqueue(rt_rq);
210 } 231 }
211 } 232 }
212 233
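Taken together, sched_rt_runtime_exceeded() and update_sched_rt_period() implement a simple budget: rt_time accumulates while RT tasks run, the queue is throttled once it passes the runtime, and at each period boundary the accumulated time is paid back and the throttle lifted once the balance drops below the budget again. A toy user-space model of that bookkeeping (all numbers invented) is:

/* Toy model of the RT throttling bookkeeping: accumulate rt_time,
 * throttle past the runtime budget, replenish once per period. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t runtime = 950;	/* budget per period */
	uint64_t rt_time = 0;
	int throttled = 0;

	for (int tick = 1; tick <= 12; tick++) {
		if (!throttled)
			rt_time += 200;		/* RT task ran for 200 units */

		if (!throttled && rt_time > runtime) {
			throttled = 1;
			printf("tick %2d: rt_time=%4llu -> throttled\n",
			       tick, (unsigned long long)rt_time);
		}

		if (tick % 5 == 0) {		/* period boundary */
			rt_time -= rt_time < runtime ? rt_time : runtime;
			if (throttled && rt_time < runtime) {
				throttled = 0;
				printf("tick %2d: replenished, rt_time=%llu -> unthrottled\n",
				       tick, (unsigned long long)rt_time);
			}
		}
	}
	return 0;
}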
@@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq)
239 cpuacct_charge(curr, delta_exec); 260 cpuacct_charge(curr, delta_exec);
240 261
241 rt_rq->rt_time += delta_exec; 262 rt_rq->rt_time += delta_exec;
242 /* 263 if (sched_rt_runtime_exceeded(rt_rq))
243 * might make it a tad more accurate:
244 *
245 * update_sched_rt_period(rq);
246 */
247 if (sched_rt_ratio_exceeded(rt_rq))
248 resched_task(curr); 264 resched_task(curr);
249} 265}
250 266
@@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
253{ 269{
254 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 270 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
255 rt_rq->rt_nr_running++; 271 rt_rq->rt_nr_running++;
256#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 272#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
257 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 273 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
258 rt_rq->highest_prio = rt_se_prio(rt_se); 274 rt_rq->highest_prio = rt_se_prio(rt_se);
259#endif 275#endif
@@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
265 281
266 update_rt_migration(rq_of_rt_rq(rt_rq)); 282 update_rt_migration(rq_of_rt_rq(rt_rq));
267#endif 283#endif
284#ifdef CONFIG_RT_GROUP_SCHED
285 if (rt_se_boosted(rt_se))
286 rt_rq->rt_nr_boosted++;
287#endif
268} 288}
269 289
270static inline 290static inline
@@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
273 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 293 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
274 WARN_ON(!rt_rq->rt_nr_running); 294 WARN_ON(!rt_rq->rt_nr_running);
275 rt_rq->rt_nr_running--; 295 rt_rq->rt_nr_running--;
276#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 296#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
277 if (rt_rq->rt_nr_running) { 297 if (rt_rq->rt_nr_running) {
278 struct rt_prio_array *array; 298 struct rt_prio_array *array;
279 299
@@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
295 315
296 update_rt_migration(rq_of_rt_rq(rt_rq)); 316 update_rt_migration(rq_of_rt_rq(rt_rq));
297#endif /* CONFIG_SMP */ 317#endif /* CONFIG_SMP */
318#ifdef CONFIG_RT_GROUP_SCHED
319 if (rt_se_boosted(rt_se))
320 rt_rq->rt_nr_boosted--;
321
322 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
323#endif
298} 324}
299 325
300static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 326static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
303 struct rt_prio_array *array = &rt_rq->active; 329 struct rt_prio_array *array = &rt_rq->active;
304 struct rt_rq *group_rq = group_rt_rq(rt_se); 330 struct rt_rq *group_rq = group_rt_rq(rt_se);
305 331
306 if (group_rq && group_rq->rt_throttled) 332 if (group_rq && rt_rq_throttled(group_rq))
307 return; 333 return;
308 334
309 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 335 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -496,7 +522,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
496 if (unlikely(!rt_rq->rt_nr_running)) 522 if (unlikely(!rt_rq->rt_nr_running))
497 return NULL; 523 return NULL;
498 524
499 if (sched_rt_ratio_exceeded(rt_rq)) 525 if (rt_rq_throttled(rt_rq))
500 return NULL; 526 return NULL;
501 527
502 do { 528 do {
diff --git a/kernel/signal.c b/kernel/signal.c
index 4333b6dbb424..84917fe507f7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -911,27 +911,6 @@ __group_complete_signal(int sig, struct task_struct *p)
911 } while_each_thread(p, t); 911 } while_each_thread(p, t);
912 return; 912 return;
913 } 913 }
914
915 /*
916 * There will be a core dump. We make all threads other
917 * than the chosen one go into a group stop so that nothing
918 * happens until it gets scheduled, takes the signal off
919 * the shared queue, and does the core dump. This is a
920 * little more complicated than strictly necessary, but it
921 * keeps the signal state that winds up in the core dump
922 * unchanged from the death state, e.g. which thread had
923 * the core-dump signal unblocked.
924 */
925 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
926 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
927 p->signal->group_stop_count = 0;
928 p->signal->group_exit_task = t;
929 p = t;
930 do {
931 p->signal->group_stop_count++;
932 signal_wake_up(t, t == p);
933 } while_each_thread(p, t);
934 return;
935 } 914 }
936 915
937 /* 916 /*
@@ -978,7 +957,6 @@ void zap_other_threads(struct task_struct *p)
978{ 957{
979 struct task_struct *t; 958 struct task_struct *t;
980 959
981 p->signal->flags = SIGNAL_GROUP_EXIT;
982 p->signal->group_stop_count = 0; 960 p->signal->group_stop_count = 0;
983 961
984 for (t = next_thread(p); t != p; t = next_thread(t)) { 962 for (t = next_thread(p); t != p; t = next_thread(t)) {
@@ -994,7 +972,7 @@ void zap_other_threads(struct task_struct *p)
994 } 972 }
995} 973}
996 974
997int fastcall __fatal_signal_pending(struct task_struct *tsk) 975int __fatal_signal_pending(struct task_struct *tsk)
998{ 976{
999 return sigismember(&tsk->pending.signal, SIGKILL); 977 return sigismember(&tsk->pending.signal, SIGKILL);
1000} 978}
@@ -1040,7 +1018,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1040} 1018}
1041 1019
1042/* 1020/*
1043 * kill_pgrp_info() sends a signal to a process group: this is what the tty 1021 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
1044 * control characters do (^C, ^Z etc) 1022 * control characters do (^C, ^Z etc)
1045 */ 1023 */
1046 1024
@@ -1059,30 +1037,28 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1059 return success ? 0 : retval; 1037 return success ? 0 : retval;
1060} 1038}
1061 1039
1062int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1063{
1064 int retval;
1065
1066 read_lock(&tasklist_lock);
1067 retval = __kill_pgrp_info(sig, info, pgrp);
1068 read_unlock(&tasklist_lock);
1069
1070 return retval;
1071}
1072
1073int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) 1040int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1074{ 1041{
1075 int error; 1042 int error = -ESRCH;
1076 struct task_struct *p; 1043 struct task_struct *p;
1077 1044
1078 rcu_read_lock(); 1045 rcu_read_lock();
1079 if (unlikely(sig_needs_tasklist(sig))) 1046 if (unlikely(sig_needs_tasklist(sig)))
1080 read_lock(&tasklist_lock); 1047 read_lock(&tasklist_lock);
1081 1048
1049retry:
1082 p = pid_task(pid, PIDTYPE_PID); 1050 p = pid_task(pid, PIDTYPE_PID);
1083 error = -ESRCH; 1051 if (p) {
1084 if (p)
1085 error = group_send_sig_info(sig, info, p); 1052 error = group_send_sig_info(sig, info, p);
1053 if (unlikely(error == -ESRCH))
1054 /*
1055 * The task was unhashed in between, try again.
1056 * If it is dead, pid_task() will return NULL,
1057 * if we race with de_thread() it will find the
1058 * new leader.
1059 */
1060 goto retry;
1061 }
1086 1062
1087 if (unlikely(sig_needs_tasklist(sig))) 1063 if (unlikely(sig_needs_tasklist(sig)))
1088 read_unlock(&tasklist_lock); 1064 read_unlock(&tasklist_lock);
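kill_pid_info() above now retries the pid lookup when group_send_sig_info() returns -ESRCH, because the target can be unhashed by de_thread() between the lookup and the send. The loop below is a generic user-space rendering of that look-up, act, retry-if-it-vanished idiom; the table, the simulated race and the helper names are all invented for illustration.

/* Generic "look up, act, retry if the object went away" loop in the
 * shape of the reworked kill_pid_info(). Everything here is invented. */
#include <errno.h>
#include <stdio.h>

struct obj { const char *name; int alive; };

static struct obj table[] = { { "old-leader", 0 }, { "new-leader", 1 } };
static int lookups;

/* Pretend the first lookup races with the object being replaced. */
static struct obj *lookup(const char *name)
{
	(void)name;
	return &table[lookups++ ? 1 : 0];
}

static int act(struct obj *o)
{
	return o->alive ? 0 : -ESRCH;	/* -ESRCH: it went away under us */
}

int main(void)
{
	int err = -ESRCH;
	struct obj *o;

retry:
	o = lookup("leader");
	if (o) {
		err = act(o);
		if (err == -ESRCH)	/* raced with replacement: try again */
			goto retry;
	}
	printf("result: %d after %d lookup(s)\n", err, lookups);
	return 0;
}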
@@ -1147,14 +1123,22 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1147static int kill_something_info(int sig, struct siginfo *info, int pid) 1123static int kill_something_info(int sig, struct siginfo *info, int pid)
1148{ 1124{
1149 int ret; 1125 int ret;
1150 rcu_read_lock(); 1126
1151 if (!pid) { 1127 if (pid > 0) {
1152 ret = kill_pgrp_info(sig, info, task_pgrp(current)); 1128 rcu_read_lock();
1153 } else if (pid == -1) { 1129 ret = kill_pid_info(sig, info, find_vpid(pid));
1130 rcu_read_unlock();
1131 return ret;
1132 }
1133
1134 read_lock(&tasklist_lock);
1135 if (pid != -1) {
1136 ret = __kill_pgrp_info(sig, info,
1137 pid ? find_vpid(-pid) : task_pgrp(current));
1138 } else {
1154 int retval = 0, count = 0; 1139 int retval = 0, count = 0;
1155 struct task_struct * p; 1140 struct task_struct * p;
1156 1141
1157 read_lock(&tasklist_lock);
1158 for_each_process(p) { 1142 for_each_process(p) {
1159 if (p->pid > 1 && !same_thread_group(p, current)) { 1143 if (p->pid > 1 && !same_thread_group(p, current)) {
1160 int err = group_send_sig_info(sig, info, p); 1144 int err = group_send_sig_info(sig, info, p);
@@ -1163,14 +1147,10 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1163 retval = err; 1147 retval = err;
1164 } 1148 }
1165 } 1149 }
1166 read_unlock(&tasklist_lock);
1167 ret = count ? retval : -ESRCH; 1150 ret = count ? retval : -ESRCH;
1168 } else if (pid < 0) {
1169 ret = kill_pgrp_info(sig, info, find_vpid(-pid));
1170 } else {
1171 ret = kill_pid_info(sig, info, find_vpid(pid));
1172 } 1151 }
1173 rcu_read_unlock(); 1152 read_unlock(&tasklist_lock);
1153
1174 return ret; 1154 return ret;
1175} 1155}
1176 1156
@@ -1218,20 +1198,6 @@ send_sig(int sig, struct task_struct *p, int priv)
1218 return send_sig_info(sig, __si_special(priv), p); 1198 return send_sig_info(sig, __si_special(priv), p);
1219} 1199}
1220 1200
1221/*
1222 * This is the entry point for "process-wide" signals.
1223 * They will go to an appropriate thread in the thread group.
1224 */
1225int
1226send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1227{
1228 int ret;
1229 read_lock(&tasklist_lock);
1230 ret = group_send_sig_info(sig, info, p);
1231 read_unlock(&tasklist_lock);
1232 return ret;
1233}
1234
1235void 1201void
1236force_sig(int sig, struct task_struct *p) 1202force_sig(int sig, struct task_struct *p)
1237{ 1203{
@@ -1259,7 +1225,13 @@ force_sigsegv(int sig, struct task_struct *p)
1259 1225
1260int kill_pgrp(struct pid *pid, int sig, int priv) 1226int kill_pgrp(struct pid *pid, int sig, int priv)
1261{ 1227{
1262 return kill_pgrp_info(sig, __si_special(priv), pid); 1228 int ret;
1229
1230 read_lock(&tasklist_lock);
1231 ret = __kill_pgrp_info(sig, __si_special(priv), pid);
1232 read_unlock(&tasklist_lock);
1233
1234 return ret;
1263} 1235}
1264EXPORT_SYMBOL(kill_pgrp); 1236EXPORT_SYMBOL(kill_pgrp);
1265 1237
@@ -1578,11 +1550,6 @@ static inline int may_ptrace_stop(void)
1578{ 1550{
1579 if (!likely(current->ptrace & PT_PTRACED)) 1551 if (!likely(current->ptrace & PT_PTRACED))
1580 return 0; 1552 return 0;
1581
1582 if (unlikely(current->parent == current->real_parent &&
1583 (current->ptrace & PT_ATTACHED)))
1584 return 0;
1585
1586 /* 1553 /*
1587 * Are we in the middle of do_coredump? 1554 * Are we in the middle of do_coredump?
1588 * If so and our tracer is also part of the coredump stopping 1555 * If so and our tracer is also part of the coredump stopping
@@ -1600,6 +1567,17 @@ static inline int may_ptrace_stop(void)
1600} 1567}
1601 1568
1602/* 1569/*
1570 * Return nonzero if there is a SIGKILL that should be waking us up.
1571 * Called with the siglock held.
1572 */
1573static int sigkill_pending(struct task_struct *tsk)
1574{
1575 return ((sigismember(&tsk->pending.signal, SIGKILL) ||
1576 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
1577 !unlikely(sigismember(&tsk->blocked, SIGKILL)));
1578}
1579
1580/*
1603 * This must be called with current->sighand->siglock held. 1581 * This must be called with current->sighand->siglock held.
1604 * 1582 *
1605 * This should be the path for all ptrace stops. 1583 * This should be the path for all ptrace stops.
@@ -1607,11 +1585,31 @@ static inline int may_ptrace_stop(void)
1607 * That makes it a way to test a stopped process for 1585 * That makes it a way to test a stopped process for
1608 * being ptrace-stopped vs being job-control-stopped. 1586 * being ptrace-stopped vs being job-control-stopped.
1609 * 1587 *
1610 * If we actually decide not to stop at all because the tracer is gone, 1588 * If we actually decide not to stop at all because the tracer
1611 * we leave nostop_code in current->exit_code. 1589 * is gone, we keep current->exit_code unless clear_code.
1612 */ 1590 */
1613static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) 1591static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1614{ 1592{
1593 int killed = 0;
1594
1595 if (arch_ptrace_stop_needed(exit_code, info)) {
1596 /*
1597 * The arch code has something special to do before a
1598 * ptrace stop. This is allowed to block, e.g. for faults
1599 * on user stack pages. We can't keep the siglock while
1600 * calling arch_ptrace_stop, so we must release it now.
1601 * To preserve proper semantics, we must do this before
1602 * any signal bookkeeping like checking group_stop_count.
1603 * Meanwhile, a SIGKILL could come in before we retake the
1604 * siglock. That must prevent us from sleeping in TASK_TRACED.
1605 * So after regaining the lock, we must check for SIGKILL.
1606 */
1607 spin_unlock_irq(&current->sighand->siglock);
1608 arch_ptrace_stop(exit_code, info);
1609 spin_lock_irq(&current->sighand->siglock);
1610 killed = sigkill_pending(current);
1611 }
1612
1615 /* 1613 /*
1616 * If there is a group stop in progress, 1614 * If there is a group stop in progress,
1617 * we must participate in the bookkeeping. 1615 * we must participate in the bookkeeping.
@@ -1623,22 +1621,23 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1623 current->exit_code = exit_code; 1621 current->exit_code = exit_code;
1624 1622
1625 /* Let the debugger run. */ 1623 /* Let the debugger run. */
1626 set_current_state(TASK_TRACED); 1624 __set_current_state(TASK_TRACED);
1627 spin_unlock_irq(&current->sighand->siglock); 1625 spin_unlock_irq(&current->sighand->siglock);
1628 try_to_freeze(); 1626 try_to_freeze();
1629 read_lock(&tasklist_lock); 1627 read_lock(&tasklist_lock);
1630 if (may_ptrace_stop()) { 1628 if (!unlikely(killed) && may_ptrace_stop()) {
1631 do_notify_parent_cldstop(current, CLD_TRAPPED); 1629 do_notify_parent_cldstop(current, CLD_TRAPPED);
1632 read_unlock(&tasklist_lock); 1630 read_unlock(&tasklist_lock);
1633 schedule(); 1631 schedule();
1634 } else { 1632 } else {
1635 /* 1633 /*
1636 * By the time we got the lock, our tracer went away. 1634 * By the time we got the lock, our tracer went away.
1637 * Don't stop here. 1635 * Don't drop the lock yet, another tracer may come.
1638 */ 1636 */
1637 __set_current_state(TASK_RUNNING);
1638 if (clear_code)
1639 current->exit_code = 0;
1639 read_unlock(&tasklist_lock); 1640 read_unlock(&tasklist_lock);
1640 set_current_state(TASK_RUNNING);
1641 current->exit_code = nostop_code;
1642 } 1641 }
1643 1642
1644 /* 1643 /*
@@ -1671,7 +1670,7 @@ void ptrace_notify(int exit_code)
1671 1670
1672 /* Let the debugger run. */ 1671 /* Let the debugger run. */
1673 spin_lock_irq(&current->sighand->siglock); 1672 spin_lock_irq(&current->sighand->siglock);
1674 ptrace_stop(exit_code, 0, &info); 1673 ptrace_stop(exit_code, 1, &info);
1675 spin_unlock_irq(&current->sighand->siglock); 1674 spin_unlock_irq(&current->sighand->siglock);
1676} 1675}
1677 1676
@@ -1709,9 +1708,6 @@ static int do_signal_stop(int signr)
1709 struct signal_struct *sig = current->signal; 1708 struct signal_struct *sig = current->signal;
1710 int stop_count; 1709 int stop_count;
1711 1710
1712 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
1713 return 0;
1714
1715 if (sig->group_stop_count > 0) { 1711 if (sig->group_stop_count > 0) {
1716 /* 1712 /*
1717 * There is a group stop in progress. We don't need to 1713 * There is a group stop in progress. We don't need to
@@ -1719,12 +1715,15 @@ static int do_signal_stop(int signr)
1719 */ 1715 */
1720 stop_count = --sig->group_stop_count; 1716 stop_count = --sig->group_stop_count;
1721 } else { 1717 } else {
1718 struct task_struct *t;
1719
1720 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
1721 unlikely(sig->group_exit_task))
1722 return 0;
1722 /* 1723 /*
1723 * There is no group stop already in progress. 1724 * There is no group stop already in progress.
1724 * We must initiate one now. 1725 * We must initiate one now.
1725 */ 1726 */
1726 struct task_struct *t;
1727
1728 sig->group_exit_code = signr; 1727 sig->group_exit_code = signr;
1729 1728
1730 stop_count = 0; 1729 stop_count = 0;
@@ -1734,7 +1733,7 @@ static int do_signal_stop(int signr)
1734 * stop is always done with the siglock held, 1733 * stop is always done with the siglock held,
1735 * so this check has no races. 1734 * so this check has no races.
1736 */ 1735 */
1737 if (!t->exit_state && 1736 if (!(t->flags & PF_EXITING) &&
1738 !task_is_stopped_or_traced(t)) { 1737 !task_is_stopped_or_traced(t)) {
1739 stop_count++; 1738 stop_count++;
1740 signal_wake_up(t, 0); 1739 signal_wake_up(t, 0);
@@ -1752,47 +1751,6 @@ static int do_signal_stop(int signr)
1752 return 1; 1751 return 1;
1753} 1752}
1754 1753
1755/*
1756 * Do appropriate magic when group_stop_count > 0.
1757 * We return nonzero if we stopped, after releasing the siglock.
1758 * We return zero if we still hold the siglock and should look
1759 * for another signal without checking group_stop_count again.
1760 */
1761static int handle_group_stop(void)
1762{
1763 int stop_count;
1764
1765 if (current->signal->group_exit_task == current) {
1766 /*
1767 * Group stop is so we can do a core dump,
1768 * We are the initiating thread, so get on with it.
1769 */
1770 current->signal->group_exit_task = NULL;
1771 return 0;
1772 }
1773
1774 if (current->signal->flags & SIGNAL_GROUP_EXIT)
1775 /*
1776 * Group stop is so another thread can do a core dump,
1777 * or else we are racing against a death signal.
1778 * Just punt the stop so we can get the next signal.
1779 */
1780 return 0;
1781
1782 /*
1783 * There is a group stop in progress. We stop
1784 * without any associated signal being in our queue.
1785 */
1786 stop_count = --current->signal->group_stop_count;
1787 if (stop_count == 0)
1788 current->signal->flags = SIGNAL_STOP_STOPPED;
1789 current->exit_code = current->signal->group_exit_code;
1790 set_current_state(TASK_STOPPED);
1791 spin_unlock_irq(&current->sighand->siglock);
1792 finish_stop(stop_count);
1793 return 1;
1794}
1795
1796int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 1754int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1797 struct pt_regs *regs, void *cookie) 1755 struct pt_regs *regs, void *cookie)
1798{ 1756{
@@ -1807,7 +1765,7 @@ relock:
1807 struct k_sigaction *ka; 1765 struct k_sigaction *ka;
1808 1766
1809 if (unlikely(current->signal->group_stop_count > 0) && 1767 if (unlikely(current->signal->group_stop_count > 0) &&
1810 handle_group_stop()) 1768 do_signal_stop(0))
1811 goto relock; 1769 goto relock;
1812 1770
1813 signr = dequeue_signal(current, mask, info); 1771 signr = dequeue_signal(current, mask, info);
@@ -1819,7 +1777,7 @@ relock:
1819 ptrace_signal_deliver(regs, cookie); 1777 ptrace_signal_deliver(regs, cookie);
1820 1778
1821 /* Let the debugger run. */ 1779 /* Let the debugger run. */
1822 ptrace_stop(signr, signr, info); 1780 ptrace_stop(signr, 0, info);
1823 1781
1824 /* We're back. Did the debugger cancel the sig? */ 1782 /* We're back. Did the debugger cancel the sig? */
1825 signr = current->exit_code; 1783 signr = current->exit_code;
@@ -1936,6 +1894,48 @@ relock:
1936 return signr; 1894 return signr;
1937} 1895}
1938 1896
1897void exit_signals(struct task_struct *tsk)
1898{
1899 int group_stop = 0;
1900 struct task_struct *t;
1901
1902 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
1903 tsk->flags |= PF_EXITING;
1904 return;
1905 }
1906
1907 spin_lock_irq(&tsk->sighand->siglock);
1908 /*
1909 * From now this task is not visible for group-wide signals,
1910 * see wants_signal(), do_signal_stop().
1911 */
1912 tsk->flags |= PF_EXITING;
1913 if (!signal_pending(tsk))
1914 goto out;
1915
1916 /* It could be that __group_complete_signal() choose us to
1917 * notify about group-wide signal. Another thread should be
1918 * woken now to take the signal since we will not.
1919 */
1920 for (t = tsk; (t = next_thread(t)) != tsk; )
1921 if (!signal_pending(t) && !(t->flags & PF_EXITING))
1922 recalc_sigpending_and_wake(t);
1923
1924 if (unlikely(tsk->signal->group_stop_count) &&
1925 !--tsk->signal->group_stop_count) {
1926 tsk->signal->flags = SIGNAL_STOP_STOPPED;
1927 group_stop = 1;
1928 }
1929out:
1930 spin_unlock_irq(&tsk->sighand->siglock);
1931
1932 if (unlikely(group_stop)) {
1933 read_lock(&tasklist_lock);
1934 do_notify_parent_cldstop(tsk, CLD_STOPPED);
1935 read_unlock(&tasklist_lock);
1936 }
1937}
1938
1939EXPORT_SYMBOL(recalc_sigpending); 1939EXPORT_SYMBOL(recalc_sigpending);
1940EXPORT_SYMBOL_GPL(dequeue_signal); 1940EXPORT_SYMBOL_GPL(dequeue_signal);
1941EXPORT_SYMBOL(flush_signals); 1941EXPORT_SYMBOL(flush_signals);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7837d45419e..5b3aea5f471e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -320,7 +320,7 @@ void irq_exit(void)
320/* 320/*
321 * This function must run with irqs disabled! 321 * This function must run with irqs disabled!
322 */ 322 */
323inline fastcall void raise_softirq_irqoff(unsigned int nr) 323inline void raise_softirq_irqoff(unsigned int nr)
324{ 324{
325 __raise_softirq_irqoff(nr); 325 __raise_softirq_irqoff(nr);
326 326
@@ -337,7 +337,7 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr)
337 wakeup_softirqd(); 337 wakeup_softirqd();
338} 338}
339 339
340void fastcall raise_softirq(unsigned int nr) 340void raise_softirq(unsigned int nr)
341{ 341{
342 unsigned long flags; 342 unsigned long flags;
343 343
@@ -363,7 +363,7 @@ struct tasklet_head
363static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; 363static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
364static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; 364static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
365 365
366void fastcall __tasklet_schedule(struct tasklet_struct *t) 366void __tasklet_schedule(struct tasklet_struct *t)
367{ 367{
368 unsigned long flags; 368 unsigned long flags;
369 369
@@ -376,7 +376,7 @@ void fastcall __tasklet_schedule(struct tasklet_struct *t)
376 376
377EXPORT_SYMBOL(__tasklet_schedule); 377EXPORT_SYMBOL(__tasklet_schedule);
378 378
379void fastcall __tasklet_hi_schedule(struct tasklet_struct *t) 379void __tasklet_hi_schedule(struct tasklet_struct *t)
380{ 380{
381 unsigned long flags; 381 unsigned long flags;
382 382
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 3507cabe963b..b0aeeaf22ce4 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -74,7 +74,7 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
74 * severe errors when invoked on an active srcu_struct. That said, it 74 * severe errors when invoked on an active srcu_struct. That said, it
75 * can be useful as an error check at cleanup time. 75 * can be useful as an error check at cleanup time.
76 */ 76 */
77int srcu_readers_active(struct srcu_struct *sp) 77static int srcu_readers_active(struct srcu_struct *sp)
78{ 78{
79 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); 79 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
80} 80}
@@ -255,4 +255,3 @@ EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock); 255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu); 256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 257EXPORT_SYMBOL_GPL(srcu_batches_completed);
258EXPORT_SYMBOL_GPL(srcu_readers_active);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 51b5ee53571a..6f4e0e13f70c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -29,7 +29,6 @@ enum stopmachine_state {
29static enum stopmachine_state stopmachine_state; 29static enum stopmachine_state stopmachine_state;
30static unsigned int stopmachine_num_threads; 30static unsigned int stopmachine_num_threads;
31static atomic_t stopmachine_thread_ack; 31static atomic_t stopmachine_thread_ack;
32static DECLARE_MUTEX(stopmachine_mutex);
33 32
34static int stopmachine(void *cpu) 33static int stopmachine(void *cpu)
35{ 34{
@@ -170,6 +169,7 @@ static int do_stop(void *_smdata)
170struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, 169struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
171 unsigned int cpu) 170 unsigned int cpu)
172{ 171{
172 static DEFINE_MUTEX(stopmachine_mutex);
173 struct stop_machine_data smdata; 173 struct stop_machine_data smdata;
174 struct task_struct *p; 174 struct task_struct *p;
175 175
@@ -177,7 +177,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
177 smdata.data = data; 177 smdata.data = data;
178 init_completion(&smdata.done); 178 init_completion(&smdata.done);
179 179
180 down(&stopmachine_mutex); 180 mutex_lock(&stopmachine_mutex);
181 181
182 /* If they don't care which CPU fn runs on, bind to any online one. */ 182 /* If they don't care which CPU fn runs on, bind to any online one. */
183 if (cpu == NR_CPUS) 183 if (cpu == NR_CPUS)
@@ -193,7 +193,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
193 wake_up_process(p); 193 wake_up_process(p);
194 wait_for_completion(&smdata.done); 194 wait_for_completion(&smdata.done);
195 } 195 }
196 up(&stopmachine_mutex); 196 mutex_unlock(&stopmachine_mutex);
197 return p; 197 return p;
198} 198}
199 199
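__stop_machine_run() drops the file-scope DECLARE_MUTEX() semaphore in favour of a function-local static DEFINE_MUTEX() taken with mutex_lock()/mutex_unlock(). The same shape in portable user-space code, with a pthread mutex standing in for the kernel mutex, would be roughly:

/* User-space analogue of the stop_machine change: the lock lives as a
 * function-local static and is a real mutex, not a counting semaphore. */
#include <pthread.h>
#include <stdio.h>

static int run_serialized(int (*fn)(void *), void *data)
{
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	int ret;

	pthread_mutex_lock(&lock);	/* was down(&stopmachine_mutex) */
	ret = fn(data);
	pthread_mutex_unlock(&lock);	/* was up(&stopmachine_mutex) */
	return ret;
}

static int hello(void *data)
{
	printf("running %s with the lock held\n", (const char *)data);
	return 0;
}

int main(void)
{
	return run_serialized(hello, "example");
}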
diff --git a/kernel/sys.c b/kernel/sys.c
index d1fe71eb4546..a626116af5db 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -315,7 +315,7 @@ static void kernel_kexec(void)
315#endif 315#endif
316} 316}
317 317
318void kernel_shutdown_prepare(enum system_states state) 318static void kernel_shutdown_prepare(enum system_states state)
319{ 319{
320 blocking_notifier_call_chain(&reboot_notifier_list, 320 blocking_notifier_call_chain(&reboot_notifier_list,
321 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 321 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
@@ -916,8 +916,8 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
916{ 916{
917 struct task_struct *p; 917 struct task_struct *p;
918 struct task_struct *group_leader = current->group_leader; 918 struct task_struct *group_leader = current->group_leader;
919 int err = -EINVAL; 919 struct pid *pgrp;
920 struct pid_namespace *ns; 920 int err;
921 921
922 if (!pid) 922 if (!pid)
923 pid = task_pid_vnr(group_leader); 923 pid = task_pid_vnr(group_leader);
@@ -929,12 +929,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
929 /* From this point forward we keep holding onto the tasklist lock 929 /* From this point forward we keep holding onto the tasklist lock
930 * so that our parent does not change from under us. -DaveM 930 * so that our parent does not change from under us. -DaveM
931 */ 931 */
932 ns = current->nsproxy->pid_ns;
933
934 write_lock_irq(&tasklist_lock); 932 write_lock_irq(&tasklist_lock);
935 933
936 err = -ESRCH; 934 err = -ESRCH;
937 p = find_task_by_pid_ns(pid, ns); 935 p = find_task_by_vpid(pid);
938 if (!p) 936 if (!p)
939 goto out; 937 goto out;
940 938
@@ -942,7 +940,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
942 if (!thread_group_leader(p)) 940 if (!thread_group_leader(p))
943 goto out; 941 goto out;
944 942
945 if (p->real_parent->tgid == group_leader->tgid) { 943 if (same_thread_group(p->real_parent, group_leader)) {
946 err = -EPERM; 944 err = -EPERM;
947 if (task_session(p) != task_session(group_leader)) 945 if (task_session(p) != task_session(group_leader))
948 goto out; 946 goto out;
@@ -959,10 +957,12 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
959 if (p->signal->leader) 957 if (p->signal->leader)
960 goto out; 958 goto out;
961 959
960 pgrp = task_pid(p);
962 if (pgid != pid) { 961 if (pgid != pid) {
963 struct task_struct *g; 962 struct task_struct *g;
964 963
965 g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns); 964 pgrp = find_vpid(pgid);
965 g = pid_task(pgrp, PIDTYPE_PGID);
966 if (!g || task_session(g) != task_session(group_leader)) 966 if (!g || task_session(g) != task_session(group_leader))
967 goto out; 967 goto out;
968 } 968 }
@@ -971,13 +971,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
971 if (err) 971 if (err)
972 goto out; 972 goto out;
973 973
974 if (task_pgrp_nr_ns(p, ns) != pgid) { 974 if (task_pgrp(p) != pgrp) {
975 struct pid *pid;
976
977 detach_pid(p, PIDTYPE_PGID); 975 detach_pid(p, PIDTYPE_PGID);
978 pid = find_vpid(pgid); 976 attach_pid(p, PIDTYPE_PGID, pgrp);
979 attach_pid(p, PIDTYPE_PGID, pid); 977 set_task_pgrp(p, pid_nr(pgrp));
980 set_task_pgrp(p, pid_nr(pid));
981 } 978 }
982 979
983 err = 0; 980 err = 0;
@@ -994,17 +991,14 @@ asmlinkage long sys_getpgid(pid_t pid)
994 else { 991 else {
995 int retval; 992 int retval;
996 struct task_struct *p; 993 struct task_struct *p;
997 struct pid_namespace *ns;
998
999 ns = current->nsproxy->pid_ns;
1000 994
1001 read_lock(&tasklist_lock); 995 read_lock(&tasklist_lock);
1002 p = find_task_by_pid_ns(pid, ns); 996 p = find_task_by_vpid(pid);
1003 retval = -ESRCH; 997 retval = -ESRCH;
1004 if (p) { 998 if (p) {
1005 retval = security_task_getpgid(p); 999 retval = security_task_getpgid(p);
1006 if (!retval) 1000 if (!retval)
1007 retval = task_pgrp_nr_ns(p, ns); 1001 retval = task_pgrp_vnr(p);
1008 } 1002 }
1009 read_unlock(&tasklist_lock); 1003 read_unlock(&tasklist_lock);
1010 return retval; 1004 return retval;
@@ -1028,19 +1022,16 @@ asmlinkage long sys_getsid(pid_t pid)
1028 else { 1022 else {
1029 int retval; 1023 int retval;
1030 struct task_struct *p; 1024 struct task_struct *p;
1031 struct pid_namespace *ns;
1032
1033 ns = current->nsproxy->pid_ns;
1034 1025
1035 read_lock(&tasklist_lock); 1026 rcu_read_lock();
1036 p = find_task_by_pid_ns(pid, ns); 1027 p = find_task_by_vpid(pid);
1037 retval = -ESRCH; 1028 retval = -ESRCH;
1038 if (p) { 1029 if (p) {
1039 retval = security_task_getsid(p); 1030 retval = security_task_getsid(p);
1040 if (!retval) 1031 if (!retval)
1041 retval = task_session_nr_ns(p, ns); 1032 retval = task_session_vnr(p);
1042 } 1033 }
1043 read_unlock(&tasklist_lock); 1034 rcu_read_unlock();
1044 return retval; 1035 return retval;
1045 } 1036 }
1046} 1037}
@@ -1048,35 +1039,29 @@ asmlinkage long sys_getsid(pid_t pid)
1048asmlinkage long sys_setsid(void) 1039asmlinkage long sys_setsid(void)
1049{ 1040{
1050 struct task_struct *group_leader = current->group_leader; 1041 struct task_struct *group_leader = current->group_leader;
1051 pid_t session; 1042 struct pid *sid = task_pid(group_leader);
1043 pid_t session = pid_vnr(sid);
1052 int err = -EPERM; 1044 int err = -EPERM;
1053 1045
1054 write_lock_irq(&tasklist_lock); 1046 write_lock_irq(&tasklist_lock);
1055
1056 /* Fail if I am already a session leader */ 1047 /* Fail if I am already a session leader */
1057 if (group_leader->signal->leader) 1048 if (group_leader->signal->leader)
1058 goto out; 1049 goto out;
1059 1050
1060 session = group_leader->pid;
1061 /* Fail if a process group id already exists that equals the 1051 /* Fail if a process group id already exists that equals the
1062 * proposed session id. 1052 * proposed session id.
1063 *
1064 * Don't check if session id == 1 because kernel threads use this
1065 * session id and so the check will always fail and make it so
1066 * init cannot successfully call setsid.
1067 */ 1053 */
1068 if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID, 1054 if (pid_task(sid, PIDTYPE_PGID))
1069 session, &init_pid_ns))
1070 goto out; 1055 goto out;
1071 1056
1072 group_leader->signal->leader = 1; 1057 group_leader->signal->leader = 1;
1073 __set_special_pids(session, session); 1058 __set_special_pids(sid);
1074 1059
1075 spin_lock(&group_leader->sighand->siglock); 1060 spin_lock(&group_leader->sighand->siglock);
1076 group_leader->signal->tty = NULL; 1061 group_leader->signal->tty = NULL;
1077 spin_unlock(&group_leader->sighand->siglock); 1062 spin_unlock(&group_leader->sighand->siglock);
1078 1063
1079 err = task_pgrp_vnr(group_leader); 1064 err = session;
1080out: 1065out:
1081 write_unlock_irq(&tasklist_lock); 1066 write_unlock_irq(&tasklist_lock);
1082 return err; 1067 return err;
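
The new sys_setsid() above expresses the "proposed session id must not already name a process group" rule directly on the struct pid: pid_task(sid, PIDTYPE_PGID) is non-NULL exactly when some task already uses that pid as its pgrp, which also removes the old numeric special case for session id 1. A condensed sketch of the core of the new path, under the same locking assumptions as the syscall:

	struct pid *sid = task_pid(group_leader);

	/* Fails iff some task is already attached to this pid as a pgrp,
	 * replacing the old lookup by number in the init namespace. */
	if (pid_task(sid, PIDTYPE_PGID))
		goto out;

	group_leader->signal->leader = 1;
	__set_special_pids(sid);	/* become session and group leader */
	err = pid_vnr(sid);		/* report the id in the caller's namespace */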
@@ -1145,16 +1130,16 @@ static int groups_to_user(gid_t __user *grouplist,
1145 struct group_info *group_info) 1130 struct group_info *group_info)
1146{ 1131{
1147 int i; 1132 int i;
1148 int count = group_info->ngroups; 1133 unsigned int count = group_info->ngroups;
1149 1134
1150 for (i = 0; i < group_info->nblocks; i++) { 1135 for (i = 0; i < group_info->nblocks; i++) {
1151 int cp_count = min(NGROUPS_PER_BLOCK, count); 1136 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1152 int off = i * NGROUPS_PER_BLOCK; 1137 unsigned int len = cp_count * sizeof(*grouplist);
1153 int len = cp_count * sizeof(*grouplist);
1154 1138
1155 if (copy_to_user(grouplist+off, group_info->blocks[i], len)) 1139 if (copy_to_user(grouplist, group_info->blocks[i], len))
1156 return -EFAULT; 1140 return -EFAULT;
1157 1141
1142 grouplist += NGROUPS_PER_BLOCK;
1158 count -= cp_count; 1143 count -= cp_count;
1159 } 1144 }
1160 return 0; 1145 return 0;
@@ -1165,16 +1150,16 @@ static int groups_from_user(struct group_info *group_info,
1165 gid_t __user *grouplist) 1150 gid_t __user *grouplist)
1166{ 1151{
1167 int i; 1152 int i;
1168 int count = group_info->ngroups; 1153 unsigned int count = group_info->ngroups;
1169 1154
1170 for (i = 0; i < group_info->nblocks; i++) { 1155 for (i = 0; i < group_info->nblocks; i++) {
1171 int cp_count = min(NGROUPS_PER_BLOCK, count); 1156 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1172 int off = i * NGROUPS_PER_BLOCK; 1157 unsigned int len = cp_count * sizeof(*grouplist);
1173 int len = cp_count * sizeof(*grouplist);
1174 1158
1175 if (copy_from_user(group_info->blocks[i], grouplist+off, len)) 1159 if (copy_from_user(group_info->blocks[i], grouplist, len))
1176 return -EFAULT; 1160 return -EFAULT;
1177 1161
1162 grouplist += NGROUPS_PER_BLOCK;
1178 count -= cp_count; 1163 count -= cp_count;
1179 } 1164 }
1180 return 0; 1165 return 0;
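
groups_to_user()/groups_from_user() above now advance the user pointer one block at a time instead of recomputing an offset, and switch the counters to unsigned. The same chunked-copy pattern in a stand-alone user-space form (plain memcpy() standing in for copy_to_user(); the block size is illustrative only):

	#include <string.h>

	#define GROUPS_PER_BLOCK 32			/* illustrative block size */

	/* Copy 'count' ids out of an array of fixed-size blocks into 'dst'. */
	static int blocks_to_array(unsigned int *dst, unsigned int *blocks[],
				   unsigned int nblocks, unsigned int count)
	{
		unsigned int i;

		for (i = 0; i < nblocks; i++) {
			unsigned int n = count < GROUPS_PER_BLOCK ? count : GROUPS_PER_BLOCK;

			memcpy(dst, blocks[i], n * sizeof(*dst));
			dst += GROUPS_PER_BLOCK;	/* advance the destination, as the patch does */
			count -= n;
		}
		return 0;
	}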
@@ -1472,7 +1457,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1472 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1457 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1473 !capable(CAP_SYS_RESOURCE)) 1458 !capable(CAP_SYS_RESOURCE))
1474 return -EPERM; 1459 return -EPERM;
1475 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) 1460 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1476 return -EPERM; 1461 return -EPERM;
1477 1462
1478 retval = security_task_setrlimit(resource, &new_rlim); 1463 retval = security_task_setrlimit(resource, &new_rlim);
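
With the change above, the RLIMIT_NOFILE hard limit is checked against the runtime-tunable fs.nr_open sysctl rather than the compile-time NR_OPEN constant. A user-space sketch of raising the per-process limit up to whatever the current ceiling allows (behaviour assumed from the patched kernel; raising rlim_max still needs CAP_SYS_RESOURCE; minimal error handling):

	#include <stdio.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rlimit rl;
		unsigned long nr_open = 0;
		FILE *f = fopen("/proc/sys/fs/nr_open", "r");	/* the new ceiling */

		if (f) {
			if (fscanf(f, "%lu", &nr_open) != 1)
				nr_open = 0;
			fclose(f);
		}
		if (nr_open && getrlimit(RLIMIT_NOFILE, &rl) == 0) {
			rl.rlim_max = nr_open;	/* values above fs.nr_open now get -EPERM */
			rl.rlim_cur = nr_open;
			if (setrlimit(RLIMIT_NOFILE, &rl) != 0)
				perror("setrlimit");
		}
		return 0;
	}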
@@ -1637,7 +1622,7 @@ asmlinkage long sys_umask(int mask)
1637 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1622 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1638 return mask; 1623 return mask;
1639} 1624}
1640 1625
1641asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1626asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1642 unsigned long arg4, unsigned long arg5) 1627 unsigned long arg4, unsigned long arg5)
1643{ 1628{
@@ -1742,6 +1727,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1742 error = prctl_set_seccomp(arg2); 1727 error = prctl_set_seccomp(arg2);
1743 break; 1728 break;
1744 1729
1730 case PR_CAPBSET_READ:
1731 if (!cap_valid(arg2))
1732 return -EINVAL;
1733 return !!cap_raised(current->cap_bset, arg2);
1734 case PR_CAPBSET_DROP:
1735#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
1736 return cap_prctl_drop(arg2);
1737#else
1738 return -EINVAL;
1739#endif
1740
1745 default: 1741 default:
1746 error = -EINVAL; 1742 error = -EINVAL;
1747 break; 1743 break;
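
The two new prctl() options above expose the per-task capability bounding set to user space: PR_CAPBSET_READ tests a single capability bit and PR_CAPBSET_DROP removes one (the drop path lives behind CONFIG_SECURITY_FILE_CAPABILITIES and requires CAP_SETPCAP). A user-space sketch, assuming the constants from <linux/prctl.h> and <linux/capability.h> on a patched kernel:

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/capability.h>

	#ifndef PR_CAPBSET_READ
	#define PR_CAPBSET_READ	23	/* assumed values if the headers predate this patch */
	#define PR_CAPBSET_DROP	24
	#endif

	int main(void)
	{
		int have = prctl(PR_CAPBSET_READ, CAP_NET_RAW, 0, 0, 0);

		printf("CAP_NET_RAW in bounding set: %d\n", have);

		/* Permanently remove it for this task and its children (needs CAP_SETPCAP). */
		if (prctl(PR_CAPBSET_DROP, CAP_NET_RAW, 0, 0, 0) != 0)
			perror("PR_CAPBSET_DROP");

		return 0;
	}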
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index beee5b3b68a2..5b9b467de070 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -154,7 +154,10 @@ cond_syscall(sys_ioprio_get);
154 154
155/* New file descriptors */ 155/* New file descriptors */
156cond_syscall(sys_signalfd); 156cond_syscall(sys_signalfd);
157cond_syscall(sys_timerfd);
158cond_syscall(compat_sys_signalfd); 157cond_syscall(compat_sys_signalfd);
159cond_syscall(compat_sys_timerfd); 158cond_syscall(sys_timerfd_create);
159cond_syscall(sys_timerfd_settime);
160cond_syscall(sys_timerfd_gettime);
161cond_syscall(compat_sys_timerfd_settime);
162cond_syscall(compat_sys_timerfd_gettime);
160cond_syscall(sys_eventfd); 163cond_syscall(sys_eventfd);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7cb1ac3e6fff..8b7e95411795 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,7 +37,6 @@
37#include <linux/highuid.h> 37#include <linux/highuid.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/security.h>
41#include <linux/initrd.h> 40#include <linux/initrd.h>
42#include <linux/times.h> 41#include <linux/times.h>
43#include <linux/limits.h> 42#include <linux/limits.h>
@@ -67,14 +66,13 @@ extern int sysctl_overcommit_memory;
67extern int sysctl_overcommit_ratio; 66extern int sysctl_overcommit_ratio;
68extern int sysctl_panic_on_oom; 67extern int sysctl_panic_on_oom;
69extern int sysctl_oom_kill_allocating_task; 68extern int sysctl_oom_kill_allocating_task;
69extern int sysctl_oom_dump_tasks;
70extern int max_threads; 70extern int max_threads;
71extern int core_uses_pid; 71extern int core_uses_pid;
72extern int suid_dumpable; 72extern int suid_dumpable;
73extern char core_pattern[]; 73extern char core_pattern[];
74extern int pid_max; 74extern int pid_max;
75extern int min_free_kbytes; 75extern int min_free_kbytes;
76extern int printk_ratelimit_jiffies;
77extern int printk_ratelimit_burst;
78extern int pid_max_min, pid_max_max; 76extern int pid_max_min, pid_max_max;
79extern int sysctl_drop_caches; 77extern int sysctl_drop_caches;
80extern int percpu_pagelist_fraction; 78extern int percpu_pagelist_fraction;
@@ -84,8 +82,11 @@ extern int sysctl_stat_interval;
84extern int latencytop_enabled; 82extern int latencytop_enabled;
85 83
86/* Constants used for minimum and maximum */ 84/* Constants used for minimum and maximum */
87#ifdef CONFIG_DETECT_SOFTLOCKUP 85#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
88static int one = 1; 86static int one = 1;
87#endif
88
89#ifdef CONFIG_DETECT_SOFTLOCKUP
89static int sixty = 60; 90static int sixty = 60;
90#endif 91#endif
91 92
@@ -310,22 +311,6 @@ static struct ctl_table kern_table[] = {
310 .mode = 0644, 311 .mode = 0644,
311 .proc_handler = &proc_dointvec, 312 .proc_handler = &proc_dointvec,
312 }, 313 },
313 {
314 .ctl_name = CTL_UNNUMBERED,
315 .procname = "sched_rt_period_ms",
316 .data = &sysctl_sched_rt_period,
317 .maxlen = sizeof(unsigned int),
318 .mode = 0644,
319 .proc_handler = &proc_dointvec,
320 },
321 {
322 .ctl_name = CTL_UNNUMBERED,
323 .procname = "sched_rt_ratio",
324 .data = &sysctl_sched_rt_ratio,
325 .maxlen = sizeof(unsigned int),
326 .mode = 0644,
327 .proc_handler = &proc_dointvec,
328 },
329#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 314#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
330 { 315 {
331 .ctl_name = CTL_UNNUMBERED, 316 .ctl_name = CTL_UNNUMBERED,
@@ -347,6 +332,22 @@ static struct ctl_table kern_table[] = {
347#endif 332#endif
348 { 333 {
349 .ctl_name = CTL_UNNUMBERED, 334 .ctl_name = CTL_UNNUMBERED,
335 .procname = "sched_rt_period_us",
336 .data = &sysctl_sched_rt_period,
337 .maxlen = sizeof(unsigned int),
338 .mode = 0644,
339 .proc_handler = &proc_dointvec,
340 },
341 {
342 .ctl_name = CTL_UNNUMBERED,
343 .procname = "sched_rt_runtime_us",
344 .data = &sysctl_sched_rt_runtime,
345 .maxlen = sizeof(int),
346 .mode = 0644,
347 .proc_handler = &proc_dointvec,
348 },
349 {
350 .ctl_name = CTL_UNNUMBERED,
350 .procname = "sched_compat_yield", 351 .procname = "sched_compat_yield",
351 .data = &sysctl_sched_compat_yield, 352 .data = &sysctl_sched_compat_yield,
352 .maxlen = sizeof(unsigned int), 353 .maxlen = sizeof(unsigned int),
@@ -416,15 +417,6 @@ static struct ctl_table kern_table[] = {
416 .proc_handler = &proc_dointvec, 417 .proc_handler = &proc_dointvec,
417 }, 418 },
418#endif 419#endif
419#ifdef CONFIG_SECURITY_CAPABILITIES
420 {
421 .procname = "cap-bound",
422 .data = &cap_bset,
423 .maxlen = sizeof(kernel_cap_t),
424 .mode = 0600,
425 .proc_handler = &proc_dointvec_bset,
426 },
427#endif /* def CONFIG_SECURITY_CAPABILITIES */
428#ifdef CONFIG_BLK_DEV_INITRD 420#ifdef CONFIG_BLK_DEV_INITRD
429 { 421 {
430 .ctl_name = KERN_REALROOTDEV, 422 .ctl_name = KERN_REALROOTDEV,
@@ -496,14 +488,6 @@ static struct ctl_table kern_table[] = {
496 .mode = 0644, 488 .mode = 0644,
497 .proc_handler = &proc_dointvec, 489 .proc_handler = &proc_dointvec,
498 }, 490 },
499 {
500 .ctl_name = KERN_PRINTK,
501 .procname = "printk",
502 .data = &console_loglevel,
503 .maxlen = 4*sizeof(int),
504 .mode = 0644,
505 .proc_handler = &proc_dointvec,
506 },
507#ifdef CONFIG_KMOD 491#ifdef CONFIG_KMOD
508 { 492 {
509 .ctl_name = KERN_MODPROBE, 493 .ctl_name = KERN_MODPROBE,
@@ -650,6 +634,15 @@ static struct ctl_table kern_table[] = {
650 .mode = 0644, 634 .mode = 0644,
651 .proc_handler = &proc_dointvec, 635 .proc_handler = &proc_dointvec,
652 }, 636 },
637#if defined CONFIG_PRINTK
638 {
639 .ctl_name = KERN_PRINTK,
640 .procname = "printk",
641 .data = &console_loglevel,
642 .maxlen = 4*sizeof(int),
643 .mode = 0644,
644 .proc_handler = &proc_dointvec,
645 },
653 { 646 {
654 .ctl_name = KERN_PRINTK_RATELIMIT, 647 .ctl_name = KERN_PRINTK_RATELIMIT,
655 .procname = "printk_ratelimit", 648 .procname = "printk_ratelimit",
@@ -667,6 +660,7 @@ static struct ctl_table kern_table[] = {
667 .mode = 0644, 660 .mode = 0644,
668 .proc_handler = &proc_dointvec, 661 .proc_handler = &proc_dointvec,
669 }, 662 },
663#endif
670 { 664 {
671 .ctl_name = KERN_NGROUPS_MAX, 665 .ctl_name = KERN_NGROUPS_MAX,
672 .procname = "ngroups_max", 666 .procname = "ngroups_max",
@@ -877,6 +871,14 @@ static struct ctl_table vm_table[] = {
877 .proc_handler = &proc_dointvec, 871 .proc_handler = &proc_dointvec,
878 }, 872 },
879 { 873 {
874 .ctl_name = CTL_UNNUMBERED,
875 .procname = "oom_dump_tasks",
876 .data = &sysctl_oom_dump_tasks,
877 .maxlen = sizeof(sysctl_oom_dump_tasks),
878 .mode = 0644,
879 .proc_handler = &proc_dointvec,
880 },
881 {
880 .ctl_name = VM_OVERCOMMIT_RATIO, 882 .ctl_name = VM_OVERCOMMIT_RATIO,
881 .procname = "overcommit_ratio", 883 .procname = "overcommit_ratio",
882 .data = &sysctl_overcommit_ratio, 884 .data = &sysctl_overcommit_ratio,
@@ -976,10 +978,10 @@ static struct ctl_table vm_table[] = {
976 { 978 {
977 .ctl_name = CTL_UNNUMBERED, 979 .ctl_name = CTL_UNNUMBERED,
978 .procname = "nr_overcommit_hugepages", 980 .procname = "nr_overcommit_hugepages",
979 .data = &nr_overcommit_huge_pages, 981 .data = &sysctl_overcommit_huge_pages,
980 .maxlen = sizeof(nr_overcommit_huge_pages), 982 .maxlen = sizeof(sysctl_overcommit_huge_pages),
981 .mode = 0644, 983 .mode = 0644,
982 .proc_handler = &proc_doulongvec_minmax, 984 .proc_handler = &hugetlb_overcommit_handler,
983 }, 985 },
984#endif 986#endif
985 { 987 {
@@ -1150,6 +1152,19 @@ static struct ctl_table vm_table[] = {
1150 .extra1 = &zero, 1152 .extra1 = &zero,
1151 }, 1153 },
1152#endif 1154#endif
1155#ifdef CONFIG_HIGHMEM
1156 {
1157 .ctl_name = CTL_UNNUMBERED,
1158 .procname = "highmem_is_dirtyable",
1159 .data = &vm_highmem_is_dirtyable,
1160 .maxlen = sizeof(vm_highmem_is_dirtyable),
1161 .mode = 0644,
1162 .proc_handler = &proc_dointvec_minmax,
1163 .strategy = &sysctl_intvec,
1164 .extra1 = &zero,
1165 .extra2 = &one,
1166 },
1167#endif
1153/* 1168/*
1154 * NOTE: do not add new entries to this table unless you have read 1169 * NOTE: do not add new entries to this table unless you have read
1155 * Documentation/sysctl/ctl_unnumbered.txt 1170 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1196,6 +1211,14 @@ static struct ctl_table fs_table[] = {
1196 .proc_handler = &proc_dointvec, 1211 .proc_handler = &proc_dointvec,
1197 }, 1212 },
1198 { 1213 {
1214 .ctl_name = CTL_UNNUMBERED,
1215 .procname = "nr_open",
1216 .data = &sysctl_nr_open,
1217 .maxlen = sizeof(int),
1218 .mode = 0644,
1219 .proc_handler = &proc_dointvec,
1220 },
1221 {
1199 .ctl_name = FS_DENTRY, 1222 .ctl_name = FS_DENTRY,
1200 .procname = "dentry-state", 1223 .procname = "dentry-state",
1201 .data = &dentry_stat, 1224 .data = &dentry_stat,
@@ -2080,26 +2103,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
2080 return 0; 2103 return 0;
2081} 2104}
2082 2105
2083#ifdef CONFIG_SECURITY_CAPABILITIES
2084/*
2085 * init may raise the set.
2086 */
2087
2088int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
2089 void __user *buffer, size_t *lenp, loff_t *ppos)
2090{
2091 int op;
2092
2093 if (write && !capable(CAP_SYS_MODULE)) {
2094 return -EPERM;
2095 }
2096
2097 op = is_global_init(current) ? OP_SET : OP_AND;
2098 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
2099 do_proc_dointvec_bset_conv,&op);
2100}
2101#endif /* def CONFIG_SECURITY_CAPABILITIES */
2102
2103/* 2106/*
2104 * Taint values can only be increased 2107 * Taint values can only be increased
2105 */ 2108 */
@@ -2484,7 +2487,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2484 pid_t tmp; 2487 pid_t tmp;
2485 int r; 2488 int r;
2486 2489
2487 tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns); 2490 tmp = pid_vnr(cad_pid);
2488 2491
2489 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2492 r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
2490 lenp, ppos, NULL, NULL); 2493 lenp, ppos, NULL, NULL);
@@ -2513,12 +2516,6 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2513 return -ENOSYS; 2516 return -ENOSYS;
2514} 2517}
2515 2518
2516int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
2517 void __user *buffer, size_t *lenp, loff_t *ppos)
2518{
2519 return -ENOSYS;
2520}
2521
2522int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2519int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
2523 void __user *buffer, size_t *lenp, loff_t *ppos) 2520 void __user *buffer, size_t *lenp, loff_t *ppos)
2524{ 2521{
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c3206fa50048..c09350d564f2 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -8,10 +8,10 @@
8struct trans_ctl_table { 8struct trans_ctl_table {
9 int ctl_name; 9 int ctl_name;
10 const char *procname; 10 const char *procname;
11 struct trans_ctl_table *child; 11 const struct trans_ctl_table *child;
12}; 12};
13 13
14static struct trans_ctl_table trans_random_table[] = { 14static const struct trans_ctl_table trans_random_table[] = {
15 { RANDOM_POOLSIZE, "poolsize" }, 15 { RANDOM_POOLSIZE, "poolsize" },
16 { RANDOM_ENTROPY_COUNT, "entropy_avail" }, 16 { RANDOM_ENTROPY_COUNT, "entropy_avail" },
17 { RANDOM_READ_THRESH, "read_wakeup_threshold" }, 17 { RANDOM_READ_THRESH, "read_wakeup_threshold" },
@@ -21,13 +21,13 @@ static struct trans_ctl_table trans_random_table[] = {
21 {} 21 {}
22}; 22};
23 23
24static struct trans_ctl_table trans_pty_table[] = { 24static const struct trans_ctl_table trans_pty_table[] = {
25 { PTY_MAX, "max" }, 25 { PTY_MAX, "max" },
26 { PTY_NR, "nr" }, 26 { PTY_NR, "nr" },
27 {} 27 {}
28}; 28};
29 29
30static struct trans_ctl_table trans_kern_table[] = { 30static const struct trans_ctl_table trans_kern_table[] = {
31 { KERN_OSTYPE, "ostype" }, 31 { KERN_OSTYPE, "ostype" },
32 { KERN_OSRELEASE, "osrelease" }, 32 { KERN_OSRELEASE, "osrelease" },
33 /* KERN_OSREV not used */ 33 /* KERN_OSREV not used */
@@ -37,10 +37,6 @@ static struct trans_ctl_table trans_kern_table[] = {
37 { KERN_NODENAME, "hostname" }, 37 { KERN_NODENAME, "hostname" },
38 { KERN_DOMAINNAME, "domainname" }, 38 { KERN_DOMAINNAME, "domainname" },
39 39
40#ifdef CONFIG_SECURITY_CAPABILITIES
41 { KERN_CAP_BSET, "cap-bound" },
42#endif /* def CONFIG_SECURITY_CAPABILITIES */
43
44 { KERN_PANIC, "panic" }, 40 { KERN_PANIC, "panic" },
45 { KERN_REALROOTDEV, "real-root-dev" }, 41 { KERN_REALROOTDEV, "real-root-dev" },
46 42
@@ -111,7 +107,7 @@ static struct trans_ctl_table trans_kern_table[] = {
111 {} 107 {}
112}; 108};
113 109
114static struct trans_ctl_table trans_vm_table[] = { 110static const struct trans_ctl_table trans_vm_table[] = {
115 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, 111 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
116 { VM_PAGE_CLUSTER, "page-cluster" }, 112 { VM_PAGE_CLUSTER, "page-cluster" },
117 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, 113 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
@@ -143,7 +139,7 @@ static struct trans_ctl_table trans_vm_table[] = {
143 {} 139 {}
144}; 140};
145 141
146static struct trans_ctl_table trans_net_core_table[] = { 142static const struct trans_ctl_table trans_net_core_table[] = {
147 { NET_CORE_WMEM_MAX, "wmem_max" }, 143 { NET_CORE_WMEM_MAX, "wmem_max" },
148 { NET_CORE_RMEM_MAX, "rmem_max" }, 144 { NET_CORE_RMEM_MAX, "rmem_max" },
149 { NET_CORE_WMEM_DEFAULT, "wmem_default" }, 145 { NET_CORE_WMEM_DEFAULT, "wmem_default" },
@@ -169,14 +165,14 @@ static struct trans_ctl_table trans_net_core_table[] = {
169 {}, 165 {},
170}; 166};
171 167
172static struct trans_ctl_table trans_net_unix_table[] = { 168static const struct trans_ctl_table trans_net_unix_table[] = {
173 /* NET_UNIX_DESTROY_DELAY unused */ 169 /* NET_UNIX_DESTROY_DELAY unused */
174 /* NET_UNIX_DELETE_DELAY unused */ 170 /* NET_UNIX_DELETE_DELAY unused */
175 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, 171 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
176 {} 172 {}
177}; 173};
178 174
179static struct trans_ctl_table trans_net_ipv4_route_table[] = { 175static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
180 { NET_IPV4_ROUTE_FLUSH, "flush" }, 176 { NET_IPV4_ROUTE_FLUSH, "flush" },
181 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" }, 177 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
182 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" }, 178 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
@@ -199,7 +195,7 @@ static struct trans_ctl_table trans_net_ipv4_route_table[] = {
199 {} 195 {}
200}; 196};
201 197
202static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { 198static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
203 { NET_IPV4_CONF_FORWARDING, "forwarding" }, 199 { NET_IPV4_CONF_FORWARDING, "forwarding" },
204 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, 200 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
205 201
@@ -226,14 +222,14 @@ static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
226 {} 222 {}
227}; 223};
228 224
229static struct trans_ctl_table trans_net_ipv4_conf_table[] = { 225static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
230 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table }, 226 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
231 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table }, 227 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
232 { 0, NULL, trans_net_ipv4_conf_vars_table }, 228 { 0, NULL, trans_net_ipv4_conf_vars_table },
233 {} 229 {}
234}; 230};
235 231
236static struct trans_ctl_table trans_net_neigh_vars_table[] = { 232static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
237 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, 233 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
238 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, 234 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
239 { NET_NEIGH_APP_SOLICIT, "app_solicit" }, 235 { NET_NEIGH_APP_SOLICIT, "app_solicit" },
@@ -255,13 +251,13 @@ static struct trans_ctl_table trans_net_neigh_vars_table[] = {
255 {} 251 {}
256}; 252};
257 253
258static struct trans_ctl_table trans_net_neigh_table[] = { 254static const struct trans_ctl_table trans_net_neigh_table[] = {
259 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table }, 255 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
260 { 0, NULL, trans_net_neigh_vars_table }, 256 { 0, NULL, trans_net_neigh_vars_table },
261 {} 257 {}
262}; 258};
263 259
264static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { 260static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
265 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, 261 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
266 262
267 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" }, 263 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
@@ -298,7 +294,7 @@ static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
298 {} 294 {}
299}; 295};
300 296
301static struct trans_ctl_table trans_net_ipv4_table[] = { 297static const struct trans_ctl_table trans_net_ipv4_table[] = {
302 { NET_IPV4_FORWARD, "ip_forward" }, 298 { NET_IPV4_FORWARD, "ip_forward" },
303 { NET_IPV4_DYNADDR, "ip_dynaddr" }, 299 { NET_IPV4_DYNADDR, "ip_dynaddr" },
304 300
@@ -397,13 +393,13 @@ static struct trans_ctl_table trans_net_ipv4_table[] = {
397 {} 393 {}
398}; 394};
399 395
400static struct trans_ctl_table trans_net_ipx_table[] = { 396static const struct trans_ctl_table trans_net_ipx_table[] = {
401 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, 397 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
402 /* NET_IPX_FORWARDING unused */ 398 /* NET_IPX_FORWARDING unused */
403 {} 399 {}
404}; 400};
405 401
406static struct trans_ctl_table trans_net_atalk_table[] = { 402static const struct trans_ctl_table trans_net_atalk_table[] = {
407 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, 403 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
408 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, 404 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
409 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, 405 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
@@ -411,7 +407,7 @@ static struct trans_ctl_table trans_net_atalk_table[] = {
411 {}, 407 {},
412}; 408};
413 409
414static struct trans_ctl_table trans_net_netrom_table[] = { 410static const struct trans_ctl_table trans_net_netrom_table[] = {
415 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, 411 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
416 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, 412 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
417 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, 413 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
@@ -427,7 +423,7 @@ static struct trans_ctl_table trans_net_netrom_table[] = {
427 {} 423 {}
428}; 424};
429 425
430static struct trans_ctl_table trans_net_ax25_param_table[] = { 426static const struct trans_ctl_table trans_net_ax25_param_table[] = {
431 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, 427 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
432 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, 428 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
433 { NET_AX25_BACKOFF_TYPE, "backoff_type" }, 429 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
@@ -445,12 +441,12 @@ static struct trans_ctl_table trans_net_ax25_param_table[] = {
445 {} 441 {}
446}; 442};
447 443
448static struct trans_ctl_table trans_net_ax25_table[] = { 444static const struct trans_ctl_table trans_net_ax25_table[] = {
449 { 0, NULL, trans_net_ax25_param_table }, 445 { 0, NULL, trans_net_ax25_param_table },
450 {} 446 {}
451}; 447};
452 448
453static struct trans_ctl_table trans_net_bridge_table[] = { 449static const struct trans_ctl_table trans_net_bridge_table[] = {
454 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, 450 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
455 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, 451 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
456 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" }, 452 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
@@ -459,7 +455,7 @@ static struct trans_ctl_table trans_net_bridge_table[] = {
459 {} 455 {}
460}; 456};
461 457
462static struct trans_ctl_table trans_net_rose_table[] = { 458static const struct trans_ctl_table trans_net_rose_table[] = {
463 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, 459 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
464 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, 460 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
465 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, 461 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
@@ -473,7 +469,7 @@ static struct trans_ctl_table trans_net_rose_table[] = {
473 {} 469 {}
474}; 470};
475 471
476static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { 472static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
477 { NET_IPV6_FORWARDING, "forwarding" }, 473 { NET_IPV6_FORWARDING, "forwarding" },
478 { NET_IPV6_HOP_LIMIT, "hop_limit" }, 474 { NET_IPV6_HOP_LIMIT, "hop_limit" },
479 { NET_IPV6_MTU, "mtu" }, 475 { NET_IPV6_MTU, "mtu" },
@@ -501,14 +497,14 @@ static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
501 {} 497 {}
502}; 498};
503 499
504static struct trans_ctl_table trans_net_ipv6_conf_table[] = { 500static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
505 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table }, 501 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
506 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table }, 502 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
507 { 0, NULL, trans_net_ipv6_conf_var_table }, 503 { 0, NULL, trans_net_ipv6_conf_var_table },
508 {} 504 {}
509}; 505};
510 506
511static struct trans_ctl_table trans_net_ipv6_route_table[] = { 507static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
512 { NET_IPV6_ROUTE_FLUSH, "flush" }, 508 { NET_IPV6_ROUTE_FLUSH, "flush" },
513 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, 509 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
514 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, 510 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
@@ -522,12 +518,12 @@ static struct trans_ctl_table trans_net_ipv6_route_table[] = {
522 {} 518 {}
523}; 519};
524 520
525static struct trans_ctl_table trans_net_ipv6_icmp_table[] = { 521static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
526 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, 522 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
527 {} 523 {}
528}; 524};
529 525
530static struct trans_ctl_table trans_net_ipv6_table[] = { 526static const struct trans_ctl_table trans_net_ipv6_table[] = {
531 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table }, 527 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
532 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table }, 528 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
533 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table }, 529 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
@@ -542,7 +538,7 @@ static struct trans_ctl_table trans_net_ipv6_table[] = {
542 {} 538 {}
543}; 539};
544 540
545static struct trans_ctl_table trans_net_x25_table[] = { 541static const struct trans_ctl_table trans_net_x25_table[] = {
546 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, 542 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
547 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, 543 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
548 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, 544 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
@@ -552,13 +548,13 @@ static struct trans_ctl_table trans_net_x25_table[] = {
552 {} 548 {}
553}; 549};
554 550
555static struct trans_ctl_table trans_net_tr_table[] = { 551static const struct trans_ctl_table trans_net_tr_table[] = {
556 { NET_TR_RIF_TIMEOUT, "rif_timeout" }, 552 { NET_TR_RIF_TIMEOUT, "rif_timeout" },
557 {} 553 {}
558}; 554};
559 555
560 556
561static struct trans_ctl_table trans_net_decnet_conf_vars[] = { 557static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
562 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, 558 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
563 { NET_DECNET_CONF_DEV_PRIORITY, "priority" }, 559 { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
564 { NET_DECNET_CONF_DEV_T2, "t2" }, 560 { NET_DECNET_CONF_DEV_T2, "t2" },
@@ -566,12 +562,12 @@ static struct trans_ctl_table trans_net_decnet_conf_vars[] = {
566 {} 562 {}
567}; 563};
568 564
569static struct trans_ctl_table trans_net_decnet_conf[] = { 565static const struct trans_ctl_table trans_net_decnet_conf[] = {
570 { 0, NULL, trans_net_decnet_conf_vars }, 566 { 0, NULL, trans_net_decnet_conf_vars },
571 {} 567 {}
572}; 568};
573 569
574static struct trans_ctl_table trans_net_decnet_table[] = { 570static const struct trans_ctl_table trans_net_decnet_table[] = {
575 { NET_DECNET_CONF, "conf", trans_net_decnet_conf }, 571 { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
576 { NET_DECNET_NODE_ADDRESS, "node_address" }, 572 { NET_DECNET_NODE_ADDRESS, "node_address" },
577 { NET_DECNET_NODE_NAME, "node_name" }, 573 { NET_DECNET_NODE_NAME, "node_name" },
@@ -589,7 +585,7 @@ static struct trans_ctl_table trans_net_decnet_table[] = {
589 {} 585 {}
590}; 586};
591 587
592static struct trans_ctl_table trans_net_sctp_table[] = { 588static const struct trans_ctl_table trans_net_sctp_table[] = {
593 { NET_SCTP_RTO_INITIAL, "rto_initial" }, 589 { NET_SCTP_RTO_INITIAL, "rto_initial" },
594 { NET_SCTP_RTO_MIN, "rto_min" }, 590 { NET_SCTP_RTO_MIN, "rto_min" },
595 { NET_SCTP_RTO_MAX, "rto_max" }, 591 { NET_SCTP_RTO_MAX, "rto_max" },
@@ -610,7 +606,7 @@ static struct trans_ctl_table trans_net_sctp_table[] = {
610 {} 606 {}
611}; 607};
612 608
613static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { 609static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
614 { NET_LLC2_ACK_TIMEOUT, "ack" }, 610 { NET_LLC2_ACK_TIMEOUT, "ack" },
615 { NET_LLC2_P_TIMEOUT, "p" }, 611 { NET_LLC2_P_TIMEOUT, "p" },
616 { NET_LLC2_REJ_TIMEOUT, "rej" }, 612 { NET_LLC2_REJ_TIMEOUT, "rej" },
@@ -618,23 +614,23 @@ static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
618 {} 614 {}
619}; 615};
620 616
621static struct trans_ctl_table trans_net_llc_station_table[] = { 617static const struct trans_ctl_table trans_net_llc_station_table[] = {
622 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, 618 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
623 {} 619 {}
624}; 620};
625 621
626static struct trans_ctl_table trans_net_llc_llc2_table[] = { 622static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
627 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table }, 623 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
628 {} 624 {}
629}; 625};
630 626
631static struct trans_ctl_table trans_net_llc_table[] = { 627static const struct trans_ctl_table trans_net_llc_table[] = {
632 { NET_LLC2, "llc2", trans_net_llc_llc2_table }, 628 { NET_LLC2, "llc2", trans_net_llc_llc2_table },
633 { NET_LLC_STATION, "station", trans_net_llc_station_table }, 629 { NET_LLC_STATION, "station", trans_net_llc_station_table },
634 {} 630 {}
635}; 631};
636 632
637static struct trans_ctl_table trans_net_netfilter_table[] = { 633static const struct trans_ctl_table trans_net_netfilter_table[] = {
638 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, 634 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
639 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" }, 635 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
640 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" }, 636 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
@@ -671,12 +667,12 @@ static struct trans_ctl_table trans_net_netfilter_table[] = {
671 {} 667 {}
672}; 668};
673 669
674static struct trans_ctl_table trans_net_dccp_table[] = { 670static const struct trans_ctl_table trans_net_dccp_table[] = {
675 { NET_DCCP_DEFAULT, "default" }, 671 { NET_DCCP_DEFAULT, "default" },
676 {} 672 {}
677}; 673};
678 674
679static struct trans_ctl_table trans_net_irda_table[] = { 675static const struct trans_ctl_table trans_net_irda_table[] = {
680 { NET_IRDA_DISCOVERY, "discovery" }, 676 { NET_IRDA_DISCOVERY, "discovery" },
681 { NET_IRDA_DEVNAME, "devname" }, 677 { NET_IRDA_DEVNAME, "devname" },
682 { NET_IRDA_DEBUG, "debug" }, 678 { NET_IRDA_DEBUG, "debug" },
@@ -694,7 +690,7 @@ static struct trans_ctl_table trans_net_irda_table[] = {
694 {} 690 {}
695}; 691};
696 692
697static struct trans_ctl_table trans_net_table[] = { 693static const struct trans_ctl_table trans_net_table[] = {
698 { NET_CORE, "core", trans_net_core_table }, 694 { NET_CORE, "core", trans_net_core_table },
699 /* NET_ETHER not used */ 695 /* NET_ETHER not used */
700 /* NET_802 not used */ 696 /* NET_802 not used */
@@ -720,7 +716,7 @@ static struct trans_ctl_table trans_net_table[] = {
720 {} 716 {}
721}; 717};
722 718
723static struct trans_ctl_table trans_fs_quota_table[] = { 719static const struct trans_ctl_table trans_fs_quota_table[] = {
724 { FS_DQ_LOOKUPS, "lookups" }, 720 { FS_DQ_LOOKUPS, "lookups" },
725 { FS_DQ_DROPS, "drops" }, 721 { FS_DQ_DROPS, "drops" },
726 { FS_DQ_READS, "reads" }, 722 { FS_DQ_READS, "reads" },
@@ -733,7 +729,7 @@ static struct trans_ctl_table trans_fs_quota_table[] = {
733 {} 729 {}
734}; 730};
735 731
736static struct trans_ctl_table trans_fs_xfs_table[] = { 732static const struct trans_ctl_table trans_fs_xfs_table[] = {
737 { XFS_RESTRICT_CHOWN, "restrict_chown" }, 733 { XFS_RESTRICT_CHOWN, "restrict_chown" },
738 { XFS_SGID_INHERIT, "irix_sgid_inherit" }, 734 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
739 { XFS_SYMLINK_MODE, "irix_symlink_mode" }, 735 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
@@ -754,24 +750,24 @@ static struct trans_ctl_table trans_fs_xfs_table[] = {
754 {} 750 {}
755}; 751};
756 752
757static struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { 753static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
758 { 1, "hb_ctl_path" }, 754 { 1, "hb_ctl_path" },
759 {} 755 {}
760}; 756};
761 757
762static struct trans_ctl_table trans_fs_ocfs2_table[] = { 758static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
763 { 1, "nm", trans_fs_ocfs2_nm_table }, 759 { 1, "nm", trans_fs_ocfs2_nm_table },
764 {} 760 {}
765}; 761};
766 762
767static struct trans_ctl_table trans_inotify_table[] = { 763static const struct trans_ctl_table trans_inotify_table[] = {
768 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, 764 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
769 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, 765 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
770 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, 766 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
771 {} 767 {}
772}; 768};
773 769
774static struct trans_ctl_table trans_fs_table[] = { 770static const struct trans_ctl_table trans_fs_table[] = {
775 { FS_NRINODE, "inode-nr" }, 771 { FS_NRINODE, "inode-nr" },
776 { FS_STATINODE, "inode-state" }, 772 { FS_STATINODE, "inode-state" },
777 /* FS_MAXINODE unused */ 773 /* FS_MAXINODE unused */
@@ -797,11 +793,11 @@ static struct trans_ctl_table trans_fs_table[] = {
797 {} 793 {}
798}; 794};
799 795
800static struct trans_ctl_table trans_debug_table[] = { 796static const struct trans_ctl_table trans_debug_table[] = {
801 {} 797 {}
802}; 798};
803 799
804static struct trans_ctl_table trans_cdrom_table[] = { 800static const struct trans_ctl_table trans_cdrom_table[] = {
805 { DEV_CDROM_INFO, "info" }, 801 { DEV_CDROM_INFO, "info" },
806 { DEV_CDROM_AUTOCLOSE, "autoclose" }, 802 { DEV_CDROM_AUTOCLOSE, "autoclose" },
807 { DEV_CDROM_AUTOEJECT, "autoeject" }, 803 { DEV_CDROM_AUTOEJECT, "autoeject" },
@@ -811,12 +807,12 @@ static struct trans_ctl_table trans_cdrom_table[] = {
811 {} 807 {}
812}; 808};
813 809
814static struct trans_ctl_table trans_ipmi_table[] = { 810static const struct trans_ctl_table trans_ipmi_table[] = {
815 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, 811 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
816 {} 812 {}
817}; 813};
818 814
819static struct trans_ctl_table trans_mac_hid_files[] = { 815static const struct trans_ctl_table trans_mac_hid_files[] = {
820 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ 816 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
821 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ 817 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
822 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, 818 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
@@ -826,35 +822,35 @@ static struct trans_ctl_table trans_mac_hid_files[] = {
826 {} 822 {}
827}; 823};
828 824
829static struct trans_ctl_table trans_raid_table[] = { 825static const struct trans_ctl_table trans_raid_table[] = {
830 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, 826 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
831 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, 827 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
832 {} 828 {}
833}; 829};
834 830
835static struct trans_ctl_table trans_scsi_table[] = { 831static const struct trans_ctl_table trans_scsi_table[] = {
836 { DEV_SCSI_LOGGING_LEVEL, "logging_level" }, 832 { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
837 {} 833 {}
838}; 834};
839 835
840static struct trans_ctl_table trans_parport_default_table[] = { 836static const struct trans_ctl_table trans_parport_default_table[] = {
841 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" }, 837 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
842 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" }, 838 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
843 {} 839 {}
844}; 840};
845 841
846static struct trans_ctl_table trans_parport_device_table[] = { 842static const struct trans_ctl_table trans_parport_device_table[] = {
847 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" }, 843 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
848 {} 844 {}
849}; 845};
850 846
851static struct trans_ctl_table trans_parport_devices_table[] = { 847static const struct trans_ctl_table trans_parport_devices_table[] = {
852 { DEV_PARPORT_DEVICES_ACTIVE, "active" }, 848 { DEV_PARPORT_DEVICES_ACTIVE, "active" },
853 { 0, NULL, trans_parport_device_table }, 849 { 0, NULL, trans_parport_device_table },
854 {} 850 {}
855}; 851};
856 852
857static struct trans_ctl_table trans_parport_parport_table[] = { 853static const struct trans_ctl_table trans_parport_parport_table[] = {
858 { DEV_PARPORT_SPINTIME, "spintime" }, 854 { DEV_PARPORT_SPINTIME, "spintime" },
859 { DEV_PARPORT_BASE_ADDR, "base-addr" }, 855 { DEV_PARPORT_BASE_ADDR, "base-addr" },
860 { DEV_PARPORT_IRQ, "irq" }, 856 { DEV_PARPORT_IRQ, "irq" },
@@ -868,13 +864,13 @@ static struct trans_ctl_table trans_parport_parport_table[] = {
868 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" }, 864 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
869 {} 865 {}
870}; 866};
871static struct trans_ctl_table trans_parport_table[] = { 867static const struct trans_ctl_table trans_parport_table[] = {
872 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table }, 868 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
873 { 0, NULL, trans_parport_parport_table }, 869 { 0, NULL, trans_parport_parport_table },
874 {} 870 {}
875}; 871};
876 872
877static struct trans_ctl_table trans_dev_table[] = { 873static const struct trans_ctl_table trans_dev_table[] = {
878 { DEV_CDROM, "cdrom", trans_cdrom_table }, 874 { DEV_CDROM, "cdrom", trans_cdrom_table },
879 /* DEV_HWMON unused */ 875 /* DEV_HWMON unused */
880 { DEV_PARPORT, "parport", trans_parport_table }, 876 { DEV_PARPORT, "parport", trans_parport_table },
@@ -885,19 +881,19 @@ static struct trans_ctl_table trans_dev_table[] = {
885 {} 881 {}
886}; 882};
887 883
888static struct trans_ctl_table trans_bus_isa_table[] = { 884static const struct trans_ctl_table trans_bus_isa_table[] = {
889 { BUS_ISA_MEM_BASE, "membase" }, 885 { BUS_ISA_MEM_BASE, "membase" },
890 { BUS_ISA_PORT_BASE, "portbase" }, 886 { BUS_ISA_PORT_BASE, "portbase" },
891 { BUS_ISA_PORT_SHIFT, "portshift" }, 887 { BUS_ISA_PORT_SHIFT, "portshift" },
892 {} 888 {}
893}; 889};
894 890
895static struct trans_ctl_table trans_bus_table[] = { 891static const struct trans_ctl_table trans_bus_table[] = {
896 { CTL_BUS_ISA, "isa", trans_bus_isa_table }, 892 { CTL_BUS_ISA, "isa", trans_bus_isa_table },
897 {} 893 {}
898}; 894};
899 895
900static struct trans_ctl_table trans_arlan_conf_table0[] = { 896static const struct trans_ctl_table trans_arlan_conf_table0[] = {
901 { 1, "spreadingCode" }, 897 { 1, "spreadingCode" },
902 { 2, "channelNumber" }, 898 { 2, "channelNumber" },
903 { 3, "scramblingDisable" }, 899 { 3, "scramblingDisable" },
@@ -968,7 +964,7 @@ static struct trans_ctl_table trans_arlan_conf_table0[] = {
968 {} 964 {}
969}; 965};
970 966
971static struct trans_ctl_table trans_arlan_conf_table1[] = { 967static const struct trans_ctl_table trans_arlan_conf_table1[] = {
972 { 1, "spreadingCode" }, 968 { 1, "spreadingCode" },
973 { 2, "channelNumber" }, 969 { 2, "channelNumber" },
974 { 3, "scramblingDisable" }, 970 { 3, "scramblingDisable" },
@@ -1039,7 +1035,7 @@ static struct trans_ctl_table trans_arlan_conf_table1[] = {
1039 {} 1035 {}
1040}; 1036};
1041 1037
1042static struct trans_ctl_table trans_arlan_conf_table2[] = { 1038static const struct trans_ctl_table trans_arlan_conf_table2[] = {
1043 { 1, "spreadingCode" }, 1039 { 1, "spreadingCode" },
1044 { 2, "channelNumber" }, 1040 { 2, "channelNumber" },
1045 { 3, "scramblingDisable" }, 1041 { 3, "scramblingDisable" },
@@ -1110,7 +1106,7 @@ static struct trans_ctl_table trans_arlan_conf_table2[] = {
1110 {} 1106 {}
1111}; 1107};
1112 1108
1113static struct trans_ctl_table trans_arlan_conf_table3[] = { 1109static const struct trans_ctl_table trans_arlan_conf_table3[] = {
1114 { 1, "spreadingCode" }, 1110 { 1, "spreadingCode" },
1115 { 2, "channelNumber" }, 1111 { 2, "channelNumber" },
1116 { 3, "scramblingDisable" }, 1112 { 3, "scramblingDisable" },
@@ -1181,7 +1177,7 @@ static struct trans_ctl_table trans_arlan_conf_table3[] = {
1181 {} 1177 {}
1182}; 1178};
1183 1179
1184static struct trans_ctl_table trans_arlan_table[] = { 1180static const struct trans_ctl_table trans_arlan_table[] = {
1185 { 1, "arlan0", trans_arlan_conf_table0 }, 1181 { 1, "arlan0", trans_arlan_conf_table0 },
1186 { 2, "arlan1", trans_arlan_conf_table1 }, 1182 { 2, "arlan1", trans_arlan_conf_table1 },
1187 { 3, "arlan2", trans_arlan_conf_table2 }, 1183 { 3, "arlan2", trans_arlan_conf_table2 },
@@ -1189,13 +1185,13 @@ static struct trans_ctl_table trans_arlan_table[] = {
1189 {} 1185 {}
1190}; 1186};
1191 1187
1192static struct trans_ctl_table trans_s390dbf_table[] = { 1188static const struct trans_ctl_table trans_s390dbf_table[] = {
1193 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, 1189 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1194 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, 1190 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
1195 {} 1191 {}
1196}; 1192};
1197 1193
1198static struct trans_ctl_table trans_sunrpc_table[] = { 1194static const struct trans_ctl_table trans_sunrpc_table[] = {
1199 { CTL_RPCDEBUG, "rpc_debug" }, 1195 { CTL_RPCDEBUG, "rpc_debug" },
1200 { CTL_NFSDEBUG, "nfs_debug" }, 1196 { CTL_NFSDEBUG, "nfs_debug" },
1201 { CTL_NFSDDEBUG, "nfsd_debug" }, 1197 { CTL_NFSDDEBUG, "nfsd_debug" },
@@ -1207,7 +1203,7 @@ static struct trans_ctl_table trans_sunrpc_table[] = {
1207 {} 1203 {}
1208}; 1204};
1209 1205
1210static struct trans_ctl_table trans_pm_table[] = { 1206static const struct trans_ctl_table trans_pm_table[] = {
1211 { 1 /* CTL_PM_SUSPEND */, "suspend" }, 1207 { 1 /* CTL_PM_SUSPEND */, "suspend" },
1212 { 2 /* CTL_PM_CMODE */, "cmode" }, 1208 { 2 /* CTL_PM_CMODE */, "cmode" },
1213 { 3 /* CTL_PM_P0 */, "p0" }, 1209 { 3 /* CTL_PM_P0 */, "p0" },
@@ -1215,13 +1211,13 @@ static struct trans_ctl_table trans_pm_table[] = {
1215 {} 1211 {}
1216}; 1212};
1217 1213
1218static struct trans_ctl_table trans_frv_table[] = { 1214static const struct trans_ctl_table trans_frv_table[] = {
1219 { 1, "cache-mode" }, 1215 { 1, "cache-mode" },
1220 { 2, "pin-cxnr" }, 1216 { 2, "pin-cxnr" },
1221 {} 1217 {}
1222}; 1218};
1223 1219
1224static struct trans_ctl_table trans_root_table[] = { 1220static const struct trans_ctl_table trans_root_table[] = {
1225 { CTL_KERN, "kernel", trans_kern_table }, 1221 { CTL_KERN, "kernel", trans_kern_table },
1226 { CTL_VM, "vm", trans_vm_table }, 1222 { CTL_VM, "vm", trans_vm_table },
1227 { CTL_NET, "net", trans_net_table }, 1223 { CTL_NET, "net", trans_net_table },
@@ -1265,15 +1261,14 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
1265 return table; 1261 return table;
1266} 1262}
1267 1263
1268static struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) 1264static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
1269{ 1265{
1270 struct ctl_table *test; 1266 struct ctl_table *test;
1271 struct trans_ctl_table *ref; 1267 const struct trans_ctl_table *ref;
1272 int depth, cur_depth; 1268 int cur_depth;
1273 1269
1274 depth = sysctl_depth(table); 1270 cur_depth = sysctl_depth(table);
1275 1271
1276 cur_depth = depth;
1277 ref = trans_root_table; 1272 ref = trans_root_table;
1278repeat: 1273repeat:
1279 test = sysctl_parent(table, cur_depth); 1274 test = sysctl_parent(table, cur_depth);
@@ -1441,7 +1436,7 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
1441 1436
1442static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) 1437static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1443{ 1438{
1444 struct trans_ctl_table *ref; 1439 const struct trans_ctl_table *ref;
1445 1440
1446 ref = sysctl_binary_lookup(table); 1441 ref = sysctl_binary_lookup(table);
1447 if (table->ctl_name && !ref) 1442 if (table->ctl_name && !ref)
@@ -1498,9 +1493,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1498 (table->strategy == sysctl_ms_jiffies) || 1493 (table->strategy == sysctl_ms_jiffies) ||
1499 (table->proc_handler == proc_dostring) || 1494 (table->proc_handler == proc_dostring) ||
1500 (table->proc_handler == proc_dointvec) || 1495 (table->proc_handler == proc_dointvec) ||
1501#ifdef CONFIG_SECURITY_CAPABILITIES
1502 (table->proc_handler == proc_dointvec_bset) ||
1503#endif /* def CONFIG_SECURITY_CAPABILITIES */
1504 (table->proc_handler == proc_dointvec_minmax) || 1496 (table->proc_handler == proc_dointvec_minmax) ||
1505 (table->proc_handler == proc_dointvec_jiffies) || 1497 (table->proc_handler == proc_dointvec_jiffies) ||
1506 (table->proc_handler == proc_dointvec_userhz_jiffies) || 1498 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 88cdb109e13c..06b6395b45b2 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -135,6 +135,12 @@ static int test_jprobe(void)
135#ifdef CONFIG_KRETPROBES 135#ifdef CONFIG_KRETPROBES
136static u32 krph_val; 136static u32 krph_val;
137 137
138static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
139{
140 krph_val = (rand1 / div_factor);
141 return 0;
142}
143
138static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) 144static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
139{ 145{
140 unsigned long ret = regs_return_value(regs); 146 unsigned long ret = regs_return_value(regs);
@@ -144,13 +150,19 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
144 printk(KERN_ERR "Kprobe smoke test failed: " 150 printk(KERN_ERR "Kprobe smoke test failed: "
145 "incorrect value in kretprobe handler\n"); 151 "incorrect value in kretprobe handler\n");
146 } 152 }
153 if (krph_val == 0) {
154 handler_errors++;
155 printk(KERN_ERR "Kprobe smoke test failed: "
156 "call to kretprobe entry handler failed\n");
157 }
147 158
148 krph_val = (rand1 / div_factor); 159 krph_val = rand1;
149 return 0; 160 return 0;
150} 161}
151 162
152static struct kretprobe rp = { 163static struct kretprobe rp = {
153 .handler = return_handler, 164 .handler = return_handler,
165 .entry_handler = entry_handler,
154 .kp.symbol_name = "kprobe_target" 166 .kp.symbol_name = "kprobe_target"
155}; 167};
156 168
@@ -167,7 +179,7 @@ static int test_kretprobe(void)
167 179
168 ret = kprobe_target(rand1); 180 ret = kprobe_target(rand1);
169 unregister_kretprobe(&rp); 181 unregister_kretprobe(&rp);
170 if (krph_val == 0) { 182 if (krph_val != rand1) {
171 printk(KERN_ERR "Kprobe smoke test failed: " 183 printk(KERN_ERR "Kprobe smoke test failed: "
172 "kretprobe handler not called\n"); 184 "kretprobe handler not called\n");
173 handler_errors++; 185 handler_errors++;
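
The updated smoke test above exercises the new kretprobe entry_handler hook: it runs when the probed function is entered, while .handler still runs on return, so per-call state can be handed from entry to exit. A kernel-module-context sketch of registering such a pair, using the per-instance data area from the same series (symbol name and timing printout are purely illustrative):

	#include <linux/kprobes.h>
	#include <linux/ktime.h>

	static int my_entry(struct kretprobe_instance *ri, struct pt_regs *regs)
	{
		*(ktime_t *)ri->data = ktime_get();	/* stash the entry timestamp */
		return 0;				/* 0 = track this instance */
	}

	static int my_ret(struct kretprobe_instance *ri, struct pt_regs *regs)
	{
		ktime_t delta = ktime_sub(ktime_get(), *(ktime_t *)ri->data);

		printk(KERN_INFO "probed call took %lld ns\n",
		       (long long)ktime_to_ns(delta));
		return 0;
	}

	static struct kretprobe my_rp = {
		.entry_handler	= my_entry,
		.handler	= my_ret,
		.data_size	= sizeof(ktime_t),	/* per-instance scratch space */
		.kp.symbol_name	= "do_fork",		/* illustrative target */
	};

	/* register_kretprobe(&my_rp) in module init, unregister_kretprobe(&my_rp) on exit. */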
diff --git a/kernel/time.c b/kernel/time.c
index 4064c0566e77..a5ec013b6c80 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -39,6 +39,8 @@
39#include <asm/uaccess.h> 39#include <asm/uaccess.h>
40#include <asm/unistd.h> 40#include <asm/unistd.h>
41 41
42#include "timeconst.h"
43
42/* 44/*
43 * The timezone where the local system is located. Used as a default by some 45 * The timezone where the local system is located. Used as a default by some
44 * programs who obtain this value by using gettimeofday. 46 * programs who obtain this value by using gettimeofday.
@@ -93,7 +95,8 @@ asmlinkage long sys_stime(time_t __user *tptr)
93 95
94#endif /* __ARCH_WANT_SYS_TIME */ 96#endif /* __ARCH_WANT_SYS_TIME */
95 97
96asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) 98asmlinkage long sys_gettimeofday(struct timeval __user *tv,
99 struct timezone __user *tz)
97{ 100{
98 if (likely(tv != NULL)) { 101 if (likely(tv != NULL)) {
99 struct timeval ktv; 102 struct timeval ktv;
@@ -118,7 +121,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us
118 * hard to make the program warp the clock precisely n hours) or 121 * hard to make the program warp the clock precisely n hours) or
119 * compile in the timezone information into the kernel. Bad, bad.... 122 * compile in the timezone information into the kernel. Bad, bad....
120 * 123 *
121 * - TYT, 1992-01-01 124 * - TYT, 1992-01-01
122 * 125 *
123 * The best thing to do is to keep the CMOS clock in universal time (UTC) 126 * The best thing to do is to keep the CMOS clock in universal time (UTC)
124 * as real UNIX machines always do it. This avoids all headaches about 127 * as real UNIX machines always do it. This avoids all headaches about
@@ -240,7 +243,11 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
240#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) 243#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
241 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); 244 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
242#else 245#else
243 return (j * MSEC_PER_SEC) / HZ; 246# if BITS_PER_LONG == 32
247 return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
248# else
249 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
250# endif
244#endif 251#endif
245} 252}
246EXPORT_SYMBOL(jiffies_to_msecs); 253EXPORT_SYMBOL(jiffies_to_msecs);
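
The generic jiffies_to_msecs() branch above replaces the 32-bit multiply-then-divide with constants generated into timeconst.h: a scaled reciprocal of HZ, so that (MUL32 * j) >> SHR32 reproduces j * MSEC_PER_SEC / HZ without a runtime division. A small stand-alone check of the idea with hand-picked constants for HZ = 250 (chosen so the arithmetic is exact and easy to follow; these are not the values timeconst.pl emits):

	#include <stdio.h>
	#include <stdint.h>

	#define HZ		250
	#define MSEC_PER_SEC	1000

	/* Illustrative constants: 1000/250 == 4 exactly, so MUL32 is 4 in Q29. */
	#define HZ_TO_MSEC_SHR32	29
	#define HZ_TO_MSEC_MUL32	((uint64_t)4 << HZ_TO_MSEC_SHR32)

	int main(void)
	{
		uint32_t j;

		for (j = 0; j < 1000000; j++) {
			uint32_t fast = (uint32_t)((HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32);
			uint32_t slow = (uint32_t)(((uint64_t)j * MSEC_PER_SEC) / HZ);

			if (fast != slow) {
				printf("mismatch at j=%u: %u vs %u\n", j, fast, slow);
				return 1;
			}
		}
		printf("multiply+shift matches the divide over the tested range\n");
		return 0;
	}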
@@ -252,7 +259,11 @@ unsigned int inline jiffies_to_usecs(const unsigned long j)
252#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) 259#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
253 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); 260 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
254#else 261#else
255 return (j * USEC_PER_SEC) / HZ; 262# if BITS_PER_LONG == 32
263 return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
264# else
265 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
266# endif
256#endif 267#endif
257} 268}
258EXPORT_SYMBOL(jiffies_to_usecs); 269EXPORT_SYMBOL(jiffies_to_usecs);
@@ -267,7 +278,7 @@ EXPORT_SYMBOL(jiffies_to_usecs);
267 * 278 *
268 * This function should be only used for timestamps returned by 279 * This function should be only used for timestamps returned by
269 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because 280 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
270 * it doesn't handle the better resolution of the later. 281 * it doesn't handle the better resolution of the latter.
271 */ 282 */
272struct timespec timespec_trunc(struct timespec t, unsigned gran) 283struct timespec timespec_trunc(struct timespec t, unsigned gran)
273{ 284{
@@ -315,7 +326,7 @@ EXPORT_SYMBOL_GPL(getnstimeofday);
315 * This algorithm was first published by Gauss (I think). 326 * This algorithm was first published by Gauss (I think).
316 * 327 *
317 * WARNING: this function will overflow on 2106-02-07 06:28:16 on 328 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
318 * machines were long is 32-bit! (However, as time_t is signed, we 329 * machines where long is 32-bit! (However, as time_t is signed, we
319 * will already get problems at other places on 2038-01-19 03:14:08) 330 * will already get problems at other places on 2038-01-19 03:14:08)
320 */ 331 */
321unsigned long 332unsigned long
@@ -352,7 +363,7 @@ EXPORT_SYMBOL(mktime);
352 * normalize to the timespec storage format 363 * normalize to the timespec storage format
353 * 364 *
354 * Note: The tv_nsec part is always in the range of 365 * Note: The tv_nsec part is always in the range of
355 * 0 <= tv_nsec < NSEC_PER_SEC 366 * 0 <= tv_nsec < NSEC_PER_SEC
356 * For negative values only the tv_sec field is negative ! 367 * For negative values only the tv_sec field is negative !
357 */ 368 */
358void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 369void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
@@ -453,12 +464,13 @@ unsigned long msecs_to_jiffies(const unsigned int m)
453 /* 464 /*
454 * Generic case - multiply, round and divide. But first 465 * Generic case - multiply, round and divide. But first
455 * check that if we are doing a net multiplication, that 466 * check that if we are doing a net multiplication, that
456 * we wouldnt overflow: 467 * we wouldn't overflow:
457 */ 468 */
458 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) 469 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
459 return MAX_JIFFY_OFFSET; 470 return MAX_JIFFY_OFFSET;
460 471
461 return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; 472 return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
473 >> MSEC_TO_HZ_SHR32;
462#endif 474#endif
463} 475}
464EXPORT_SYMBOL(msecs_to_jiffies); 476EXPORT_SYMBOL(msecs_to_jiffies);
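
The converse conversion above adds an ADJ32 term before the shift so the result rounds up, matching the old (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC: with HZ = 250, one millisecond must still map to one jiffy even though 1 * 250 / 1000 truncates to zero. A tiny check of that rounding behaviour with illustrative fixed-point constants (again not the generated values):

	#include <assert.h>
	#include <stdint.h>

	/* Illustrative constants for HZ = 250 (1 jiffy = 4 ms):
	 * MUL32 encodes 1/4 in Q30, ADJ32 encodes +3/4 so the shift rounds up. */
	#define MSEC_TO_HZ_SHR32	30
	#define MSEC_TO_HZ_MUL32	((uint64_t)1 << (MSEC_TO_HZ_SHR32 - 2))
	#define MSEC_TO_HZ_ADJ32	(((uint64_t)3 << MSEC_TO_HZ_SHR32) / 4)

	int main(void)
	{
		uint32_t m;

		for (m = 0; m < 100000; m++) {
			uint32_t fast = (uint32_t)((MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
						   >> MSEC_TO_HZ_SHR32);
			uint32_t slow = (m * 250u + 999u) / 1000u;	/* old round-up divide */

			assert(fast == slow);
		}
		return 0;
	}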
@@ -472,7 +484,8 @@ unsigned long usecs_to_jiffies(const unsigned int u)
472#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) 484#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
473 return u * (HZ / USEC_PER_SEC); 485 return u * (HZ / USEC_PER_SEC);
474#else 486#else
475 return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; 487 return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
488 >> USEC_TO_HZ_SHR32;
476#endif 489#endif
477} 490}
478EXPORT_SYMBOL(usecs_to_jiffies); 491EXPORT_SYMBOL(usecs_to_jiffies);
@@ -566,7 +579,11 @@ EXPORT_SYMBOL(jiffies_to_timeval);
566clock_t jiffies_to_clock_t(long x) 579clock_t jiffies_to_clock_t(long x)
567{ 580{
568#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 581#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
582# if HZ < USER_HZ
583 return x * (USER_HZ / HZ);
584# else
569 return x / (HZ / USER_HZ); 585 return x / (HZ / USER_HZ);
586# endif
570#else 587#else
571 u64 tmp = (u64)x * TICK_NSEC; 588 u64 tmp = (u64)x * TICK_NSEC;
572 do_div(tmp, (NSEC_PER_SEC / USER_HZ)); 589 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
@@ -599,7 +616,14 @@ EXPORT_SYMBOL(clock_t_to_jiffies);
599u64 jiffies_64_to_clock_t(u64 x) 616u64 jiffies_64_to_clock_t(u64 x)
600{ 617{
601#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 618#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
619# if HZ < USER_HZ
620 x *= USER_HZ;
621 do_div(x, HZ);
622# elif HZ > USER_HZ
602 do_div(x, HZ / USER_HZ); 623 do_div(x, HZ / USER_HZ);
624# else
625 /* Nothing to do */
626# endif
603#else 627#else
604 /* 628 /*
605 * There are better ways that don't overflow early, 629 * There are better ways that don't overflow early,
@@ -611,7 +635,6 @@ u64 jiffies_64_to_clock_t(u64 x)
611#endif 635#endif
612 return x; 636 return x;
613} 637}
614
615EXPORT_SYMBOL(jiffies_64_to_clock_t); 638EXPORT_SYMBOL(jiffies_64_to_clock_t);
616 639
617u64 nsec_to_clock_t(u64 x) 640u64 nsec_to_clock_t(u64 x)
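The two clock_t hunks above add an explicit HZ < USER_HZ branch. With integer macros, the old x / (HZ / USER_HZ) truncates HZ / USER_HZ to zero whenever HZ is smaller than USER_HZ, so the conversion has to multiply by USER_HZ / HZ instead. A small compile-time sketch of the branch selection; HZ == 25 is an assumed example value, USER_HZ == 100 is the conventional one:

#include <stdio.h>

#define HZ      25	/* assumed example; real kernels set this via Kconfig */
#define USER_HZ 100

/* Model of jiffies_to_clock_t() for the exact-ratio case only. */
static long jiffies_to_clock_t_model(long x)
{
#if HZ < USER_HZ
	return x * (USER_HZ / HZ);	/* new branch: scale up */
#else
	return x / (HZ / USER_HZ);	/* old path: scale down */
#endif
}

int main(void)
{
	/* 25 jiffies at HZ == 25 is one second, i.e. 100 USER_HZ ticks. */
	printf("%ld\n", jiffies_to_clock_t_model(25));	/* prints 100 */
	return 0;
}

The model deliberately omits the outer TICK_NSEC test that chooses between this exact-ratio path and the do_div() fallback.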
@@ -646,7 +669,6 @@ u64 get_jiffies_64(void)
646 } while (read_seqretry(&xtime_lock, seq)); 669 } while (read_seqretry(&xtime_lock, seq));
647 return ret; 670 return ret;
648} 671}
649
650EXPORT_SYMBOL(get_jiffies_64); 672EXPORT_SYMBOL(get_jiffies_64);
651#endif 673#endif
652 674
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3e59fce6dd43..3d1e3e1a1971 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -133,7 +133,7 @@ static void clockevents_do_notify(unsigned long reason, void *dev)
133} 133}
134 134
135/* 135/*
136 * Called after a notify add to make devices availble which were 136 * Called after a notify add to make devices available which were
137 * released from the notifier call. 137 * released from the notifier call.
138 */ 138 */
139static void clockevents_notify_released(void) 139static void clockevents_notify_released(void)
@@ -218,6 +218,8 @@ void clockevents_exchange_device(struct clock_event_device *old,
218 */ 218 */
219void clockevents_notify(unsigned long reason, void *arg) 219void clockevents_notify(unsigned long reason, void *arg)
220{ 220{
221 struct list_head *node, *tmp;
222
221 spin_lock(&clockevents_lock); 223 spin_lock(&clockevents_lock);
222 clockevents_do_notify(reason, arg); 224 clockevents_do_notify(reason, arg);
223 225
@@ -227,13 +229,8 @@ void clockevents_notify(unsigned long reason, void *arg)
227 * Unregister the clock event devices which were 229 * Unregister the clock event devices which were
228 * released from the users in the notify chain. 230 * released from the users in the notify chain.
229 */ 231 */
230 while (!list_empty(&clockevents_released)) { 232 list_for_each_safe(node, tmp, &clockevents_released)
231 struct clock_event_device *dev; 233 list_del(node);
232
233 dev = list_entry(clockevents_released.next,
234 struct clock_event_device, list);
235 list_del(&dev->list);
236 }
237 break; 234 break;
238 default: 235 default:
239 break; 236 break;
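clockevents_notify() now drains clockevents_released with list_for_each_safe(), which caches the next pointer so the current node can be unlinked without invalidating the iterator. A user-space model of that idiom is sketched below; the list primitives are hand-rolled stand-ins for <linux/list.h>, kept just close enough to show why the _safe variant is required when deleting while iterating.

#include <stdio.h>

/* Minimal stand-ins for the kernel's list primitives (illustration only). */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	entry->next = entry->prev = NULL;	/* kernel poisons these */
}

#define list_for_each_safe(pos, n, head) \
	for (pos = (head)->next, n = pos->next; pos != (head); \
	     pos = n, n = pos->next)

struct item { struct list_head list; int id; };

int main(void)
{
	struct list_head released = LIST_HEAD_INIT(released);
	struct list_head *node, *tmp;
	struct item items[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };

	for (int i = 0; i < 3; i++)
		list_add_tail(&items[i].list, &released);

	/* Same shape as the drain loop in clockevents_notify() above. */
	list_for_each_safe(node, tmp, &released)
		list_del(node);

	printf("list empty: %s\n", released.next == &released ? "yes" : "no");
	return 0;
}

With a plain iterator the loop would advance through node->next after list_del() has already rewritten it; the lookahead pointer avoids that.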
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6e9259a5d501..548c436a776b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -91,7 +91,6 @@ static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
91 cs->name, delta); 91 cs->name, delta);
92 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 92 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
93 clocksource_change_rating(cs, 0); 93 clocksource_change_rating(cs, 0);
94 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
95 list_del(&cs->wd_list); 94 list_del(&cs->wd_list);
96} 95}
97 96
@@ -363,15 +362,13 @@ void clocksource_unregister(struct clocksource *cs)
363static ssize_t 362static ssize_t
364sysfs_show_current_clocksources(struct sys_device *dev, char *buf) 363sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
365{ 364{
366 char *curr = buf; 365 ssize_t count = 0;
367 366
368 spin_lock_irq(&clocksource_lock); 367 spin_lock_irq(&clocksource_lock);
369 curr += sprintf(curr, "%s ", curr_clocksource->name); 368 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
370 spin_unlock_irq(&clocksource_lock); 369 spin_unlock_irq(&clocksource_lock);
371 370
372 curr += sprintf(curr, "\n"); 371 return count;
373
374 return curr - buf;
375} 372}
376 373
377/** 374/**
@@ -439,17 +436,20 @@ static ssize_t
439sysfs_show_available_clocksources(struct sys_device *dev, char *buf) 436sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
440{ 437{
441 struct clocksource *src; 438 struct clocksource *src;
442 char *curr = buf; 439 ssize_t count = 0;
443 440
444 spin_lock_irq(&clocksource_lock); 441 spin_lock_irq(&clocksource_lock);
445 list_for_each_entry(src, &clocksource_list, list) { 442 list_for_each_entry(src, &clocksource_list, list) {
446 curr += sprintf(curr, "%s ", src->name); 443 count += snprintf(buf + count,
444 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
445 "%s ", src->name);
447 } 446 }
448 spin_unlock_irq(&clocksource_lock); 447 spin_unlock_irq(&clocksource_lock);
449 448
450 curr += sprintf(curr, "\n"); 449 count += snprintf(buf + count,
450 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
451 451
452 return curr - buf; 452 return count;
453} 453}
454 454
455/* 455/*
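Both sysfs show routines above switch from unbounded sprintf() into the caller's page to snprintf() capped at PAGE_SIZE, clamping the remaining space to zero so a long list of registered clocksources can no longer overrun the buffer. A user-space sketch of the same accumulation pattern; PAGE_SIZE, the max() stand-in and the clocksource names are all assumptions of the example:

#include <stdio.h>
#include <sys/types.h>

#define PAGE_SIZE 4096	/* stand-in for the kernel constant */

/* local stand-in for the kernel's type-checked max() */
#define MAX_SSIZE(a, b) ((a) > (b) ? (a) : (b))

/* Append each name, never writing past buf[PAGE_SIZE - 1]. */
static ssize_t show_available(char *buf, const char *names[], int n)
{
	ssize_t count = 0;

	for (int i = 0; i < n; i++)
		count += snprintf(buf + count,
				  MAX_SSIZE((ssize_t)PAGE_SIZE - count,
					    (ssize_t)0),
				  "%s ", names[i]);
	count += snprintf(buf + count,
			  MAX_SSIZE((ssize_t)PAGE_SIZE - count, (ssize_t)0),
			  "\n");
	return count;
}

int main(void)
{
	const char *names[] = { "tsc", "hpet", "acpi_pm" };	/* made up */
	char buf[PAGE_SIZE];

	show_available(buf, names, 3);
	fputs(buf, stdout);
	return 0;
}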
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index e64efaf957e8..c88b5910e7ab 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -43,10 +43,6 @@ long time_freq; /* frequency offset (scaled ppm)*/
43static long time_reftime; /* time at last adjustment (s) */ 43static long time_reftime; /* time at last adjustment (s) */
44long time_adjust; 44long time_adjust;
45 45
46#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE)
47#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \
48 (s64)CLOCK_TICK_RATE)
49
50static void ntp_update_frequency(void) 46static void ntp_update_frequency(void)
51{ 47{
52 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 48 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 88267f0a8471..fa9bb73dbdb4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -681,7 +681,7 @@ int tick_check_oneshot_change(int allow_nohz)
681 if (ts->nohz_mode != NOHZ_MODE_INACTIVE) 681 if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
682 return 0; 682 return 0;
683 683
684 if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) 684 if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
685 return 0; 685 return 0;
686 686
687 if (!allow_nohz) 687 if (!allow_nohz)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cd5dbc4579c9..1af9fb050fe2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -201,9 +201,9 @@ static inline s64 __get_nsec_offset(void) { return 0; }
201#endif 201#endif
202 202
203/** 203/**
204 * timekeeping_is_continuous - check to see if timekeeping is free running 204 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
205 */ 205 */
206int timekeeping_is_continuous(void) 206int timekeeping_valid_for_hres(void)
207{ 207{
208 unsigned long seq; 208 unsigned long seq;
209 int ret; 209 int ret;
@@ -364,7 +364,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
364 * with losing too many ticks, otherwise we would overadjust and 364 * with losing too many ticks, otherwise we would overadjust and
365 * produce an even larger error. The smaller the adjustment the 365 * produce an even larger error. The smaller the adjustment the
366 * faster we try to adjust for it, as lost ticks can do less harm 366 * faster we try to adjust for it, as lost ticks can do less harm
367 * here. This is tuned so that an error of about 1 msec is adusted 367 * here. This is tuned so that an error of about 1 msec is adjusted
368 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 368 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
369 */ 369 */
370 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); 370 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
new file mode 100644
index 000000000000..41468035473c
--- /dev/null
+++ b/kernel/timeconst.pl
@@ -0,0 +1,402 @@
1#!/usr/bin/perl
2# -----------------------------------------------------------------------
3#
4# Copyright 2007 rPath, Inc. - All Rights Reserved
5#
6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your
8# option) any later version; incorporated herein by reference.
9#
10# -----------------------------------------------------------------------
11#
12
13#
14# Usage: timeconst.pl HZ > timeconst.h
15#
16
17# Precomputed values for systems without Math::BigInt
18# Generated by:
19# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
20%canned_values = (
21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26,
23 '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58,
24 125,3,
25 '0xc49ba5e4','0x1fbe76c8b4',37,
26 '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69,
27 3,125,
28 '0xa2c2aaab','0xaaaa',16,
29 '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48,
30 125000,3,
31 '0xc9539b89','0x7fffbce4217d',47,
32 '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79,
33 3,125000,
34 ], 32 => [
35 '0xfa000000','0x6000000',27,
36 '0xfa00000000000000','0x600000000000000',59,
37 125,4,
38 '0x83126e98','0xfdf3b645a',36,
39 '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68,
40 4,125,
41 '0xf4240000','0x0',17,
42 '0xf424000000000000','0x0',49,
43 31250,1,
44 '0x8637bd06','0x3fff79c842fa',46,
45 '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78,
46 1,31250,
47 ], 48 => [
48 '0xa6aaaaab','0x6aaaaaa',27,
49 '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59,
50 125,6,
51 '0xc49ba5e4','0xfdf3b645a',36,
52 '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68,
53 6,125,
54 '0xa2c2aaab','0x15555',17,
55 '0xa2c2aaaaaaaaaaab','0x1555555555555',49,
56 62500,3,
57 '0xc9539b89','0x3fffbce4217d',46,
58 '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78,
59 3,62500,
60 ], 64 => [
61 '0xfa000000','0xe000000',28,
62 '0xfa00000000000000','0xe00000000000000',60,
63 125,8,
64 '0x83126e98','0x7ef9db22d',35,
65 '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67,
66 8,125,
67 '0xf4240000','0x0',18,
68 '0xf424000000000000','0x0',50,
69 15625,1,
70 '0x8637bd06','0x1fff79c842fa',45,
71 '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77,
72 1,15625,
73 ], 100 => [
74 '0xa0000000','0x0',28,
75 '0xa000000000000000','0x0',60,
76 10,1,
77 '0xcccccccd','0x733333333',35,
78 '0xcccccccccccccccd','0x73333333333333333',67,
79 1,10,
80 '0x9c400000','0x0',18,
81 '0x9c40000000000000','0x0',50,
82 10000,1,
83 '0xd1b71759','0x1fff2e48e8a7',45,
84 '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77,
85 1,10000,
86 ], 122 => [
87 '0x8325c53f','0xfbcda3a',28,
88 '0x8325c53ef368eb05','0xfbcda3ac10c9714',60,
89 500,61,
90 '0xf9db22d1','0x7fbe76c8b',35,
91 '0xf9db22d0e560418a','0x7fbe76c8b43958106',67,
92 61,500,
93 '0x8012e2a0','0x3ef36',18,
94 '0x8012e29f79b47583','0x3ef368eb04325',50,
95 500000,61,
96 '0xffda4053','0x1ffffbce4217',45,
97 '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77,
98 61,500000,
99 ], 128 => [
100 '0xfa000000','0x1e000000',29,
101 '0xfa00000000000000','0x1e00000000000000',61,
102 125,16,
103 '0x83126e98','0x3f7ced916',34,
104 '0x83126e978d4fdf3c','0x3f7ced916872b020c',66,
105 16,125,
106 '0xf4240000','0x40000',19,
107 '0xf424000000000000','0x4000000000000',51,
108 15625,2,
109 '0x8637bd06','0xfffbce4217d',44,
110 '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76,
111 2,15625,
112 ], 200 => [
113 '0xa0000000','0x0',29,
114 '0xa000000000000000','0x0',61,
115 5,1,
116 '0xcccccccd','0x333333333',34,
117 '0xcccccccccccccccd','0x33333333333333333',66,
118 1,5,
119 '0x9c400000','0x0',19,
120 '0x9c40000000000000','0x0',51,
121 5000,1,
122 '0xd1b71759','0xfff2e48e8a7',44,
123 '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76,
124 1,5000,
125 ], 250 => [
126 '0x80000000','0x0',29,
127 '0x8000000000000000','0x0',61,
128 4,1,
129 '0x80000000','0x180000000',33,
130 '0x8000000000000000','0x18000000000000000',65,
131 1,4,
132 '0xfa000000','0x0',20,
133 '0xfa00000000000000','0x0',52,
134 4000,1,
135 '0x83126e98','0x7ff7ced9168',43,
136 '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75,
137 1,4000,
138 ], 256 => [
139 '0xfa000000','0x3e000000',30,
140 '0xfa00000000000000','0x3e00000000000000',62,
141 125,32,
142 '0x83126e98','0x1fbe76c8b',33,
143 '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65,
144 32,125,
145 '0xf4240000','0xc0000',20,
146 '0xf424000000000000','0xc000000000000',52,
147 15625,4,
148 '0x8637bd06','0x7ffde7210be',43,
149 '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75,
150 4,15625,
151 ], 300 => [
152 '0xd5555556','0x2aaaaaaa',30,
153 '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62,
154 10,3,
155 '0x9999999a','0x1cccccccc',33,
156 '0x999999999999999a','0x1cccccccccccccccc',65,
157 3,10,
158 '0xd0555556','0xaaaaa',20,
159 '0xd055555555555556','0xaaaaaaaaaaaaa',52,
160 10000,3,
161 '0x9d495183','0x7ffcb923a29',43,
162 '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75,
163 3,10000,
164 ], 512 => [
165 '0xfa000000','0x7e000000',31,
166 '0xfa00000000000000','0x7e00000000000000',63,
167 125,64,
168 '0x83126e98','0xfdf3b645',32,
169 '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64,
170 64,125,
171 '0xf4240000','0x1c0000',21,
172 '0xf424000000000000','0x1c000000000000',53,
173 15625,8,
174 '0x8637bd06','0x3ffef39085f',42,
175 '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74,
176 8,15625,
177 ], 1000 => [
178 '0x80000000','0x0',31,
179 '0x8000000000000000','0x0',63,
180 1,1,
181 '0x80000000','0x0',31,
182 '0x8000000000000000','0x0',63,
183 1,1,
184 '0xfa000000','0x0',22,
185 '0xfa00000000000000','0x0',54,
186 1000,1,
187 '0x83126e98','0x1ff7ced9168',41,
188 '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73,
189 1,1000,
190 ], 1024 => [
191 '0xfa000000','0xfe000000',32,
192 '0xfa00000000000000','0xfe00000000000000',64,
193 125,128,
194 '0x83126e98','0x7ef9db22',31,
195 '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63,
196 128,125,
197 '0xf4240000','0x3c0000',22,
198 '0xf424000000000000','0x3c000000000000',54,
199 15625,16,
200 '0x8637bd06','0x1fff79c842f',41,
201 '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73,
202 16,15625,
203 ], 1200 => [
204 '0xd5555556','0xd5555555',32,
205 '0xd555555555555556','0xd555555555555555',64,
206 5,6,
207 '0x9999999a','0x66666666',31,
208 '0x999999999999999a','0x6666666666666666',63,
209 6,5,
210 '0xd0555556','0x2aaaaa',22,
211 '0xd055555555555556','0x2aaaaaaaaaaaaa',54,
212 2500,3,
213 '0x9d495183','0x1ffcb923a29',41,
214 '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73,
215 3,2500,
216 ]
217);
218
219$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
220
221sub bint($)
222{
223 my($x) = @_;
224 return Math::BigInt->new($x);
225}
226
227#
228# Constants for division by reciprocal multiplication.
229# (bits, numerator, denominator)
230#
231sub fmul($$$)
232{
233 my ($b,$n,$d) = @_;
234
235 $n = bint($n);
236 $d = bint($d);
237
238 return scalar (($n << $b)+$d-bint(1))/$d;
239}
240
241sub fadj($$$)
242{
243 my($b,$n,$d) = @_;
244
245 $n = bint($n);
246 $d = bint($d);
247
248 $d = $d/bgcd($n, $d);
249 return scalar (($d-bint(1)) << $b)/$d;
250}
251
252sub fmuls($$$) {
253 my($b,$n,$d) = @_;
254 my($s,$m);
255 my($thres) = bint(1) << ($b-1);
256
257 $n = bint($n);
258 $d = bint($d);
259
260 for ($s = 0; 1; $s++) {
261 $m = fmul($s,$n,$d);
262 return $s if ($m >= $thres);
263 }
264 return 0;
265}
266
267# Provides mul, adj, and shr factors for a specific
268# (bit, time, hz) combination
269sub muladj($$$) {
270 my($b, $t, $hz) = @_;
271 my $s = fmuls($b, $t, $hz);
272 my $m = fmul($s, $t, $hz);
273 my $a = fadj($s, $t, $hz);
274 return ($m->as_hex(), $a->as_hex(), $s);
275}
276
277# Provides numerator, denominator values
278sub numden($$) {
279 my($n, $d) = @_;
280 my $g = bgcd($n, $d);
281 return ($n/$g, $d/$g);
282}
283
284# All values for a specific (time, hz) combo
285sub conversions($$) {
286 my ($t, $hz) = @_;
287 my @val = ();
288
289 # HZ_TO_xx
290 push(@val, muladj(32, $t, $hz));
291 push(@val, muladj(64, $t, $hz));
292 push(@val, numden($t, $hz));
293
294 # xx_TO_HZ
295 push(@val, muladj(32, $hz, $t));
296 push(@val, muladj(64, $hz, $t));
297 push(@val, numden($hz, $t));
298
299 return @val;
300}
301
302sub compute_values($) {
303 my($hz) = @_;
304 my @val = ();
305 my $s, $m, $a, $g;
306
307 if (!$has_bigint) {
308 die "$0: HZ == $hz not canned and ".
309 "Math::BigInt not available\n";
310 }
311
312 # MSEC conversions
313 push(@val, conversions(1000, $hz));
314
315 # USEC conversions
316 push(@val, conversions(1000000, $hz));
317
318 return @val;
319}
320
321sub output($@)
322{
323 my($hz, @val) = @_;
324 my $pfx, $bit, $suf, $s, $m, $a;
325
326 print "/* Automatically generated by kernel/timeconst.pl */\n";
327 print "/* Conversion constants for HZ == $hz */\n";
328 print "\n";
329 print "#ifndef KERNEL_TIMECONST_H\n";
330 print "#define KERNEL_TIMECONST_H\n";
331 print "\n";
332
333 print "#include <linux/param.h>\n";
334
335 print "\n";
336 print "#if HZ != $hz\n";
337 print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
338 print "#endif\n";
339 print "\n";
340
341 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
342 'HZ_TO_USEC','USEC_TO_HZ') {
343 foreach $bit (32, 64) {
344 foreach $suf ('MUL', 'ADJ', 'SHR') {
345 printf "#define %-23s %s\n",
346 "${pfx}_$suf$bit", shift(@val);
347 }
348 }
349 foreach $suf ('NUM', 'DEN') {
350 printf "#define %-23s %s\n",
351 "${pfx}_$suf", shift(@val);
352 }
353 }
354
355 print "\n";
356 print "#endif /* KERNEL_TIMECONST_H */\n";
357}
358
359($hz) = @ARGV;
360
361# Use this to generate the %canned_values structure
362if ($hz eq '--can') {
363 shift(@ARGV);
364 @hzlist = sort {$a <=> $b} (@ARGV);
365
366 print "# Precomputed values for systems without Math::BigInt\n";
367 print "# Generated by:\n";
368 print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
369 print "\%canned_values = (\n";
370 my $pf = "\t";
371 foreach $hz (@hzlist) {
372 my @values = compute_values($hz);
373 print "$pf$hz => [\n";
374 while (scalar(@values)) {
375 my $bit;
376 foreach $bit (32, 64) {
377 my $m = shift(@values);
378 my $a = shift(@values);
379 my $s = shift(@values);
380 print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n";
381 }
382 my $n = shift(@values);
383 my $d = shift(@values);
384 print "\t\t",$n,',',$d,",\n";
385 }
386 print "\t]";
387 $pf = ', ';
388 }
389 print "\n);\n";
390} else {
391 $hz += 0; # Force to number
392 if ($hz < 1) {
393 die "Usage: $0 HZ\n";
394 }
395
396 @val = @{$canned_values{$hz}};
397 if (!defined(@val)) {
398 @val = compute_values($hz);
399 }
400 output($hz, @val);
401}
402exit 0;
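Each block of the canned table above, and compute_values() for any other HZ, encodes a round-up division as multiply, add, shift: converted = (MUL * value + ADJ) >> SHR, where MUL = ceil(n * 2^SHR / d), ADJ = ((d' - 1) << SHR) / d' with d' = d / gcd(n, d), and SHR is the smallest shift that pushes MUL to at least 2^(bits-1) (see fmul(), fadj() and fmuls() above). The C sketch below re-derives the four 32-bit HZ == 100 entries with plain 64-bit arithmetic; it is a cross-check of the table, not part of the build.

#include <stdio.h>
#include <stdint.h>

static uint64_t gcd(uint64_t a, uint64_t b)
{
	while (b) {
		uint64_t t = a % b;
		a = b;
		b = t;
	}
	return a;
}

/* ceil(n * 2^s / d), cf. fmul() in the script above */
static uint64_t fmul(unsigned s, uint64_t n, uint64_t d)
{
	return ((n << s) + d - 1) / d;
}

/* ((d' - 1) << s) / d' with d' = d / gcd(n, d), cf. fadj() */
static uint64_t fadj(unsigned s, uint64_t n, uint64_t d)
{
	d /= gcd(n, d);
	return ((d - 1) << s) / d;
}

/* smallest shift that makes fmul() reach 2^(bits-1), cf. fmuls() */
static unsigned fmuls(unsigned bits, uint64_t n, uint64_t d)
{
	unsigned s;

	for (s = 0; fmul(s, n, d) < (1ULL << (bits - 1)); s++)
		;
	return s;
}

static void emit(const char *name, uint64_t n, uint64_t d)
{
	unsigned s = fmuls(32, n, d);

	printf("%s: mul32=0x%llx adj32=0x%llx shr32=%u\n", name,
	       (unsigned long long)fmul(s, n, d),
	       (unsigned long long)fadj(s, n, d), s);
}

int main(void)
{
	/* HZ == 100; compare with the canned 100 => [...] entry above */
	emit("HZ_TO_MSEC", 1000, 100);		/* 0xa0000000, 0x0, 28 */
	emit("MSEC_TO_HZ", 100, 1000);		/* 0xcccccccd, 0x733333333, 35 */
	emit("HZ_TO_USEC", 1000000, 100);	/* 0x9c400000, 0x0, 18 */
	emit("USEC_TO_HZ", 100, 1000000);	/* 0xd1b71759, 0x1fff2e48e8a7, 45 */
	return 0;
}

The printed values should match the HZ == 100 block of %canned_values and the constants consumed by kernel/time.c earlier in this diff.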
diff --git a/kernel/timer.c b/kernel/timer.c
index 9fbb472b8cf0..99b00a25f88b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -327,7 +327,7 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
327 * init_timer() must be done to a timer prior calling *any* of the 327 * init_timer() must be done to a timer prior calling *any* of the
328 * other timer functions. 328 * other timer functions.
329 */ 329 */
330void fastcall init_timer(struct timer_list *timer) 330void init_timer(struct timer_list *timer)
331{ 331{
332 timer->entry.next = NULL; 332 timer->entry.next = NULL;
333 timer->base = __raw_get_cpu_var(tvec_bases); 333 timer->base = __raw_get_cpu_var(tvec_bases);
@@ -339,7 +339,7 @@ void fastcall init_timer(struct timer_list *timer)
339} 339}
340EXPORT_SYMBOL(init_timer); 340EXPORT_SYMBOL(init_timer);
341 341
342void fastcall init_timer_deferrable(struct timer_list *timer) 342void init_timer_deferrable(struct timer_list *timer)
343{ 343{
344 init_timer(timer); 344 init_timer(timer);
345 timer_set_deferrable(timer); 345 timer_set_deferrable(timer);
@@ -818,12 +818,14 @@ unsigned long next_timer_interrupt(void)
818#ifndef CONFIG_VIRT_CPU_ACCOUNTING 818#ifndef CONFIG_VIRT_CPU_ACCOUNTING
819void account_process_tick(struct task_struct *p, int user_tick) 819void account_process_tick(struct task_struct *p, int user_tick)
820{ 820{
821 cputime_t one_jiffy = jiffies_to_cputime(1);
822
821 if (user_tick) { 823 if (user_tick) {
822 account_user_time(p, jiffies_to_cputime(1)); 824 account_user_time(p, one_jiffy);
823 account_user_time_scaled(p, jiffies_to_cputime(1)); 825 account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
824 } else { 826 } else {
825 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); 827 account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
826 account_system_time_scaled(p, jiffies_to_cputime(1)); 828 account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
827 } 829 }
828} 830}
829#endif 831#endif
@@ -977,7 +979,7 @@ asmlinkage long sys_getppid(void)
977 int pid; 979 int pid;
978 980
979 rcu_read_lock(); 981 rcu_read_lock();
980 pid = task_tgid_nr_ns(current->real_parent, current->nsproxy->pid_ns); 982 pid = task_tgid_vnr(current->real_parent);
981 rcu_read_unlock(); 983 rcu_read_unlock();
982 984
983 return pid; 985 return pid;
@@ -1040,7 +1042,7 @@ static void process_timeout(unsigned long __data)
1040 * 1042 *
1041 * In all cases the return value is guaranteed to be non-negative. 1043 * In all cases the return value is guaranteed to be non-negative.
1042 */ 1044 */
1043fastcall signed long __sched schedule_timeout(signed long timeout) 1045signed long __sched schedule_timeout(signed long timeout)
1044{ 1046{
1045 struct timer_list timer; 1047 struct timer_list timer;
1046 unsigned long expire; 1048 unsigned long expire;
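The schedule_timeout() hunk above only drops the fastcall annotation; the calling convention in the comment still applies: set the task state first, pass a timeout in jiffies, and the non-negative return value is the time remaining if the sleep ended early. A kernel-style fragment, illustrative only and not buildable on its own:

#include <linux/jiffies.h>
#include <linux/sched.h>

/* Sleep for roughly 100 ms, waking early if the task is signalled. */
static signed long sleep_a_bit(void)
{
	signed long remaining;

	set_current_state(TASK_INTERRUPTIBLE);
	remaining = schedule_timeout(msecs_to_jiffies(100));

	/* 0 means the full timeout elapsed; > 0 means an early wakeup. */
	return remaining;
}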
diff --git a/kernel/user.c b/kernel/user.c
index bc1c48d35cb3..7132022a040c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,6 +17,14 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20struct user_namespace init_user_ns = {
21 .kref = {
22 .refcount = ATOMIC_INIT(2),
23 },
24 .root_user = &root_user,
25};
26EXPORT_SYMBOL_GPL(init_user_ns);
27
20/* 28/*
21 * UID task count cache, to get fast user lookup in "alloc_uid" 29 * UID task count cache, to get fast user lookup in "alloc_uid"
22 * when changing user ID's (ie setuid() and friends). 30 * when changing user ID's (ie setuid() and friends).
@@ -49,7 +57,7 @@ struct user_struct root_user = {
49 .uid_keyring = &root_user_keyring, 57 .uid_keyring = &root_user_keyring,
50 .session_keyring = &root_session_keyring, 58 .session_keyring = &root_session_keyring,
51#endif 59#endif
52#ifdef CONFIG_FAIR_USER_SCHED 60#ifdef CONFIG_USER_SCHED
53 .tg = &init_task_group, 61 .tg = &init_task_group,
54#endif 62#endif
55}; 63};
@@ -82,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
82 return NULL; 90 return NULL;
83} 91}
84 92
85#ifdef CONFIG_FAIR_USER_SCHED 93#ifdef CONFIG_USER_SCHED
86 94
87static void sched_destroy_user(struct user_struct *up) 95static void sched_destroy_user(struct user_struct *up)
88{ 96{
@@ -105,15 +113,15 @@ static void sched_switch_user(struct task_struct *p)
105 sched_move_task(p); 113 sched_move_task(p);
106} 114}
107 115
108#else /* CONFIG_FAIR_USER_SCHED */ 116#else /* CONFIG_USER_SCHED */
109 117
110static void sched_destroy_user(struct user_struct *up) { } 118static void sched_destroy_user(struct user_struct *up) { }
111static int sched_create_user(struct user_struct *up) { return 0; } 119static int sched_create_user(struct user_struct *up) { return 0; }
112static void sched_switch_user(struct task_struct *p) { } 120static void sched_switch_user(struct task_struct *p) { }
113 121
114#endif /* CONFIG_FAIR_USER_SCHED */ 122#endif /* CONFIG_USER_SCHED */
115 123
116#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) 124#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
117 125
118static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 126static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
119static DEFINE_MUTEX(uids_mutex); 127static DEFINE_MUTEX(uids_mutex);
@@ -129,6 +137,7 @@ static inline void uids_mutex_unlock(void)
129} 137}
130 138
131/* uid directory attributes */ 139/* uid directory attributes */
140#ifdef CONFIG_FAIR_GROUP_SCHED
132static ssize_t cpu_shares_show(struct kobject *kobj, 141static ssize_t cpu_shares_show(struct kobject *kobj,
133 struct kobj_attribute *attr, 142 struct kobj_attribute *attr,
134 char *buf) 143 char *buf)
@@ -155,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
155 164
156static struct kobj_attribute cpu_share_attr = 165static struct kobj_attribute cpu_share_attr =
157 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); 166 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
167#endif
168
169#ifdef CONFIG_RT_GROUP_SCHED
170static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
171 struct kobj_attribute *attr,
172 char *buf)
173{
174 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
175
176 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
177}
178
179static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 struct kobj_attribute *attr,
181 const char *buf, size_t size)
182{
183 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
184 unsigned long rt_runtime;
185 int rc;
186
187 sscanf(buf, "%lu", &rt_runtime);
188
189 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
190
191 return (rc ? rc : size);
192}
193
194static struct kobj_attribute cpu_rt_runtime_attr =
195 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
196#endif
158 197
159/* default attributes per uid directory */ 198/* default attributes per uid directory */
160static struct attribute *uids_attributes[] = { 199static struct attribute *uids_attributes[] = {
200#ifdef CONFIG_FAIR_GROUP_SCHED
161 &cpu_share_attr.attr, 201 &cpu_share_attr.attr,
202#endif
203#ifdef CONFIG_RT_GROUP_SCHED
204 &cpu_rt_runtime_attr.attr,
205#endif
162 NULL 206 NULL
163}; 207};
164 208
@@ -261,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
261 schedule_work(&up->work); 305 schedule_work(&up->work);
262} 306}
263 307
264#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ 308#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
265 309
266int uids_sysfs_init(void) { return 0; } 310int uids_sysfs_init(void) { return 0; }
267static inline int uids_user_create(struct user_struct *up) { return 0; } 311static inline int uids_user_create(struct user_struct *up) { return 0; }
@@ -365,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
365 spin_lock_irq(&uidhash_lock); 409 spin_lock_irq(&uidhash_lock);
366 up = uid_hash_find(uid, hashent); 410 up = uid_hash_find(uid, hashent);
367 if (up) { 411 if (up) {
368 /* This case is not possible when CONFIG_FAIR_USER_SCHED 412 /* This case is not possible when CONFIG_USER_SCHED
369 * is defined, since we serialize alloc_uid() using 413 * is defined, since we serialize alloc_uid() using
370 * uids_mutex. Hence no need to call 414 * uids_mutex. Hence no need to call
371 * sched_destroy_user() or remove_user_sysfs_dir(). 415 * sched_destroy_user() or remove_user_sysfs_dir().
@@ -427,6 +471,7 @@ void switch_uid(struct user_struct *new_user)
427 suid_keys(current); 471 suid_keys(current);
428} 472}
429 473
474#ifdef CONFIG_USER_NS
430void release_uids(struct user_namespace *ns) 475void release_uids(struct user_namespace *ns)
431{ 476{
432 int i; 477 int i;
@@ -451,6 +496,7 @@ void release_uids(struct user_namespace *ns)
451 496
452 free_uid(ns->root_user); 497 free_uid(ns->root_user);
453} 498}
499#endif
454 500
455static int __init uid_cache_init(void) 501static int __init uid_cache_init(void)
456{ 502{
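The cpu_rt_runtime addition above follows the usual kobj_attribute pattern backing the files under /sys/kernel/uids/. A generic, hedged sketch of that pattern; the attribute name, backing variable and formats are invented for illustration:

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static unsigned long demo_value;	/* hypothetical backing state */

static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%lu\n", demo_value);
}

static ssize_t demo_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t size)
{
	sscanf(buf, "%lu", &demo_value);
	return size;
}

static struct kobj_attribute demo_attr =
	__ATTR(demo, 0644, demo_show, demo_store);

Adding &demo_attr.attr to an attribute array like uids_attributes above is what makes the file appear once the kobject is registered.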
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 7af90fc4f0fd..4c9006275df7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -10,17 +10,6 @@
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12 12
13struct user_namespace init_user_ns = {
14 .kref = {
15 .refcount = ATOMIC_INIT(2),
16 },
17 .root_user = &root_user,
18};
19
20EXPORT_SYMBOL_GPL(init_user_ns);
21
22#ifdef CONFIG_USER_NS
23
24/* 13/*
25 * Clone a new ns copying an original user ns, setting refcount to 1 14 * Clone a new ns copying an original user ns, setting refcount to 1
26 * @old_ns: namespace to clone 15 * @old_ns: namespace to clone
@@ -84,5 +73,3 @@ void free_user_ns(struct kref *kref)
84 release_uids(ns); 73 release_uids(ns);
85 kfree(ns); 74 kfree(ns);
86} 75}
87
88#endif /* CONFIG_USER_NS */
diff --git a/kernel/wait.c b/kernel/wait.c
index f9876888a569..c275c56cf2d3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -18,7 +18,7 @@ void init_waitqueue_head(wait_queue_head_t *q)
18 18
19EXPORT_SYMBOL(init_waitqueue_head); 19EXPORT_SYMBOL(init_waitqueue_head);
20 20
21void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 21void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
22{ 22{
23 unsigned long flags; 23 unsigned long flags;
24 24
@@ -29,7 +29,7 @@ void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
29} 29}
30EXPORT_SYMBOL(add_wait_queue); 30EXPORT_SYMBOL(add_wait_queue);
31 31
32void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) 32void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
33{ 33{
34 unsigned long flags; 34 unsigned long flags;
35 35
@@ -40,7 +40,7 @@ void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
40} 40}
41EXPORT_SYMBOL(add_wait_queue_exclusive); 41EXPORT_SYMBOL(add_wait_queue_exclusive);
42 42
43void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 43void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
44{ 44{
45 unsigned long flags; 45 unsigned long flags;
46 46
@@ -63,7 +63,7 @@ EXPORT_SYMBOL(remove_wait_queue);
63 * stops them from bleeding out - it would still allow subsequent 63 * stops them from bleeding out - it would still allow subsequent
64 * loads to move into the critical region). 64 * loads to move into the critical region).
65 */ 65 */
66void fastcall 66void
67prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) 67prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
68{ 68{
69 unsigned long flags; 69 unsigned long flags;
@@ -82,7 +82,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
82} 82}
83EXPORT_SYMBOL(prepare_to_wait); 83EXPORT_SYMBOL(prepare_to_wait);
84 84
85void fastcall 85void
86prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) 86prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
87{ 87{
88 unsigned long flags; 88 unsigned long flags;
@@ -101,7 +101,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
101} 101}
102EXPORT_SYMBOL(prepare_to_wait_exclusive); 102EXPORT_SYMBOL(prepare_to_wait_exclusive);
103 103
104void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait) 104void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
105{ 105{
106 unsigned long flags; 106 unsigned long flags;
107 107
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(wake_bit_function);
157 * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are 157 * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
158 * permitted return codes. Nonzero return codes halt waiting and return. 158 * permitted return codes. Nonzero return codes halt waiting and return.
159 */ 159 */
160int __sched fastcall 160int __sched
161__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, 161__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
162 int (*action)(void *), unsigned mode) 162 int (*action)(void *), unsigned mode)
163{ 163{
@@ -173,7 +173,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
173} 173}
174EXPORT_SYMBOL(__wait_on_bit); 174EXPORT_SYMBOL(__wait_on_bit);
175 175
176int __sched fastcall out_of_line_wait_on_bit(void *word, int bit, 176int __sched out_of_line_wait_on_bit(void *word, int bit,
177 int (*action)(void *), unsigned mode) 177 int (*action)(void *), unsigned mode)
178{ 178{
179 wait_queue_head_t *wq = bit_waitqueue(word, bit); 179 wait_queue_head_t *wq = bit_waitqueue(word, bit);
@@ -183,7 +183,7 @@ int __sched fastcall out_of_line_wait_on_bit(void *word, int bit,
183} 183}
184EXPORT_SYMBOL(out_of_line_wait_on_bit); 184EXPORT_SYMBOL(out_of_line_wait_on_bit);
185 185
186int __sched fastcall 186int __sched
187__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 187__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
188 int (*action)(void *), unsigned mode) 188 int (*action)(void *), unsigned mode)
189{ 189{
@@ -201,7 +201,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
201} 201}
202EXPORT_SYMBOL(__wait_on_bit_lock); 202EXPORT_SYMBOL(__wait_on_bit_lock);
203 203
204int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit, 204int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
205 int (*action)(void *), unsigned mode) 205 int (*action)(void *), unsigned mode)
206{ 206{
207 wait_queue_head_t *wq = bit_waitqueue(word, bit); 207 wait_queue_head_t *wq = bit_waitqueue(word, bit);
@@ -211,7 +211,7 @@ int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit,
211} 211}
212EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); 212EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
213 213
214void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) 214void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
215{ 215{
216 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 216 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
217 if (waitqueue_active(wq)) 217 if (waitqueue_active(wq))
@@ -236,13 +236,13 @@ EXPORT_SYMBOL(__wake_up_bit);
236 * may need to use a less regular barrier, such fs/inode.c's smp_mb(), 236 * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
237 * because spin_unlock() does not guarantee a memory barrier. 237 * because spin_unlock() does not guarantee a memory barrier.
238 */ 238 */
239void fastcall wake_up_bit(void *word, int bit) 239void wake_up_bit(void *word, int bit)
240{ 240{
241 __wake_up_bit(bit_waitqueue(word, bit), word, bit); 241 __wake_up_bit(bit_waitqueue(word, bit), word, bit);
242} 242}
243EXPORT_SYMBOL(wake_up_bit); 243EXPORT_SYMBOL(wake_up_bit);
244 244
245fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit) 245wait_queue_head_t *bit_waitqueue(void *word, int bit)
246{ 246{
247 const int shift = BITS_PER_LONG == 32 ? 5 : 6; 247 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
248 const struct zone *zone = page_zone(virt_to_page(word)); 248 const struct zone *zone = page_zone(virt_to_page(word));
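The bit-wait helpers above lose their fastcall markers but keep the contract from the comment earlier in this file: the action callback decides how to sleep, returning 0 to keep waiting and nonzero to abort. A minimal kernel-style fragment; the flags word, bit number and function names are hypothetical:

#include <linux/sched.h>
#include <linux/wait.h>

#define MY_BIT_BUSY 0	/* hypothetical bit in some flags word */

/* Action callback: sleep uninterruptibly, keep waiting (return 0). */
static int my_wait_action(void *word)
{
	schedule();
	return 0;
}

/* Blocks until another path clears the bit and calls wake_up_bit(). */
static void wait_until_clear(unsigned long *flags)
{
	out_of_line_wait_on_bit(flags, MY_BIT_BUSY, my_wait_action,
				TASK_UNINTERRUPTIBLE);
}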
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 52db48e7f6e7..ff06611655af 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -161,7 +161,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
161 * We queue the work to the CPU it was submitted, but there is no 161 * We queue the work to the CPU it was submitted, but there is no
162 * guarantee that it will be processed by that CPU. 162 * guarantee that it will be processed by that CPU.
163 */ 163 */
164int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) 164int queue_work(struct workqueue_struct *wq, struct work_struct *work)
165{ 165{
166 int ret = 0; 166 int ret = 0;
167 167
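queue_work() keeps the semantics described in the comment above (the work is queued to the submitting CPU but may run elsewhere); only the fastcall marker goes away. A hedged kernel-style sketch of the surrounding API; the workqueue name, handler and setup/teardown hooks are all made up for the example:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;	/* hypothetical private workqueue */

static void my_work_handler(struct work_struct *work)
{
	/* runs in process context and may sleep */
}

static DECLARE_WORK(my_work, my_work_handler);

static int __init my_setup(void)
{
	my_wq = create_singlethread_workqueue("my_wq");
	if (!my_wq)
		return -ENOMEM;

	queue_work(my_wq, &my_work);	/* non-zero if newly queued */
	return 0;
}

static void __exit my_teardown(void)
{
	flush_workqueue(my_wq);
	destroy_workqueue(my_wq);
}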
@@ -175,7 +175,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
175} 175}
176EXPORT_SYMBOL_GPL(queue_work); 176EXPORT_SYMBOL_GPL(queue_work);
177 177
178void delayed_work_timer_fn(unsigned long __data) 178static void delayed_work_timer_fn(unsigned long __data)
179{ 179{
180 struct delayed_work *dwork = (struct delayed_work *)__data; 180 struct delayed_work *dwork = (struct delayed_work *)__data;
181 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); 181 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
@@ -192,7 +192,7 @@ void delayed_work_timer_fn(unsigned long __data)
192 * 192 *
193 * Returns 0 if @work was already on a queue, non-zero otherwise. 193 * Returns 0 if @work was already on a queue, non-zero otherwise.
194 */ 194 */
195int fastcall queue_delayed_work(struct workqueue_struct *wq, 195int queue_delayed_work(struct workqueue_struct *wq,
196 struct delayed_work *dwork, unsigned long delay) 196 struct delayed_work *dwork, unsigned long delay)
197{ 197{
198 timer_stats_timer_set_start_info(&dwork->timer); 198 timer_stats_timer_set_start_info(&dwork->timer);
@@ -388,7 +388,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
388 * This function used to run the workqueues itself. Now we just wait for the 388 * This function used to run the workqueues itself. Now we just wait for the
389 * helper threads to do it. 389 * helper threads to do it.
390 */ 390 */
391void fastcall flush_workqueue(struct workqueue_struct *wq) 391void flush_workqueue(struct workqueue_struct *wq)
392{ 392{
393 const cpumask_t *cpu_map = wq_cpu_map(wq); 393 const cpumask_t *cpu_map = wq_cpu_map(wq);
394 int cpu; 394 int cpu;
@@ -546,7 +546,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
546 * 546 *
547 * This puts a job in the kernel-global workqueue. 547 * This puts a job in the kernel-global workqueue.
548 */ 548 */
549int fastcall schedule_work(struct work_struct *work) 549int schedule_work(struct work_struct *work)
550{ 550{
551 return queue_work(keventd_wq, work); 551 return queue_work(keventd_wq, work);
552} 552}
@@ -560,7 +560,7 @@ EXPORT_SYMBOL(schedule_work);
560 * After waiting for a given time this puts a job in the kernel-global 560 * After waiting for a given time this puts a job in the kernel-global
561 * workqueue. 561 * workqueue.
562 */ 562 */
563int fastcall schedule_delayed_work(struct delayed_work *dwork, 563int schedule_delayed_work(struct delayed_work *dwork,
564 unsigned long delay) 564 unsigned long delay)
565{ 565{
566 timer_stats_timer_set_start_info(&dwork->timer); 566 timer_stats_timer_set_start_info(&dwork->timer);