Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                 67
-rw-r--r--  kernel/audit.h                  5
-rw-r--r--  kernel/audit_tree.c             9
-rw-r--r--  kernel/audit_watch.c            4
-rw-r--r--  kernel/auditfilter.c           12
-rw-r--r--  kernel/auditsc.c               16
-rw-r--r--  kernel/cgroup.c                11
-rw-r--r--  kernel/cpuset.c                13
-rw-r--r--  kernel/debug/debug_core.c      16
-rw-r--r--  kernel/debug/kdb/kdb_main.c    48
-rw-r--r--  kernel/resource.c             151
-rw-r--r--  kernel/sched.c                  8
-rw-r--r--  kernel/sched_fair.c            25
-rw-r--r--  kernel/sched_stats.h           20
14 files changed, 253 insertions(+), 152 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..77770a034d59 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
         struct task_struct *tsk;
         int err;
 
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
         tsk = find_task_by_vpid(pid);
-        err = -ESRCH;
-        if (!tsk)
-                goto out;
-        err = 0;
-
-        spin_lock_irq(&tsk->sighand->siglock);
-        if (!tsk->signal->audit_tty)
-                err = -EPERM;
-        spin_unlock_irq(&tsk->sighand->siglock);
-        if (err)
-                goto out;
-
-        tty_audit_push_task(tsk, loginuid, sessionid);
-out:
-        read_unlock(&tasklist_lock);
+        if (!tsk) {
+                rcu_read_unlock();
+                return -ESRCH;
+        }
+        get_task_struct(tsk);
+        rcu_read_unlock();
+        err = tty_audit_push_task(tsk, loginuid, sessionid);
+        put_task_struct(tsk);
         return err;
 }
 
@@ -506,7 +499,7 @@ int audit_send_list(void *_dest)
 }
 
 struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
-                                 int multi, void *payload, int size)
+                                 int multi, const void *payload, int size)
 {
         struct sk_buff *skb;
         struct nlmsghdr *nlh;
@@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg)
  * Allocates an skb, builds the netlink message, and sends it to the pid.
  * No failure notifications.
  */
-void audit_send_reply(int pid, int seq, int type, int done, int multi,
-                      void *payload, int size)
+static void audit_send_reply(int pid, int seq, int type, int done, int multi,
+                             const void *payload, int size)
 {
         struct sk_buff *skb;
         struct task_struct *tsk;
@@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
         case AUDIT_TTY_GET: {
                 struct audit_tty_status s;
                 struct task_struct *tsk;
+                unsigned long flags;
 
-                read_lock(&tasklist_lock);
+                rcu_read_lock();
                 tsk = find_task_by_vpid(pid);
-                if (!tsk)
-                        err = -ESRCH;
-                else {
-                        spin_lock_irq(&tsk->sighand->siglock);
+                if (tsk && lock_task_sighand(tsk, &flags)) {
                         s.enabled = tsk->signal->audit_tty != 0;
-                        spin_unlock_irq(&tsk->sighand->siglock);
-                }
-                read_unlock(&tasklist_lock);
-                audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0,
-                                 &s, sizeof(s));
+                        unlock_task_sighand(tsk, &flags);
+                } else
+                        err = -ESRCH;
+                rcu_read_unlock();
+
+                if (!err)
+                        audit_send_reply(NETLINK_CB(skb).pid, seq,
+                                         AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
                 break;
         }
         case AUDIT_TTY_SET: {
                 struct audit_tty_status *s;
                 struct task_struct *tsk;
+                unsigned long flags;
 
                 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
                         return -EINVAL;
                 s = data;
                 if (s->enabled != 0 && s->enabled != 1)
                         return -EINVAL;
-                read_lock(&tasklist_lock);
+                rcu_read_lock();
                 tsk = find_task_by_vpid(pid);
-                if (!tsk)
-                        err = -ESRCH;
-                else {
-                        spin_lock_irq(&tsk->sighand->siglock);
+                if (tsk && lock_task_sighand(tsk, &flags)) {
                         tsk->signal->audit_tty = s->enabled != 0;
-                        spin_unlock_irq(&tsk->sighand->siglock);
-                }
-                read_unlock(&tasklist_lock);
+                        unlock_task_sighand(tsk, &flags);
+                } else
+                        err = -ESRCH;
+                rcu_read_unlock();
                 break;
         }
         default:
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,
                                     int *dirlen);
 extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
                                          int done, int multi,
-                                         void *payload, int size);
-extern void audit_send_reply(int pid, int seq, int type,
-                             int done, int multi,
-                             void *payload, int size);
+                                         const void *payload, int size);
 extern void audit_panic(const char *message);
 
 struct audit_netlink_list {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..37b2bea170c8 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)
 {
         struct audit_chunk *chunk = find_chunk(p);
         struct fsnotify_mark *entry = &chunk->mark;
-        struct audit_chunk *new;
+        struct audit_chunk *new = NULL;
         struct audit_tree *owner;
         int size = chunk->count - 1;
         int i, j;
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)
 
         spin_unlock(&hash_lock);
 
+        if (size)
+                new = alloc_chunk(size);
+
         spin_lock(&entry->lock);
         if (chunk->dead || !entry->i.inode) {
                 spin_unlock(&entry->lock);
+                if (new)
+                        free_chunk(new);
                 goto out;
         }
 
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)
                 goto out;
         }
 
-        new = alloc_chunk(size);
         if (!new)
                 goto Fallback;
+
         fsnotify_duplicate_mark(&new->mark, entry);
         if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
                 free_chunk(new);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..d2e3c7866460 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent {
 };
 
 /* fsnotify handle. */
-struct fsnotify_group *audit_watch_group;
+static struct fsnotify_group *audit_watch_group;
 
 /* fsnotify events we care about. */
 #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)
         }
 }
 
-void audit_remove_watch(struct audit_watch *watch)
+static void audit_remove_watch(struct audit_watch *watch)
 {
         list_del(&watch->wlist);
         audit_put_parent(watch->parent);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..add2819af71b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
                 case AUDIT_LOGINUID:
                         result = audit_comparator(cb->loginuid, f->op, f->val);
                         break;
+                case AUDIT_SUBJ_USER:
+                case AUDIT_SUBJ_ROLE:
+                case AUDIT_SUBJ_TYPE:
+                case AUDIT_SUBJ_SEN:
+                case AUDIT_SUBJ_CLR:
+                        if (f->lsm_rule)
+                                result = security_audit_rule_match(cb->sid,
+                                                                   f->type,
+                                                                   f->op,
+                                                                   f->lsm_rule,
+                                                                   NULL);
+                        break;
                 }
 
                 if (!result)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..f49a0318c2ed 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context {
                         pid_t pid;
                         struct audit_cap_data cap;
                 } capset;
+                struct {
+                        int fd;
+                        int flags;
+                } mmap;
         };
         int fds[2];
 
@@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic)
                 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
                 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
                 break; }
+        case AUDIT_MMAP: {
+                audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
+                                 context->mmap.flags);
+                break; }
         }
         audit_log_end(ab);
 }
@@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid,
         context->type = AUDIT_CAPSET;
 }
 
+void __audit_mmap_fd(int fd, int flags)
+{
+        struct audit_context *context = current->audit_context;
+        context->mmap.fd = fd;
+        context->mmap.flags = flags;
+        context->type = AUDIT_MMAP;
+}
+
 /**
  * audit_core_dumps - record information about processes that end abnormally
  * @signr: signal value
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5cf366965d0c..66a416b42c18 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1460,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
         return 0;
 }
 
-static int cgroup_get_sb(struct file_system_type *fs_type,
-                         int flags, const char *unused_dev_name,
-                         void *data, struct vfsmount *mnt)
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
+                         int flags, const char *unused_dev_name,
+                         void *data)
 {
         struct cgroup_sb_opts opts;
         struct cgroupfs_root *root;
@@ -1596,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                 drop_parsed_module_refcounts(opts.subsys_bits);
         }
 
-        simple_set_mnt(mnt, sb);
         kfree(opts.release_agent);
         kfree(opts.name);
-        return 0;
+        return dget(sb->s_root);
 
  drop_new_super:
         deactivate_locked_super(sb);
@@ -1608,7 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
  out_err:
         kfree(opts.release_agent);
         kfree(opts.name);
-        return ret;
+        return ERR_PTR(ret);
 }
 
 static void cgroup_kill_sb(struct super_block *sb) {
@@ -1658,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 static struct file_system_type cgroup_fs_type = {
         .name = "cgroup",
-        .get_sb = cgroup_get_sb,
+        .mount = cgroup_mount,
         .kill_sb = cgroup_kill_sb,
 };
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 51b143e2a07a..4349935c2ad8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
  * users. If someone tries to mount the "cpuset" filesystem, we
  * silently switch it to mount "cgroup" instead
  */
-static int cpuset_get_sb(struct file_system_type *fs_type,
-                         int flags, const char *unused_dev_name,
-                         void *data, struct vfsmount *mnt)
+static struct dentry *cpuset_mount(struct file_system_type *fs_type,
+                         int flags, const char *unused_dev_name, void *data)
 {
         struct file_system_type *cgroup_fs = get_fs_type("cgroup");
-        int ret = -ENODEV;
+        struct dentry *ret = ERR_PTR(-ENODEV);
         if (cgroup_fs) {
                 char mountopts[] =
                         "cpuset,noprefix,"
                         "release_agent=/sbin/cpuset_release_agent";
-                ret = cgroup_fs->get_sb(cgroup_fs, flags,
-                                        unused_dev_name, mountopts, mnt);
+                ret = cgroup_fs->mount(cgroup_fs, flags,
+                                       unused_dev_name, mountopts);
                 put_filesystem(cgroup_fs);
         }
         return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
 
 static struct file_system_type cpuset_fs_type = {
         .name = "cpuset",
-        .get_sb = cpuset_get_sb,
+        .mount = cpuset_mount,
 };
 
 /*
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index fec596da9bd0..cefd4a11f6d9 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -209,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
         return 0;
 }
 
-/**
- * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
- * @regs: Current &struct pt_regs.
- *
- * This function will be called if the particular architecture must
- * disable hardware debugging while it is processing gdb packets or
- * handling exception.
- */
-void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
-{
-}
-
 /*
  * Some architectures need cache flushes when we set/clear a
  * breakpoint:
@@ -484,7 +472,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
                 atomic_inc(&masters_in_kgdb);
         else
                 atomic_inc(&slaves_in_kgdb);
-        kgdb_disable_hw_debug(ks->linux_regs);
+
+        if (arch_kgdb_ops.disable_hw_break)
+                arch_kgdb_ops.disable_hw_break(regs);
 
 acquirelock:
         /*
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index d7bda21a106b..37755d621924 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
                         /* special case below */
                 } else {
                         kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
-                                   kdb_current, kdb_current->pid);
+                                   kdb_current, kdb_current ? kdb_current->pid : 0);
 #if defined(CONFIG_SMP)
                         kdb_printf("on processor %d ", raw_smp_processor_id());
 #endif
@@ -2603,20 +2603,17 @@ static int kdb_summary(int argc, const char **argv)
  */
 static int kdb_per_cpu(int argc, const char **argv)
 {
-        char buf[256], fmtstr[64];
-        kdb_symtab_t symtab;
-        cpumask_t suppress = CPU_MASK_NONE;
-        int cpu, diag;
-        unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
+        char fmtstr[64];
+        int cpu, diag, nextarg = 1;
+        unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
 
         if (argc < 1 || argc > 3)
                 return KDB_ARGCOUNT;
 
-        snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]);
-        if (!kdbgetsymval(buf, &symtab)) {
-                kdb_printf("%s is not a per_cpu variable\n", argv[1]);
-                return KDB_BADADDR;
-        }
+        diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
+        if (diag)
+                return diag;
+
         if (argc >= 2) {
                 diag = kdbgetularg(argv[2], &bytesperword);
                 if (diag)
@@ -2649,46 +2646,25 @@ static int kdb_per_cpu(int argc, const char **argv)
 #define KDB_PCU(cpu) 0
 #endif
 #endif
-
         for_each_online_cpu(cpu) {
+                if (KDB_FLAG(CMD_INTERRUPT))
+                        return 0;
+
                 if (whichcpu != ~0UL && whichcpu != cpu)
                         continue;
-                addr = symtab.sym_start + KDB_PCU(cpu);
+                addr = symaddr + KDB_PCU(cpu);
                 diag = kdb_getword(&val, addr, bytesperword);
                 if (diag) {
                         kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
                                    "read, diag=%d\n", cpu, addr, diag);
                         continue;
                 }
-#ifdef CONFIG_SMP
-                if (!val) {
-                        cpu_set(cpu, suppress);
-                        continue;
-                }
-#endif /* CONFIG_SMP */
                 kdb_printf("%5d ", cpu);
                 kdb_md_line(fmtstr, addr,
                             bytesperword == KDB_WORD_SIZE,
                             1, bytesperword, 1, 1, 0);
         }
-        if (cpus_weight(suppress) == 0)
-                return 0;
-        kdb_printf("Zero suppressed cpu(s):");
-        for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
-             cpu = next_cpu(cpu, suppress)) {
-                kdb_printf(" %d", cpu);
-                if (cpu == num_possible_cpus() - 1 ||
-                    next_cpu(cpu, suppress) != cpu + 1)
-                        continue;
-                while (cpu < num_possible_cpus() &&
-                       next_cpu(cpu, suppress) == cpu + 1)
-                        ++cpu;
-                kdb_printf("-%d", cpu);
-        }
-        kdb_printf("\n");
-
 #undef KDB_PCU
-
         return 0;
 }
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 9c9841cb6902..9fad33efd0db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource);
 
 static DEFINE_RWLOCK(resource_lock);
 
+/*
+ * By default, we allocate free space bottom-up. The architecture can request
+ * top-down by clearing this flag. The user can override the architecture's
+ * choice with the "resource_alloc_from_bottom" kernel boot option, but that
+ * should only be a debugging tool.
+ */
+int resource_alloc_from_bottom = 1;
+
+static __init int setup_alloc_from_bottom(char *s)
+{
+        printk(KERN_INFO
+               "resource: allocating from bottom-up; please report a bug\n");
+        resource_alloc_from_bottom = 1;
+        return 0;
+}
+early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
+
 static void *r_next(struct seq_file *m, void *v, loff_t *pos)
 {
         struct resource *p = v;
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn)
         return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
 }
 
+static resource_size_t simple_align_resource(void *data,
+                                             const struct resource *avail,
+                                             resource_size_t size,
+                                             resource_size_t align)
+{
+        return avail->start;
+}
+
+static void resource_clip(struct resource *res, resource_size_t min,
+                          resource_size_t max)
+{
+        if (res->start < min)
+                res->start = min;
+        if (res->end > max)
+                res->end = max;
+}
+
+static bool resource_contains(struct resource *res1, struct resource *res2)
+{
+        return res1->start <= res2->start && res1->end >= res2->end;
+}
+
+/*
+ * Find the resource before "child" in the sibling list of "root" children.
+ */
+static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
+{
+        struct resource *this;
+
+        for (this = root->child; this; this = this->sibling)
+                if (this->sibling == child)
+                        return this;
+
+        return NULL;
+}
+
 /*
  * Find empty slot in the resource tree given range and alignment.
+ * This version allocates from the end of the root resource first.
+ */
+static int find_resource_from_top(struct resource *root, struct resource *new,
+                                  resource_size_t size, resource_size_t min,
+                                  resource_size_t max, resource_size_t align,
+                                  resource_size_t (*alignf)(void *,
+                                                            const struct resource *,
+                                                            resource_size_t,
+                                                            resource_size_t),
+                                  void *alignf_data)
+{
+        struct resource *this;
+        struct resource tmp, avail, alloc;
+
+        tmp.start = root->end;
+        tmp.end = root->end;
+
+        this = find_sibling_prev(root, NULL);
+        for (;;) {
+                if (this) {
+                        if (this->end < root->end)
+                                tmp.start = this->end + 1;
+                } else
+                        tmp.start = root->start;
+
+                resource_clip(&tmp, min, max);
+
+                /* Check for overflow after ALIGN() */
+                avail = *new;
+                avail.start = ALIGN(tmp.start, align);
+                avail.end = tmp.end;
+                if (avail.start >= tmp.start) {
+                        alloc.start = alignf(alignf_data, &avail, size, align);
+                        alloc.end = alloc.start + size - 1;
+                        if (resource_contains(&avail, &alloc)) {
+                                new->start = alloc.start;
+                                new->end = alloc.end;
+                                return 0;
+                        }
+                }
+
+                if (!this || this->start == root->start)
+                        break;
+
+                tmp.end = this->start - 1;
+                this = find_sibling_prev(root, this);
+        }
+        return -EBUSY;
+}
+
+/*
+ * Find empty slot in the resource tree given range and alignment.
+ * This version allocates from the beginning of the root resource first.
  */
 static int find_resource(struct resource *root, struct resource *new,
                          resource_size_t size, resource_size_t min,
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new,
                          void *alignf_data)
 {
         struct resource *this = root->child;
-        struct resource tmp = *new;
+        struct resource tmp = *new, avail, alloc;
 
         tmp.start = root->start;
         /*
-         * Skip past an allocated resource that starts at 0, since the assignment
-         * of this->start - 1 to tmp->end below would cause an underflow.
+         * Skip past an allocated resource that starts at 0, since the
+         * assignment of this->start - 1 to tmp->end below would cause an
+         * underflow.
          */
         if (this && this->start == 0) {
                 tmp.start = this->end + 1;
                 this = this->sibling;
         }
-        for(;;) {
+        for (;;) {
                 if (this)
                         tmp.end = this->start - 1;
                 else
                         tmp.end = root->end;
-                if (tmp.start < min)
-                        tmp.start = min;
-                if (tmp.end > max)
-                        tmp.end = max;
-                tmp.start = ALIGN(tmp.start, align);
-                if (alignf)
-                        tmp.start = alignf(alignf_data, &tmp, size, align);
-                if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
-                        new->start = tmp.start;
-                        new->end = tmp.start + size - 1;
-                        return 0;
+
+                resource_clip(&tmp, min, max);
+
+                /* Check for overflow after ALIGN() */
+                avail = *new;
+                avail.start = ALIGN(tmp.start, align);
+                avail.end = tmp.end;
+                if (avail.start >= tmp.start) {
+                        alloc.start = alignf(alignf_data, &avail, size, align);
+                        alloc.end = alloc.start + size - 1;
+                        if (resource_contains(&avail, &alloc)) {
+                                new->start = alloc.start;
+                                new->end = alloc.end;
+                                return 0;
+                        }
                 }
+
                 if (!this)
                         break;
+
                 tmp.start = this->end + 1;
                 this = this->sibling;
         }
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new,
 {
         int err;
 
+        if (!alignf)
+                alignf = simple_align_resource;
+
         write_lock(&resource_lock);
-        err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
+        if (resource_alloc_from_bottom)
+                err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
+        else
+                err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
         if (err >= 0 && __request_resource(root, new))
                 err = -EBUSY;
         write_unlock(&resource_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index d42992bccdfa..aa14a56f9d03 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8510,12 +8510,12 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->put_prev_task(rq, tsk);
 
-        set_task_rq(tsk, task_cpu(tsk));
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
-        if (tsk->sched_class->moved_group)
-                tsk->sched_class->moved_group(tsk, on_rq);
+        if (tsk->sched_class->task_move_group)
+                tsk->sched_class->task_move_group(tsk, on_rq);
+        else
 #endif
+                set_task_rq(tsk, task_cpu(tsk));
 
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 933f3d1b62ea..f4f6a8326dd0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3869,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
-        struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
-        update_curr(cfs_rq);
+        /*
+         * If the task was not on the rq at the time of this cgroup movement
+         * it must have been asleep, sleeping tasks keep their ->vruntime
+         * absolute on their old rq until wakeup (needed for the fair sleeper
+         * bonus in place_entity()).
+         *
+         * If it was on the rq, we've just 'preempted' it, which does convert
+         * ->vruntime to a relative base.
+         *
+         * Make sure both cases convert their relative position when migrating
+         * to another cgroup's rq. This does somewhat interfere with the
+         * fair sleeper stuff for the first placement, but who cares.
+         */
+        if (!on_rq)
+                p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+        set_task_rq(p, task_cpu(p));
         if (!on_rq)
-                place_entity(cfs_rq, &p->se, 1);
+                p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
 }
 #endif
 
@@ -3927,7 +3940,7 @@ static const struct sched_class fair_sched_class = {
         .get_rr_interval = get_rr_interval_fair,
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-        .moved_group = moved_group_fair,
+        .task_move_group = task_move_group_fair,
 #endif
 };
 
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 25c2f962f6fc..48ddf431db0e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
 }
 
 /*
- * Called when a process is dequeued from the active array and given
- * the cpu. We should note that with the exception of interactive
- * tasks, the expired queue will become the active queue after the active
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
- * expired queue. (Interactive tasks may be requeued directly to the
- * active queue, thus delaying tasks in the expired queue from running;
- * see scheduler_tick()).
- *
- * Though we are interested in knowing how long it was from the *first* time a
+ * We are interested in knowing how long it was from the *first* time a
  * task was queued to the time that it finally hit a cpu, we call this routine
  * from dequeue_task() to account for possible rq->clock skew across cpus. The
  * delta taken on each cpu would annul the skew.
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)
 }
 
 /*
- * Called when a process is queued into either the active or expired
- * array. The time is noted and later used to determine how long we
- * had to wait for us to reach the cpu. Since the expired queue will
- * become the active queue after active queue is empty, without dequeuing
- * and requeuing any tasks, we are interested in queuing to either. It
- * is unusual but not impossible for tasks to be dequeued and immediately
- * requeued in the same or another array: this can happen in sched_yield(),
- * set_user_nice(), and even load_balance() as it moves tasks from runqueue
- * to runqueue.
- *
 * This function is only called from enqueue_task(), but also only updates
 * the timestamp if it is already not set. It's assumed that
 * sched_info_dequeued() will clear that stamp when appropriate.