Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 67
-rw-r--r--  kernel/audit.h | 5
-rw-r--r--  kernel/audit_tree.c | 9
-rw-r--r--  kernel/audit_watch.c | 4
-rw-r--r--  kernel/auditfilter.c | 12
-rw-r--r--  kernel/auditsc.c | 16
-rw-r--r--  kernel/cgroup.c | 141
-rw-r--r--  kernel/cgroup_freezer.c | 72
-rw-r--r--  kernel/cpuset.c | 13
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/debug/debug_core.c | 16
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 69
-rw-r--r--  kernel/exit.c | 22
-rw-r--r--  kernel/fork.c | 17
-rw-r--r--  kernel/futex.c | 5
-rw-r--r--  kernel/futex_compat.c | 3
-rw-r--r--  kernel/hw_breakpoint.c | 3
-rw-r--r--  kernel/irq/irqdesc.c | 15
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq_work.c | 4
-rw-r--r--  kernel/jump_label.c | 77
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kprobes.c | 33
-rw-r--r--  kernel/latencytop.c | 17
-rw-r--r--  kernel/module.c | 14
-rw-r--r--  kernel/ns_cgroup.c | 8
-rw-r--r--  kernel/perf_event.c | 229
-rw-r--r--  kernel/pm_qos_params.c | 4
-rw-r--r--  kernel/posix-cpu-timers.c | 12
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/hibernate.c | 22
-rw-r--r--  kernel/power/snapshot.c | 18
-rw-r--r--  kernel/power/suspend.c | 5
-rw-r--r--  kernel/power/swap.c | 59
-rw-r--r--  kernel/power/user.c | 2
-rw-r--r--  kernel/printk.c | 30
-rw-r--r--  kernel/ptrace.c | 36
-rw-r--r--  kernel/range.c | 2
-rw-r--r--  kernel/relay.c | 15
-rw-r--r--  kernel/resource.c | 153
-rw-r--r--  kernel/sched.c | 47
-rw-r--r--  kernel/sched_fair.c | 73
-rw-r--r--  kernel/sched_stats.h | 20
-rw-r--r--  kernel/sched_stoptask.c | 4
-rw-r--r--  kernel/signal.c | 5
-rw-r--r--  kernel/smp.c | 8
-rw-r--r--  kernel/softirq.c | 16
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sysctl.c | 23
-rw-r--r--  kernel/taskstats.c | 172
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/blktrace.c | 4
-rw-r--r--  kernel/trace/ring_buffer.c | 335
-rw-r--r--  kernel/trace/trace.c | 28
-rw-r--r--  kernel/trace/trace_kprobe.c | 1
-rw-r--r--  kernel/tsacct.c | 10
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/wait.c | 6
-rw-r--r--  kernel/watchdog.c | 2
-rw-r--r--  kernel/workqueue.c | 6
61 files changed, 1265 insertions(+), 749 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..77770a034d59 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
467 struct task_struct *tsk; 467 struct task_struct *tsk;
468 int err; 468 int err;
469 469
470 read_lock(&tasklist_lock); 470 rcu_read_lock();
471 tsk = find_task_by_vpid(pid); 471 tsk = find_task_by_vpid(pid);
472 err = -ESRCH; 472 if (!tsk) {
473 if (!tsk) 473 rcu_read_unlock();
474 goto out; 474 return -ESRCH;
475 err = 0; 475 }
476 476 get_task_struct(tsk);
477 spin_lock_irq(&tsk->sighand->siglock); 477 rcu_read_unlock();
478 if (!tsk->signal->audit_tty) 478 err = tty_audit_push_task(tsk, loginuid, sessionid);
479 err = -EPERM; 479 put_task_struct(tsk);
480 spin_unlock_irq(&tsk->sighand->siglock);
481 if (err)
482 goto out;
483
484 tty_audit_push_task(tsk, loginuid, sessionid);
485out:
486 read_unlock(&tasklist_lock);
487 return err; 480 return err;
488} 481}
489 482
@@ -506,7 +499,7 @@ int audit_send_list(void *_dest)
506} 499}
507 500
508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 501struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
509 int multi, void *payload, int size) 502 int multi, const void *payload, int size)
510{ 503{
511 struct sk_buff *skb; 504 struct sk_buff *skb;
512 struct nlmsghdr *nlh; 505 struct nlmsghdr *nlh;
@@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg)
555 * Allocates an skb, builds the netlink message, and sends it to the pid. 548 * Allocates an skb, builds the netlink message, and sends it to the pid.
556 * No failure notifications. 549 * No failure notifications.
557 */ 550 */
558void audit_send_reply(int pid, int seq, int type, int done, int multi, 551static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 void *payload, int size) 552 const void *payload, int size)
560{ 553{
561 struct sk_buff *skb; 554 struct sk_buff *skb;
562 struct task_struct *tsk; 555 struct task_struct *tsk;
@@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
880 case AUDIT_TTY_GET: { 873 case AUDIT_TTY_GET: {
881 struct audit_tty_status s; 874 struct audit_tty_status s;
882 struct task_struct *tsk; 875 struct task_struct *tsk;
876 unsigned long flags;
883 877
884 read_lock(&tasklist_lock); 878 rcu_read_lock();
885 tsk = find_task_by_vpid(pid); 879 tsk = find_task_by_vpid(pid);
886 if (!tsk) 880 if (tsk && lock_task_sighand(tsk, &flags)) {
887 err = -ESRCH;
888 else {
889 spin_lock_irq(&tsk->sighand->siglock);
890 s.enabled = tsk->signal->audit_tty != 0; 881 s.enabled = tsk->signal->audit_tty != 0;
891 spin_unlock_irq(&tsk->sighand->siglock); 882 unlock_task_sighand(tsk, &flags);
892 } 883 } else
893 read_unlock(&tasklist_lock); 884 err = -ESRCH;
894 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, 885 rcu_read_unlock();
895 &s, sizeof(s)); 886
887 if (!err)
888 audit_send_reply(NETLINK_CB(skb).pid, seq,
889 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
896 break; 890 break;
897 } 891 }
898 case AUDIT_TTY_SET: { 892 case AUDIT_TTY_SET: {
899 struct audit_tty_status *s; 893 struct audit_tty_status *s;
900 struct task_struct *tsk; 894 struct task_struct *tsk;
895 unsigned long flags;
901 896
902 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 897 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
903 return -EINVAL; 898 return -EINVAL;
904 s = data; 899 s = data;
905 if (s->enabled != 0 && s->enabled != 1) 900 if (s->enabled != 0 && s->enabled != 1)
906 return -EINVAL; 901 return -EINVAL;
907 read_lock(&tasklist_lock); 902 rcu_read_lock();
908 tsk = find_task_by_vpid(pid); 903 tsk = find_task_by_vpid(pid);
909 if (!tsk) 904 if (tsk && lock_task_sighand(tsk, &flags)) {
910 err = -ESRCH;
911 else {
912 spin_lock_irq(&tsk->sighand->siglock);
913 tsk->signal->audit_tty = s->enabled != 0; 905 tsk->signal->audit_tty = s->enabled != 0;
914 spin_unlock_irq(&tsk->sighand->siglock); 906 unlock_task_sighand(tsk, &flags);
915 } 907 } else
916 read_unlock(&tasklist_lock); 908 err = -ESRCH;
909 rcu_read_unlock();
917 break; 910 break;
918 } 911 }
919 default: 912 default:
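
The kernel/audit.c hunks above drop the tasklist_lock read lock in favour of an RCU lookup, and move the ->signal->audit_tty access under lock_task_sighand(). What follows is a minimal sketch of that combined pattern, not code from the patch: the helper name example_audit_tty_enabled() and its out-parameter are made up for illustration.

/*
 * Illustrative sketch only: RCU-protected pid-to-task lookup plus a
 * lock_task_sighand() access, as in the AUDIT_TTY_GET hunk above.
 */
#include <linux/sched.h>
#include <linux/rcupdate.h>

static int example_audit_tty_enabled(pid_t pid, int *enabled)
{
	struct task_struct *tsk;
	unsigned long flags;
	int err = -ESRCH;

	rcu_read_lock();
	tsk = find_task_by_vpid(pid);	/* only valid while under rcu_read_lock() */
	if (tsk && lock_task_sighand(tsk, &flags)) {
		*enabled = tsk->signal->audit_tty != 0;
		unlock_task_sighand(tsk, &flags);
		err = 0;
	}
	rcu_read_unlock();
	return err;
}

When the task has to be used after rcu_read_unlock(), the audit_prepare_user_tty() hunk instead pins it with get_task_struct() before unlocking and drops the reference with put_task_struct() once done.
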
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,
84 int *dirlen); 84 int *dirlen);
85extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 85extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
86 int done, int multi, 86 int done, int multi,
87 void *payload, int size); 87 const void *payload, int size);
88extern void audit_send_reply(int pid, int seq, int type,
89 int done, int multi,
90 void *payload, int size);
91extern void audit_panic(const char *message); 88extern void audit_panic(const char *message);
92 89
93struct audit_netlink_list { 90struct audit_netlink_list {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..37b2bea170c8 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)
223{ 223{
224 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark; 225 struct fsnotify_mark *entry = &chunk->mark;
226 struct audit_chunk *new; 226 struct audit_chunk *new = NULL;
227 struct audit_tree *owner; 227 struct audit_tree *owner;
228 int size = chunk->count - 1; 228 int size = chunk->count - 1;
229 int i, j; 229 int i, j;
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)
232 232
233 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
234 234
235 if (size)
236 new = alloc_chunk(size);
237
235 spin_lock(&entry->lock); 238 spin_lock(&entry->lock);
236 if (chunk->dead || !entry->i.inode) { 239 if (chunk->dead || !entry->i.inode) {
237 spin_unlock(&entry->lock); 240 spin_unlock(&entry->lock);
241 if (new)
242 free_chunk(new);
238 goto out; 243 goto out;
239 } 244 }
240 245
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)
255 goto out; 260 goto out;
256 } 261 }
257 262
258 new = alloc_chunk(size);
259 if (!new) 263 if (!new)
260 goto Fallback; 264 goto Fallback;
265
261 fsnotify_duplicate_mark(&new->mark, entry); 266 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 267 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
263 free_chunk(new); 268 free_chunk(new);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..d2e3c7866460 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent {
60}; 60};
61 61
62/* fsnotify handle. */ 62/* fsnotify handle. */
63struct fsnotify_group *audit_watch_group; 63static struct fsnotify_group *audit_watch_group;
64 64
65/* fsnotify events we care about. */ 65/* fsnotify events we care about. */
66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)
123 } 123 }
124} 124}
125 125
126void audit_remove_watch(struct audit_watch *watch) 126static void audit_remove_watch(struct audit_watch *watch)
127{ 127{
128 list_del(&watch->wlist); 128 list_del(&watch->wlist);
129 audit_put_parent(watch->parent); 129 audit_put_parent(watch->parent);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..add2819af71b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1252 case AUDIT_LOGINUID: 1252 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1253 result = audit_comparator(cb->loginuid, f->op, f->val);
1254 break; 1254 break;
1255 case AUDIT_SUBJ_USER:
1256 case AUDIT_SUBJ_ROLE:
1257 case AUDIT_SUBJ_TYPE:
1258 case AUDIT_SUBJ_SEN:
1259 case AUDIT_SUBJ_CLR:
1260 if (f->lsm_rule)
1261 result = security_audit_rule_match(cb->sid,
1262 f->type,
1263 f->op,
1264 f->lsm_rule,
1265 NULL);
1266 break;
1255 } 1267 }
1256 1268
1257 if (!result) 1269 if (!result)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..f49a0318c2ed 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context {
241 pid_t pid; 241 pid_t pid;
242 struct audit_cap_data cap; 242 struct audit_cap_data cap;
243 } capset; 243 } capset;
244 struct {
245 int fd;
246 int flags;
247 } mmap;
244 }; 248 };
245 int fds[2]; 249 int fds[2];
246 250
@@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic)
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); 1309 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); 1310 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; } 1311 break; }
1312 case AUDIT_MMAP: {
1313 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
1314 context->mmap.flags);
1315 break; }
1308 } 1316 }
1309 audit_log_end(ab); 1317 audit_log_end(ab);
1310} 1318}
@@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid,
2476 context->type = AUDIT_CAPSET; 2484 context->type = AUDIT_CAPSET;
2477} 2485}
2478 2486
2487void __audit_mmap_fd(int fd, int flags)
2488{
2489 struct audit_context *context = current->audit_context;
2490 context->mmap.fd = fd;
2491 context->mmap.flags = flags;
2492 context->type = AUDIT_MMAP;
2493}
2494
2479/** 2495/**
2480 * audit_core_dumps - record information about processes that end abnormally 2496 * audit_core_dumps - record information about processes that end abnormally
2481 * @signr: signal value 2497 * @signr: signal value
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7b69b8d0313d..66a416b42c18 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -243,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
243 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 243 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
244} 244}
245 245
246static int clone_children(const struct cgroup *cgrp)
247{
248 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
249}
250
246/* 251/*
247 * for_each_subsys() allows you to iterate on each subsystem attached to 252 * for_each_subsys() allows you to iterate on each subsystem attached to
248 * an active hierarchy 253 * an active hierarchy
@@ -777,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
777 struct inode *inode = new_inode(sb); 782 struct inode *inode = new_inode(sb);
778 783
779 if (inode) { 784 if (inode) {
785 inode->i_ino = get_next_ino();
780 inode->i_mode = mode; 786 inode->i_mode = mode;
781 inode->i_uid = current_fsuid(); 787 inode->i_uid = current_fsuid();
782 inode->i_gid = current_fsgid(); 788 inode->i_gid = current_fsgid();
@@ -1039,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1039 seq_puts(seq, ",noprefix"); 1045 seq_puts(seq, ",noprefix");
1040 if (strlen(root->release_agent_path)) 1046 if (strlen(root->release_agent_path))
1041 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1047 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1048 if (clone_children(&root->top_cgroup))
1049 seq_puts(seq, ",clone_children");
1042 if (strlen(root->name)) 1050 if (strlen(root->name))
1043 seq_printf(seq, ",name=%s", root->name); 1051 seq_printf(seq, ",name=%s", root->name);
1044 mutex_unlock(&cgroup_mutex); 1052 mutex_unlock(&cgroup_mutex);
@@ -1049,6 +1057,7 @@ struct cgroup_sb_opts {
1049 unsigned long subsys_bits; 1057 unsigned long subsys_bits;
1050 unsigned long flags; 1058 unsigned long flags;
1051 char *release_agent; 1059 char *release_agent;
1060 bool clone_children;
1052 char *name; 1061 char *name;
1053 /* User explicitly requested empty subsystem */ 1062 /* User explicitly requested empty subsystem */
1054 bool none; 1063 bool none;
@@ -1065,7 +1074,8 @@ struct cgroup_sb_opts {
1065 */ 1074 */
1066static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1075static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1067{ 1076{
1068 char *token, *o = data ?: "all"; 1077 char *token, *o = data;
1078 bool all_ss = false, one_ss = false;
1069 unsigned long mask = (unsigned long)-1; 1079 unsigned long mask = (unsigned long)-1;
1070 int i; 1080 int i;
1071 bool module_pin_failed = false; 1081 bool module_pin_failed = false;
@@ -1081,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1081 while ((token = strsep(&o, ",")) != NULL) { 1091 while ((token = strsep(&o, ",")) != NULL) {
1082 if (!*token) 1092 if (!*token)
1083 return -EINVAL; 1093 return -EINVAL;
1084 if (!strcmp(token, "all")) { 1094 if (!strcmp(token, "none")) {
1085 /* Add all non-disabled subsystems */
1086 opts->subsys_bits = 0;
1087 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1088 struct cgroup_subsys *ss = subsys[i];
1089 if (ss == NULL)
1090 continue;
1091 if (!ss->disabled)
1092 opts->subsys_bits |= 1ul << i;
1093 }
1094 } else if (!strcmp(token, "none")) {
1095 /* Explicitly have no subsystems */ 1095 /* Explicitly have no subsystems */
1096 opts->none = true; 1096 opts->none = true;
1097 } else if (!strcmp(token, "noprefix")) { 1097 continue;
1098 }
1099 if (!strcmp(token, "all")) {
1100 /* Mutually exclusive option 'all' + subsystem name */
1101 if (one_ss)
1102 return -EINVAL;
1103 all_ss = true;
1104 continue;
1105 }
1106 if (!strcmp(token, "noprefix")) {
1098 set_bit(ROOT_NOPREFIX, &opts->flags); 1107 set_bit(ROOT_NOPREFIX, &opts->flags);
1099 } else if (!strncmp(token, "release_agent=", 14)) { 1108 continue;
1109 }
1110 if (!strcmp(token, "clone_children")) {
1111 opts->clone_children = true;
1112 continue;
1113 }
1114 if (!strncmp(token, "release_agent=", 14)) {
1100 /* Specifying two release agents is forbidden */ 1115 /* Specifying two release agents is forbidden */
1101 if (opts->release_agent) 1116 if (opts->release_agent)
1102 return -EINVAL; 1117 return -EINVAL;
@@ -1104,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1104 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1119 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1105 if (!opts->release_agent) 1120 if (!opts->release_agent)
1106 return -ENOMEM; 1121 return -ENOMEM;
1107 } else if (!strncmp(token, "name=", 5)) { 1122 continue;
1123 }
1124 if (!strncmp(token, "name=", 5)) {
1108 const char *name = token + 5; 1125 const char *name = token + 5;
1109 /* Can't specify an empty name */ 1126 /* Can't specify an empty name */
1110 if (!strlen(name)) 1127 if (!strlen(name))
@@ -1126,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1126 GFP_KERNEL); 1143 GFP_KERNEL);
1127 if (!opts->name) 1144 if (!opts->name)
1128 return -ENOMEM; 1145 return -ENOMEM;
1129 } else { 1146
1130 struct cgroup_subsys *ss; 1147 continue;
1131 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1148 }
1132 ss = subsys[i]; 1149
1133 if (ss == NULL) 1150 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1134 continue; 1151 struct cgroup_subsys *ss = subsys[i];
1135 if (!strcmp(token, ss->name)) { 1152 if (ss == NULL)
1136 if (!ss->disabled) 1153 continue;
1137 set_bit(i, &opts->subsys_bits); 1154 if (strcmp(token, ss->name))
1138 break; 1155 continue;
1139 } 1156 if (ss->disabled)
1140 } 1157 continue;
1141 if (i == CGROUP_SUBSYS_COUNT) 1158
1142 return -ENOENT; 1159 /* Mutually exclusive option 'all' + subsystem name */
1160 if (all_ss)
1161 return -EINVAL;
1162 set_bit(i, &opts->subsys_bits);
1163 one_ss = true;
1164
1165 break;
1166 }
1167 if (i == CGROUP_SUBSYS_COUNT)
1168 return -ENOENT;
1169 }
1170
1171 /*
1172 * If the 'all' option was specified select all the subsystems,
1173 * otherwise 'all, 'none' and a subsystem name options were not
1174 * specified, let's default to 'all'
1175 */
1176 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1177 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1178 struct cgroup_subsys *ss = subsys[i];
1179 if (ss == NULL)
1180 continue;
1181 if (ss->disabled)
1182 continue;
1183 set_bit(i, &opts->subsys_bits);
1143 } 1184 }
1144 } 1185 }
1145 1186
@@ -1354,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1354 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1355 if (opts->name) 1396 if (opts->name)
1356 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1398 if (opts->clone_children)
1399 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1357 return root; 1400 return root;
1358} 1401}
1359 1402
@@ -1417,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
1417 return 0; 1460 return 0;
1418} 1461}
1419 1462
1420static int cgroup_get_sb(struct file_system_type *fs_type, 1463static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1421 int flags, const char *unused_dev_name, 1464 int flags, const char *unused_dev_name,
1422 void *data, struct vfsmount *mnt) 1465 void *data)
1423{ 1466{
1424 struct cgroup_sb_opts opts; 1467 struct cgroup_sb_opts opts;
1425 struct cgroupfs_root *root; 1468 struct cgroupfs_root *root;
@@ -1553,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1553 drop_parsed_module_refcounts(opts.subsys_bits); 1596 drop_parsed_module_refcounts(opts.subsys_bits);
1554 } 1597 }
1555 1598
1556 simple_set_mnt(mnt, sb);
1557 kfree(opts.release_agent); 1599 kfree(opts.release_agent);
1558 kfree(opts.name); 1600 kfree(opts.name);
1559 return 0; 1601 return dget(sb->s_root);
1560 1602
1561 drop_new_super: 1603 drop_new_super:
1562 deactivate_locked_super(sb); 1604 deactivate_locked_super(sb);
@@ -1565,7 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1565 out_err: 1607 out_err:
1566 kfree(opts.release_agent); 1608 kfree(opts.release_agent);
1567 kfree(opts.name); 1609 kfree(opts.name);
1568 return ret; 1610 return ERR_PTR(ret);
1569} 1611}
1570 1612
1571static void cgroup_kill_sb(struct super_block *sb) { 1613static void cgroup_kill_sb(struct super_block *sb) {
@@ -1615,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1615 1657
1616static struct file_system_type cgroup_fs_type = { 1658static struct file_system_type cgroup_fs_type = {
1617 .name = "cgroup", 1659 .name = "cgroup",
1618 .get_sb = cgroup_get_sb, 1660 .mount = cgroup_mount,
1619 .kill_sb = cgroup_kill_sb, 1661 .kill_sb = cgroup_kill_sb,
1620}; 1662};
1621 1663
@@ -1879,6 +1921,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1879 const char *buffer) 1921 const char *buffer)
1880{ 1922{
1881 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 1923 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1924 if (strlen(buffer) >= PATH_MAX)
1925 return -EINVAL;
1882 if (!cgroup_lock_live_group(cgrp)) 1926 if (!cgroup_lock_live_group(cgrp))
1883 return -ENODEV; 1927 return -ENODEV;
1884 strcpy(cgrp->root->release_agent_path, buffer); 1928 strcpy(cgrp->root->release_agent_path, buffer);
@@ -3172,6 +3216,23 @@ fail:
3172 return ret; 3216 return ret;
3173} 3217}
3174 3218
3219static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3220 struct cftype *cft)
3221{
3222 return clone_children(cgrp);
3223}
3224
3225static int cgroup_clone_children_write(struct cgroup *cgrp,
3226 struct cftype *cft,
3227 u64 val)
3228{
3229 if (val)
3230 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3231 else
3232 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3233 return 0;
3234}
3235
3175/* 3236/*
3176 * for the common functions, 'private' gives the type of file 3237 * for the common functions, 'private' gives the type of file
3177 */ 3238 */
@@ -3202,6 +3263,11 @@ static struct cftype files[] = {
3202 .write_string = cgroup_write_event_control, 3263 .write_string = cgroup_write_event_control,
3203 .mode = S_IWUGO, 3264 .mode = S_IWUGO,
3204 }, 3265 },
3266 {
3267 .name = "cgroup.clone_children",
3268 .read_u64 = cgroup_clone_children_read,
3269 .write_u64 = cgroup_clone_children_write,
3270 },
3205}; 3271};
3206 3272
3207static struct cftype cft_release_agent = { 3273static struct cftype cft_release_agent = {
@@ -3331,6 +3397,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3331 if (notify_on_release(parent)) 3397 if (notify_on_release(parent))
3332 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3398 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3333 3399
3400 if (clone_children(parent))
3401 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3402
3334 for_each_subsys(root, ss) { 3403 for_each_subsys(root, ss) {
3335 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3404 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3336 3405
@@ -3345,6 +3414,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3345 goto err_destroy; 3414 goto err_destroy;
3346 } 3415 }
3347 /* At error, ->destroy() callback has to free assigned ID. */ 3416 /* At error, ->destroy() callback has to free assigned ID. */
3417 if (clone_children(parent) && ss->post_clone)
3418 ss->post_clone(ss, cgrp);
3348 } 3419 }
3349 3420
3350 cgroup_lock_hierarchy(root); 3421 cgroup_lock_hierarchy(root);
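
The cgroup.c (and, below, cpuset.c) hunks convert the filesystem from the old .get_sb() callback to the 2.6.37-era .mount() convention, which returns the root dentry or an ERR_PTR() instead of filling in a struct vfsmount. The following is a bare-bones sketch of that shape for a hypothetical filesystem, assuming the generic mount_nodev()/simple_fill_super()/kill_anon_super() helpers fit; none of the names come from the patch.

/*
 * Sketch only: minimal file_system_type using the new ->mount() hook.
 */
#include <linux/fs.h>
#include <linux/err.h>

#define EXAMPLEFS_MAGIC	0x2f5fc0de	/* made-up magic number */

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	static struct tree_descr files[] = { {""} };	/* no files, just a root */

	return simple_fill_super(sb, EXAMPLEFS_MAGIC, files);
}

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name, void *data)
{
	/* returns the root dentry on success, ERR_PTR() on failure */
	return mount_nodev(fs_type, flags, data, example_fill_super);
}

static struct file_system_type example_fs_type = {
	.name		= "examplefs",
	.mount		= example_mount,
	.kill_sb	= kill_anon_super,
};

cgroup_mount() in the hunk above does the same thing by hand: on success it returns dget(sb->s_root), on error ERR_PTR(ret), and cpuset_mount() simply forwards to cgroup_fs->mount().
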
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e7bebb7c6c38 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51int cgroup_freezing_or_frozen(struct task_struct *task) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
52{ 52{
53 struct freezer *freezer; 53 enum freezer_state state = task_freezer(task)->state;
54 enum freezer_state state; 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
55}
55 56
57int cgroup_freezing_or_frozen(struct task_struct *task)
58{
59 int result;
56 task_lock(task); 60 task_lock(task);
57 freezer = task_freezer(task); 61 result = __cgroup_freezing_or_frozen(task);
58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
62 task_unlock(task); 62 task_unlock(task);
63 63 return result;
64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
65} 64}
66 65
67/* 66/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
154 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
155} 154}
156 155
157/* Task is frozen or will freeze immediately when next it gets woken */
158static bool is_task_frozen_enough(struct task_struct *task)
159{
160 return frozen(task) ||
161 (task_is_stopped_or_traced(task) && freezing(task));
162}
163
164/* 156/*
165 * The call to cgroup_lock() in the freezer.state write method prevents 157 * The call to cgroup_lock() in the freezer.state write method prevents
166 * a write to that file racing against an attach, and hence the 158 * a write to that file racing against an attach, and hence the
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
174 166
175 /* 167 /*
176 * Anything frozen can't move or be moved to/from. 168 * Anything frozen can't move or be moved to/from.
177 *
178 * Since orig_freezer->state == FROZEN means that @task has been
179 * frozen, so it's sufficient to check the latter condition.
180 */ 169 */
181 170
182 if (is_task_frozen_enough(task)) 171 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED)
183 return -EBUSY; 173 return -EBUSY;
184 174
185 freezer = cgroup_freezer(new_cgroup); 175 rcu_read_lock();
186 if (freezer->state == CGROUP_FROZEN) 176 if (__cgroup_freezing_or_frozen(task)) {
177 rcu_read_unlock();
187 return -EBUSY; 178 return -EBUSY;
179 }
180 rcu_read_unlock();
188 181
189 if (threadgroup) { 182 if (threadgroup) {
190 struct task_struct *c; 183 struct task_struct *c;
191 184
192 rcu_read_lock(); 185 rcu_read_lock();
193 list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
194 if (is_task_frozen_enough(c)) { 187 if (__cgroup_freezing_or_frozen(c)) {
195 rcu_read_unlock(); 188 rcu_read_unlock();
196 return -EBUSY; 189 return -EBUSY;
197 } 190 }
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
236/* 229/*
237 * caller must hold freezer->lock 230 * caller must hold freezer->lock
238 */ 231 */
239static void update_freezer_state(struct cgroup *cgroup, 232static void update_if_frozen(struct cgroup *cgroup,
240 struct freezer *freezer) 233 struct freezer *freezer)
241{ 234{
242 struct cgroup_iter it; 235 struct cgroup_iter it;
243 struct task_struct *task; 236 struct task_struct *task;
244 unsigned int nfrozen = 0, ntotal = 0; 237 unsigned int nfrozen = 0, ntotal = 0;
238 enum freezer_state old_state = freezer->state;
245 239
246 cgroup_iter_start(cgroup, &it); 240 cgroup_iter_start(cgroup, &it);
247 while ((task = cgroup_iter_next(cgroup, &it))) { 241 while ((task = cgroup_iter_next(cgroup, &it))) {
248 ntotal++; 242 ntotal++;
249 if (is_task_frozen_enough(task)) 243 if (frozen(task))
250 nfrozen++; 244 nfrozen++;
251 } 245 }
252 246
253 /* 247 if (old_state == CGROUP_THAWED) {
254 * Transition to FROZEN when no new tasks can be added ensures 248 BUG_ON(nfrozen > 0);
255 * that we never exist in the FROZEN state while there are unfrozen 249 } else if (old_state == CGROUP_FREEZING) {
256 * tasks. 250 if (nfrozen == ntotal)
257 */ 251 freezer->state = CGROUP_FROZEN;
258 if (nfrozen == ntotal) 252 } else { /* old_state == CGROUP_FROZEN */
259 freezer->state = CGROUP_FROZEN; 253 BUG_ON(nfrozen != ntotal);
260 else if (nfrozen > 0) 254 }
261 freezer->state = CGROUP_FREEZING; 255
262 else
263 freezer->state = CGROUP_THAWED;
264 cgroup_iter_end(cgroup, &it); 256 cgroup_iter_end(cgroup, &it);
265} 257}
266 258
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
279 if (state == CGROUP_FREEZING) { 271 if (state == CGROUP_FREEZING) {
280 /* We change from FREEZING to FROZEN lazily if the cgroup was 272 /* We change from FREEZING to FROZEN lazily if the cgroup was
281 * only partially frozen when we exitted write. */ 273 * only partially frozen when we exitted write. */
282 update_freezer_state(cgroup, freezer); 274 update_if_frozen(cgroup, freezer);
283 state = freezer->state; 275 state = freezer->state;
284 } 276 }
285 spin_unlock_irq(&freezer->lock); 277 spin_unlock_irq(&freezer->lock);
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
301 while ((task = cgroup_iter_next(cgroup, &it))) { 293 while ((task = cgroup_iter_next(cgroup, &it))) {
302 if (!freeze_task(task, true)) 294 if (!freeze_task(task, true))
303 continue; 295 continue;
304 if (is_task_frozen_enough(task)) 296 if (frozen(task))
305 continue; 297 continue;
306 if (!freezing(task) && !freezer_should_skip(task)) 298 if (!freezing(task) && !freezer_should_skip(task))
307 num_cant_freeze_now++; 299 num_cant_freeze_now++;
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,
335 327
336 spin_lock_irq(&freezer->lock); 328 spin_lock_irq(&freezer->lock);
337 329
338 update_freezer_state(cgroup, freezer); 330 update_if_frozen(cgroup, freezer);
339 if (goal_state == freezer->state) 331 if (goal_state == freezer->state)
340 goto out; 332 goto out;
341 333
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 51b143e2a07a..4349935c2ad8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
231 * users. If someone tries to mount the "cpuset" filesystem, we 231 * users. If someone tries to mount the "cpuset" filesystem, we
232 * silently switch it to mount "cgroup" instead 232 * silently switch it to mount "cgroup" instead
233 */ 233 */
234static int cpuset_get_sb(struct file_system_type *fs_type, 234static struct dentry *cpuset_mount(struct file_system_type *fs_type,
235 int flags, const char *unused_dev_name, 235 int flags, const char *unused_dev_name, void *data)
236 void *data, struct vfsmount *mnt)
237{ 236{
238 struct file_system_type *cgroup_fs = get_fs_type("cgroup"); 237 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
239 int ret = -ENODEV; 238 struct dentry *ret = ERR_PTR(-ENODEV);
240 if (cgroup_fs) { 239 if (cgroup_fs) {
241 char mountopts[] = 240 char mountopts[] =
242 "cpuset,noprefix," 241 "cpuset,noprefix,"
243 "release_agent=/sbin/cpuset_release_agent"; 242 "release_agent=/sbin/cpuset_release_agent";
244 ret = cgroup_fs->get_sb(cgroup_fs, flags, 243 ret = cgroup_fs->mount(cgroup_fs, flags,
245 unused_dev_name, mountopts, mnt); 244 unused_dev_name, mountopts);
246 put_filesystem(cgroup_fs); 245 put_filesystem(cgroup_fs);
247 } 246 }
248 return ret; 247 return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
250 249
251static struct file_system_type cpuset_fs_type = { 250static struct file_system_type cpuset_fs_type = {
252 .name = "cpuset", 251 .name = "cpuset",
253 .get_sb = cpuset_get_sb, 252 .mount = cpuset_mount,
254}; 253};
255 254
256/* 255/*
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
325 325
326/* 326/*
327 * Prepare credentials for current to perform an execve() 327 * Prepare credentials for current to perform an execve()
328 * - The caller must hold current->cred_guard_mutex 328 * - The caller must hold ->cred_guard_mutex
329 */ 329 */
330struct cred *prepare_exec_creds(void) 330struct cred *prepare_exec_creds(void)
331{ 331{
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
384 struct cred *new; 384 struct cred *new;
385 int ret; 385 int ret;
386 386
387 mutex_init(&p->cred_guard_mutex);
388
389 if ( 387 if (
390#ifdef CONFIG_KEYS 388#ifdef CONFIG_KEYS
391 !p->cred->thread_keyring && 389 !p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index fec596da9bd0..cefd4a11f6d9 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -209,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
209 return 0; 209 return 0;
210} 210}
211 211
212/**
213 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
214 * @regs: Current &struct pt_regs.
215 *
216 * This function will be called if the particular architecture must
217 * disable hardware debugging while it is processing gdb packets or
218 * handling exception.
219 */
220void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
221{
222}
223
224/* 212/*
225 * Some architectures need cache flushes when we set/clear a 213 * Some architectures need cache flushes when we set/clear a
226 * breakpoint: 214 * breakpoint:
@@ -484,7 +472,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
484 atomic_inc(&masters_in_kgdb); 472 atomic_inc(&masters_in_kgdb);
485 else 473 else
486 atomic_inc(&slaves_in_kgdb); 474 atomic_inc(&slaves_in_kgdb);
487 kgdb_disable_hw_debug(ks->linux_regs); 475
476 if (arch_kgdb_ops.disable_hw_break)
477 arch_kgdb_ops.disable_hw_break(regs);
488 478
489acquirelock: 479acquirelock:
490 /* 480 /*
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index d7bda21a106b..a6e729766821 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -82,7 +82,7 @@ static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) 85 num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
86 86
87typedef struct _kdbmsg { 87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */ 88 int km_diag; /* kdb diagnostic */
@@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
646 } 646 }
647 if (!s->usable) 647 if (!s->usable)
648 return KDB_NOTIMP; 648 return KDB_NOTIMP;
649 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); 649 s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
650 if (!s->command) { 650 if (!s->command) {
651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n", 651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
652 cmdstr); 652 cmdstr);
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1127 /* special case below */ 1127 /* special case below */
1128 } else { 1128 } else {
1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", 1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1130 kdb_current, kdb_current->pid); 1130 kdb_current, kdb_current ? kdb_current->pid : 0);
1131#if defined(CONFIG_SMP) 1131#if defined(CONFIG_SMP)
1132 kdb_printf("on processor %d ", raw_smp_processor_id()); 1132 kdb_printf("on processor %d ", raw_smp_processor_id());
1133#endif 1133#endif
@@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv)
2361 */ 2361 */
2362static int kdb_ll(int argc, const char **argv) 2362static int kdb_ll(int argc, const char **argv)
2363{ 2363{
2364 int diag; 2364 int diag = 0;
2365 unsigned long addr; 2365 unsigned long addr;
2366 long offset = 0; 2366 long offset = 0;
2367 unsigned long va; 2367 unsigned long va;
@@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv)
2400 char buf[80]; 2400 char buf[80];
2401 2401
2402 if (KDB_FLAG(CMD_INTERRUPT)) 2402 if (KDB_FLAG(CMD_INTERRUPT))
2403 return 0; 2403 goto out;
2404 2404
2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); 2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2406 diag = kdb_parse(buf); 2406 diag = kdb_parse(buf);
2407 if (diag) 2407 if (diag)
2408 return diag; 2408 goto out;
2409 2409
2410 addr = va + linkoffset; 2410 addr = va + linkoffset;
2411 if (kdb_getword(&va, addr, sizeof(va))) 2411 if (kdb_getword(&va, addr, sizeof(va)))
2412 return 0; 2412 goto out;
2413 } 2413 }
2414 kfree(command);
2415 2414
2416 return 0; 2415out:
2416 kfree(command);
2417 return diag;
2417} 2418}
2418 2419
2419static int kdb_kgdb(int argc, const char **argv) 2420static int kdb_kgdb(int argc, const char **argv)
@@ -2603,20 +2604,17 @@ static int kdb_summary(int argc, const char **argv)
2603 */ 2604 */
2604static int kdb_per_cpu(int argc, const char **argv) 2605static int kdb_per_cpu(int argc, const char **argv)
2605{ 2606{
2606 char buf[256], fmtstr[64]; 2607 char fmtstr[64];
2607 kdb_symtab_t symtab; 2608 int cpu, diag, nextarg = 1;
2608 cpumask_t suppress = CPU_MASK_NONE; 2609 unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
2609 int cpu, diag;
2610 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2611 2610
2612 if (argc < 1 || argc > 3) 2611 if (argc < 1 || argc > 3)
2613 return KDB_ARGCOUNT; 2612 return KDB_ARGCOUNT;
2614 2613
2615 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); 2614 diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
2616 if (!kdbgetsymval(buf, &symtab)) { 2615 if (diag)
2617 kdb_printf("%s is not a per_cpu variable\n", argv[1]); 2616 return diag;
2618 return KDB_BADADDR; 2617
2619 }
2620 if (argc >= 2) { 2618 if (argc >= 2) {
2621 diag = kdbgetularg(argv[2], &bytesperword); 2619 diag = kdbgetularg(argv[2], &bytesperword);
2622 if (diag) 2620 if (diag)
@@ -2649,46 +2647,25 @@ static int kdb_per_cpu(int argc, const char **argv)
2649#define KDB_PCU(cpu) 0 2647#define KDB_PCU(cpu) 0
2650#endif 2648#endif
2651#endif 2649#endif
2652
2653 for_each_online_cpu(cpu) { 2650 for_each_online_cpu(cpu) {
2651 if (KDB_FLAG(CMD_INTERRUPT))
2652 return 0;
2653
2654 if (whichcpu != ~0UL && whichcpu != cpu) 2654 if (whichcpu != ~0UL && whichcpu != cpu)
2655 continue; 2655 continue;
2656 addr = symtab.sym_start + KDB_PCU(cpu); 2656 addr = symaddr + KDB_PCU(cpu);
2657 diag = kdb_getword(&val, addr, bytesperword); 2657 diag = kdb_getword(&val, addr, bytesperword);
2658 if (diag) { 2658 if (diag) {
2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " 2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2660 "read, diag=%d\n", cpu, addr, diag); 2660 "read, diag=%d\n", cpu, addr, diag);
2661 continue; 2661 continue;
2662 } 2662 }
2663#ifdef CONFIG_SMP
2664 if (!val) {
2665 cpu_set(cpu, suppress);
2666 continue;
2667 }
2668#endif /* CONFIG_SMP */
2669 kdb_printf("%5d ", cpu); 2663 kdb_printf("%5d ", cpu);
2670 kdb_md_line(fmtstr, addr, 2664 kdb_md_line(fmtstr, addr,
2671 bytesperword == KDB_WORD_SIZE, 2665 bytesperword == KDB_WORD_SIZE,
2672 1, bytesperword, 1, 1, 0); 2666 1, bytesperword, 1, 1, 0);
2673 } 2667 }
2674 if (cpus_weight(suppress) == 0)
2675 return 0;
2676 kdb_printf("Zero suppressed cpu(s):");
2677 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2678 cpu = next_cpu(cpu, suppress)) {
2679 kdb_printf(" %d", cpu);
2680 if (cpu == num_possible_cpus() - 1 ||
2681 next_cpu(cpu, suppress) != cpu + 1)
2682 continue;
2683 while (cpu < num_possible_cpus() &&
2684 next_cpu(cpu, suppress) == cpu + 1)
2685 ++cpu;
2686 kdb_printf("-%d", cpu);
2687 }
2688 kdb_printf("\n");
2689
2690#undef KDB_PCU 2668#undef KDB_PCU
2691
2692 return 0; 2669 return 0;
2693} 2670}
2694 2671
@@ -2763,13 +2740,13 @@ int kdb_register_repeat(char *cmd,
2763 } 2740 }
2764 if (kdb_commands) { 2741 if (kdb_commands) {
2765 memcpy(new, kdb_commands, 2742 memcpy(new, kdb_commands,
2766 kdb_max_commands * sizeof(*new)); 2743 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2767 kfree(kdb_commands); 2744 kfree(kdb_commands);
2768 } 2745 }
2769 memset(new + kdb_max_commands, 0, 2746 memset(new + kdb_max_commands, 0,
2770 kdb_command_extend * sizeof(*new)); 2747 kdb_command_extend * sizeof(*new));
2771 kdb_commands = new; 2748 kdb_commands = new;
2772 kp = kdb_commands + kdb_max_commands; 2749 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
2773 kdb_max_commands += kdb_command_extend; 2750 kdb_max_commands += kdb_command_extend;
2774 } 2751 }
2775 2752
diff --git a/kernel/exit.c b/kernel/exit.c
index e2bdf37f9fde..676149a4ac5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -95,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk)
95 sig->tty = NULL; 96 sig->tty = NULL;
96 } else { 97 } else {
97 /* 98 /*
99 * This can only happen if the caller is de_thread().
100 * FIXME: this is the temporary hack, we should teach
101 * posix-cpu-timers to handle this case correctly.
102 */
103 if (unlikely(has_group_leader_pid(tsk)))
104 posix_cpu_timers_exit_group(tsk);
105
106 /*
98 * If there is any task waiting for the group exit 107 * If there is any task waiting for the group exit
99 * then notify it: 108 * then notify it:
100 */ 109 */
@@ -687,6 +696,8 @@ static void exit_mm(struct task_struct * tsk)
687 enter_lazy_tlb(mm, current); 696 enter_lazy_tlb(mm, current);
688 /* We don't want this task to be frozen prematurely */ 697 /* We don't want this task to be frozen prematurely */
689 clear_freeze_flag(tsk); 698 clear_freeze_flag(tsk);
699 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
700 atomic_dec(&mm->oom_disable_count);
690 task_unlock(tsk); 701 task_unlock(tsk);
691 mm_update_next_owner(mm); 702 mm_update_next_owner(mm);
692 mmput(mm); 703 mmput(mm);
@@ -700,6 +711,8 @@ static void exit_mm(struct task_struct * tsk)
700 * space. 711 * space.
701 */ 712 */
702static struct task_struct *find_new_reaper(struct task_struct *father) 713static struct task_struct *find_new_reaper(struct task_struct *father)
714 __releases(&tasklist_lock)
715 __acquires(&tasklist_lock)
703{ 716{
704 struct pid_namespace *pid_ns = task_active_pid_ns(father); 717 struct pid_namespace *pid_ns = task_active_pid_ns(father);
705 struct task_struct *thread; 718 struct task_struct *thread;
@@ -901,6 +914,15 @@ NORET_TYPE void do_exit(long code)
901 if (unlikely(!tsk->pid)) 914 if (unlikely(!tsk->pid))
902 panic("Attempted to kill the idle task!"); 915 panic("Attempted to kill the idle task!");
903 916
917 /*
918 * If do_exit is called because this processes oopsed, it's possible
919 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
920 * continuing. Amongst other possible reasons, this is to prevent
921 * mm_release()->clear_child_tid() from writing to a user-controlled
922 * kernel address.
923 */
924 set_fs(USER_DS);
925
904 tracehook_report_exit(&code); 926 tracehook_report_exit(&code);
905 927
906 validate_creds_for_do_exit(tsk); 928 validate_creds_for_do_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index c445f8cc408d..3b159c5991b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h>
68 69
69#include <asm/pgtable.h> 70#include <asm/pgtable.h>
70#include <asm/pgalloc.h> 71#include <asm/pgalloc.h>
@@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
488 mm->cached_hole_size = ~0UL; 489 mm->cached_hole_size = ~0UL;
489 mm_init_aio(mm); 490 mm_init_aio(mm);
490 mm_init_owner(mm, p); 491 mm_init_owner(mm, p);
492 atomic_set(&mm->oom_disable_count, 0);
491 493
492 if (likely(!mm_alloc_pgd(mm))) { 494 if (likely(!mm_alloc_pgd(mm))) {
493 mm->def_flags = 0; 495 mm->def_flags = 0;
@@ -741,6 +743,8 @@ good_mm:
741 /* Initializing for Swap token stuff */ 743 /* Initializing for Swap token stuff */
742 mm->token_priority = 0; 744 mm->token_priority = 0;
743 mm->last_interval = 0; 745 mm->last_interval = 0;
746 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
747 atomic_inc(&mm->oom_disable_count);
744 748
745 tsk->mm = mm; 749 tsk->mm = mm;
746 tsk->active_mm = mm; 750 tsk->active_mm = mm;
@@ -904,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 sig->oom_adj = current->signal->oom_adj; 908 sig->oom_adj = current->signal->oom_adj;
905 sig->oom_score_adj = current->signal->oom_score_adj; 909 sig->oom_score_adj = current->signal->oom_score_adj;
906 910
911 mutex_init(&sig->cred_guard_mutex);
912
907 return 0; 913 return 0;
908} 914}
909 915
@@ -1299,8 +1305,13 @@ bad_fork_cleanup_io:
1299bad_fork_cleanup_namespaces: 1305bad_fork_cleanup_namespaces:
1300 exit_task_namespaces(p); 1306 exit_task_namespaces(p);
1301bad_fork_cleanup_mm: 1307bad_fork_cleanup_mm:
1302 if (p->mm) 1308 if (p->mm) {
1309 task_lock(p);
1310 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1311 atomic_dec(&p->mm->oom_disable_count);
1312 task_unlock(p);
1303 mmput(p->mm); 1313 mmput(p->mm);
1314 }
1304bad_fork_cleanup_signal: 1315bad_fork_cleanup_signal:
1305 if (!(clone_flags & CLONE_THREAD)) 1316 if (!(clone_flags & CLONE_THREAD))
1306 free_signal_struct(p->signal); 1317 free_signal_struct(p->signal);
@@ -1693,6 +1704,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1693 active_mm = current->active_mm; 1704 active_mm = current->active_mm;
1694 current->mm = new_mm; 1705 current->mm = new_mm;
1695 current->active_mm = new_mm; 1706 current->active_mm = new_mm;
1707 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1708 atomic_dec(&mm->oom_disable_count);
1709 atomic_inc(&new_mm->oom_disable_count);
1710 }
1696 activate_mm(active_mm, new_mm); 1711 activate_mm(active_mm, new_mm);
1697 new_mm = mm; 1712 new_mm = mm;
1698 } 1713 }
diff --git a/kernel/futex.c b/kernel/futex.c
index a118bf160e0b..40a8777a27d0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key)
169 169
170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
171 case FUT_OFF_INODE: 171 case FUT_OFF_INODE:
172 atomic_inc(&key->shared.inode->i_count); 172 ihold(key->shared.inode);
173 break; 173 break;
174 case FUT_OFF_MMSHARED: 174 case FUT_OFF_MMSHARED:
175 atomic_inc(&key->private.mm->mm_count); 175 atomic_inc(&key->private.mm->mm_count);
@@ -2489,7 +2489,8 @@ void exit_robust_list(struct task_struct *curr)
2489{ 2489{
2490 struct robust_list_head __user *head = curr->robust_list; 2490 struct robust_list_head __user *head = curr->robust_list;
2491 struct robust_list __user *entry, *next_entry, *pending; 2491 struct robust_list __user *entry, *next_entry, *pending;
2492 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 2492 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2493 unsigned int uninitialized_var(next_pi);
2493 unsigned long futex_offset; 2494 unsigned long futex_offset;
2494 int rc; 2495 int rc;
2495 2496
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 06da4dfc339b..a7934ac75e5b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr)
49{ 49{
50 struct compat_robust_list_head __user *head = curr->compat_robust_list; 50 struct compat_robust_list_head __user *head = curr->compat_robust_list;
51 struct robust_list __user *entry, *next_entry, *pending; 51 struct robust_list __user *entry, *next_entry, *pending;
52 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 52 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
53 unsigned int uninitialized_var(next_pi);
53 compat_uptr_t uentry, next_uentry, upending; 54 compat_uptr_t uentry, next_uentry, upending;
54 compat_long_t futex_offset; 55 compat_long_t futex_offset;
55 int rc; 56 int rc;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2c9120f0afca..e5325825aeb6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = {
620 .read = hw_breakpoint_pmu_read, 620 .read = hw_breakpoint_pmu_read,
621}; 621};
622 622
623static int __init init_hw_breakpoint(void) 623int __init init_hw_breakpoint(void)
624{ 624{
625 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
626 int cpu, err_cpu; 626 int cpu, err_cpu;
@@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void)
655 655
656 return -ENOMEM; 656 return -ENOMEM;
657} 657}
658core_initcall(init_hw_breakpoint);
659 658
660 659
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9d917ff72675..9988d03797f5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
393 struct irq_desc *desc = irq_to_desc(irq); 393 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 394 return desc ? desc->kstat_irqs[cpu] : 0;
395} 395}
396
397#ifdef CONFIG_GENERIC_HARDIRQS
398unsigned int kstat_irqs(unsigned int irq)
399{
400 struct irq_desc *desc = irq_to_desc(irq);
401 int cpu;
402 int sum = 0;
403
404 if (!desc)
405 return 0;
406 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu];
408 return sum;
409}
410#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 644e8d5fa367..5f92acc5f952 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -324,6 +324,10 @@ void enable_irq(unsigned int irq)
324 if (!desc) 324 if (!desc)
325 return; 325 return;
326 326
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
327 chip_bus_lock(desc); 331 chip_bus_lock(desc);
328 raw_spin_lock_irqsave(&desc->lock, flags); 332 raw_spin_lock_irqsave(&desc->lock, flags);
329 __enable_irq(desc, irq, false); 333 __enable_irq(desc, irq, false);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 01b1d3a88983..6c8a2a9f8a7b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
214 214
215static int irq_spurious_proc_open(struct inode *inode, struct file *file) 215static int irq_spurious_proc_open(struct inode *inode, struct file *file)
216{ 216{
217 return single_open(file, irq_spurious_proc_show, NULL); 217 return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
218} 218}
219 219
220static const struct file_operations irq_spurious_proc_fops = { 220static const struct file_operations irq_spurious_proc_fops = {
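
The one-line irq/proc.c change hands PDE(inode)->data to single_open() so that irq_spurious_proc_show() receives the per-interrupt data the entry was registered with instead of NULL. A generic sketch of that seq_file pattern follows, with made-up names (struct example_stats, example_proc_*); it is not the kernel's code.

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

struct example_stats {			/* hypothetical per-entry payload */
	unsigned long unhandled;
};

static int example_proc_show(struct seq_file *m, void *v)
{
	struct example_stats *st = m->private;	/* the pointer passed to single_open() */

	seq_printf(m, "%lu\n", st->unhandled);
	return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
	/* recover the data stored in the proc entry, as the fix above does */
	return single_open(file, example_proc_show, PDE(inode)->data);
}

The matching registration side would be proc_create_data(name, mode, parent, &fops, st), which is what populates the pointer that PDE(inode)->data later returns.
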
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f16763ff8481..90f881904bb1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -145,7 +145,9 @@ void irq_work_run(void)
145 * Clear the BUSY bit and return to the free state if 145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 146 * no-one else claimed it meanwhile.
147 */ 147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); 148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
149 } 151 }
150} 152}
151EXPORT_SYMBOL_GPL(irq_work_run); 153EXPORT_SYMBOL_GPL(irq_work_run);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 7be868bf25c6..3b79bd938330 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -39,6 +39,16 @@ struct jump_label_module_entry {
39 struct module *mod; 39 struct module *mod;
40}; 40};
41 41
42void jump_label_lock(void)
43{
44 mutex_lock(&jump_label_mutex);
45}
46
47void jump_label_unlock(void)
48{
49 mutex_unlock(&jump_label_mutex);
50}
51
42static int jump_label_cmp(const void *a, const void *b) 52static int jump_label_cmp(const void *a, const void *b)
43{ 53{
44 const struct jump_entry *jea = a; 54 const struct jump_entry *jea = a;
@@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
152 struct jump_label_module_entry *e_module; 162 struct jump_label_module_entry *e_module;
153 int count; 163 int count;
154 164
155 mutex_lock(&jump_label_mutex); 165 jump_label_lock();
156 entry = get_jump_label_entry((jump_label_t)key); 166 entry = get_jump_label_entry((jump_label_t)key);
157 if (entry) { 167 if (entry) {
158 count = entry->nr_entries; 168 count = entry->nr_entries;
@@ -168,13 +178,14 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
168 count = e_module->nr_entries; 178 count = e_module->nr_entries;
169 iter = e_module->table; 179 iter = e_module->table;
170 while (count--) { 180 while (count--) {
171 if (kernel_text_address(iter->code)) 181 if (iter->key &&
182 kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type); 183 arch_jump_label_transform(iter, type);
173 iter++; 184 iter++;
174 } 185 }
175 } 186 }
176 } 187 }
177 mutex_unlock(&jump_label_mutex); 188 jump_label_unlock();
178} 189}
179 190
180static int addr_conflict(struct jump_entry *entry, void *start, void *end) 191static int addr_conflict(struct jump_entry *entry, void *start, void *end)
@@ -231,6 +242,7 @@ out:
231 * overlaps with any of the jump label patch addresses. Code 242 * overlaps with any of the jump label patch addresses. Code
232 * that wants to modify kernel text should first verify that 243 * that wants to modify kernel text should first verify that
233 * it does not overlap with any of the jump label addresses. 244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
234 * 246 *
235 * returns 1 if there is an overlap, 0 otherwise 247 * returns 1 if there is an overlap, 0 otherwise
236 */ 248 */
@@ -241,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end)
241 struct jump_entry *iter_stop = __start___jump_table; 253 struct jump_entry *iter_stop = __start___jump_table;
242 int conflict = 0; 254 int conflict = 0;
243 255
244 mutex_lock(&jump_label_mutex);
245 iter = iter_start; 256 iter = iter_start;
246 while (iter < iter_stop) { 257 while (iter < iter_stop) {
247 if (addr_conflict(iter, start, end)) { 258 if (addr_conflict(iter, start, end)) {
@@ -256,10 +267,16 @@ int jump_label_text_reserved(void *start, void *end)
256 conflict = module_conflict(start, end); 267 conflict = module_conflict(start, end);
257#endif 268#endif
258out: 269out:
259 mutex_unlock(&jump_label_mutex);
260 return conflict; 270 return conflict;
261} 271}
262 272
273/*
274 * Not all archs need this.
275 */
276void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{
278}
279
263static __init int init_jump_label(void) 280static __init int init_jump_label(void)
264{ 281{
265 int ret; 282 int ret;
@@ -267,7 +284,7 @@ static __init int init_jump_label(void)
267 struct jump_entry *iter_stop = __stop___jump_table; 284 struct jump_entry *iter_stop = __stop___jump_table;
268 struct jump_entry *iter; 285 struct jump_entry *iter;
269 286
270 mutex_lock(&jump_label_mutex); 287 jump_label_lock();
271 ret = build_jump_label_hashtable(__start___jump_table, 288 ret = build_jump_label_hashtable(__start___jump_table,
272 __stop___jump_table); 289 __stop___jump_table);
273 iter = iter_start; 290 iter = iter_start;
@@ -275,7 +292,7 @@ static __init int init_jump_label(void)
275 arch_jump_label_text_poke_early(iter->code); 292 arch_jump_label_text_poke_early(iter->code);
276 iter++; 293 iter++;
277 } 294 }
278 mutex_unlock(&jump_label_mutex); 295 jump_label_unlock();
279 return ret; 296 return ret;
280} 297}
281early_initcall(init_jump_label); 298early_initcall(init_jump_label);
@@ -366,6 +383,39 @@ static void remove_jump_label_module(struct module *mod)
366 } 383 }
367} 384}
368 385
386static void remove_jump_label_module_init(struct module *mod)
387{
388 struct hlist_head *head;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter;
393 int i, count;
394
395 /* if the module doesn't have jump label entries, just return */
396 if (!mod->num_jump_entries)
397 return;
398
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
400 head = &jump_label_table[i];
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 }
416 }
417}
418
369static int 419static int
370jump_label_module_notify(struct notifier_block *self, unsigned long val, 420jump_label_module_notify(struct notifier_block *self, unsigned long val,
371 void *data) 421 void *data)
@@ -375,16 +425,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
375 425
376 switch (val) { 426 switch (val) {
377 case MODULE_STATE_COMING: 427 case MODULE_STATE_COMING:
378 mutex_lock(&jump_label_mutex); 428 jump_label_lock();
379 ret = add_jump_label_module(mod); 429 ret = add_jump_label_module(mod);
380 if (ret) 430 if (ret)
381 remove_jump_label_module(mod); 431 remove_jump_label_module(mod);
382 mutex_unlock(&jump_label_mutex); 432 jump_label_unlock();
383 break; 433 break;
384 case MODULE_STATE_GOING: 434 case MODULE_STATE_GOING:
385 mutex_lock(&jump_label_mutex); 435 jump_label_lock();
386 remove_jump_label_module(mod); 436 remove_jump_label_module(mod);
387 mutex_unlock(&jump_label_mutex); 437 jump_label_unlock();
438 break;
439 case MODULE_STATE_LIVE:
440 jump_label_lock();
441 remove_jump_label_module_init(mod);
442 jump_label_unlock();
388 break; 443 break;
389 } 444 }
390 return ret; 445 return ret;
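The MODULE_STATE_LIVE case added above runs once a module's init sections have been freed; remove_jump_label_module_init() then zeroes the keys of jump entries that pointed into .init.text so they can never be patched later. A minimal sketch of the same module-notifier idiom, with hypothetical names (my_jump_notify, my_drop_init_entries) and only the standard notifier API assumed:

#include <linux/module.h>
#include <linux/notifier.h>

/* Hypothetical helper: forget jump entries that lived in mod's .init text. */
static void my_drop_init_entries(struct module *mod)
{
        /* walk a private table and clear entries matching within_module_init() */
}

static int my_jump_notify(struct notifier_block *self, unsigned long val,
                          void *data)
{
        struct module *mod = data;

        if (val == MODULE_STATE_LIVE)
                my_drop_init_entries(mod);      /* .init.text is gone by now */
        return 0;
}

static struct notifier_block my_jump_nb = {
        .notifier_call = my_jump_notify,
};

static int __init my_jump_init(void)
{
        return register_module_notifier(&my_jump_nb);
}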
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0613f7d6730..b55045bc7563 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
816 816
817 ptr = kmap(page); 817 ptr = kmap(page);
818 /* Start with a clear page */ 818 /* Start with a clear page */
819 memset(ptr, 0, PAGE_SIZE); 819 clear_page(ptr);
820 ptr += maddr & ~PAGE_MASK; 820 ptr += maddr & ~PAGE_MASK;
821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
822 if (mchunk > mbytes) 822 if (mchunk > mbytes)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 56a891914273..9737a76e106f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
74/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
75static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
76 76
77static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
78static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
79static struct { 80static struct {
80 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
595} 596}
596 597
597#ifdef CONFIG_SYSCTL 598#ifdef CONFIG_SYSCTL
599/* This should be called with kprobe_mutex locked */
598static void __kprobes optimize_all_kprobes(void) 600static void __kprobes optimize_all_kprobes(void)
599{ 601{
600 struct hlist_head *head; 602 struct hlist_head *head;
@@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
607 return; 609 return;
608 610
609 kprobes_allow_optimization = true; 611 kprobes_allow_optimization = true;
610 mutex_lock(&text_mutex);
611 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 612 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
612 head = &kprobe_table[i]; 613 head = &kprobe_table[i];
613 hlist_for_each_entry_rcu(p, node, head, hlist) 614 hlist_for_each_entry_rcu(p, node, head, hlist)
614 if (!kprobe_disabled(p)) 615 if (!kprobe_disabled(p))
615 optimize_kprobe(p); 616 optimize_kprobe(p);
616 } 617 }
617 mutex_unlock(&text_mutex);
618 printk(KERN_INFO "Kprobes globally optimized\n"); 618 printk(KERN_INFO "Kprobes globally optimized\n");
619} 619}
620 620
621/* This should be called with kprobe_mutex locked */
621static void __kprobes unoptimize_all_kprobes(void) 622static void __kprobes unoptimize_all_kprobes(void)
622{ 623{
623 struct hlist_head *head; 624 struct hlist_head *head;
@@ -1144,14 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p)
1144 if (ret) 1145 if (ret)
1145 return ret; 1146 return ret;
1146 1147
1148 jump_label_lock();
1147 preempt_disable(); 1149 preempt_disable();
1148 if (!kernel_text_address((unsigned long) p->addr) || 1150 if (!kernel_text_address((unsigned long) p->addr) ||
1149 in_kprobes_functions((unsigned long) p->addr) || 1151 in_kprobes_functions((unsigned long) p->addr) ||
1150 ftrace_text_reserved(p->addr, p->addr) || 1152 ftrace_text_reserved(p->addr, p->addr) ||
1151 jump_label_text_reserved(p->addr, p->addr)) { 1153 jump_label_text_reserved(p->addr, p->addr))
1152 preempt_enable(); 1154 goto fail_with_jump_label;
1153 return -EINVAL;
1154 }
1155 1155
1156 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1156 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1157 p->flags &= KPROBE_FLAG_DISABLED; 1157 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1165,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p)
1165 * We must hold a refcount of the probed module while updating 1165 * We must hold a refcount of the probed module while updating
1166 * its code to prohibit unexpected unloading. 1166 * its code to prohibit unexpected unloading.
1167 */ 1167 */
1168 if (unlikely(!try_module_get(probed_mod))) { 1168 if (unlikely(!try_module_get(probed_mod)))
1169 preempt_enable(); 1169 goto fail_with_jump_label;
1170 return -EINVAL; 1170
1171 }
1172 /* 1171 /*
1173 * If the module freed .init.text, we couldn't insert 1172 * If the module freed .init.text, we couldn't insert
1174 * kprobes in there. 1173 * kprobes in there.
@@ -1176,16 +1175,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1176 if (within_module_init((unsigned long)p->addr, probed_mod) && 1175 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1177 probed_mod->state != MODULE_STATE_COMING) { 1176 probed_mod->state != MODULE_STATE_COMING) {
1178 module_put(probed_mod); 1177 module_put(probed_mod);
1179 preempt_enable(); 1178 goto fail_with_jump_label;
1180 return -EINVAL;
1181 } 1179 }
1182 } 1180 }
1183 preempt_enable(); 1181 preempt_enable();
1182 jump_label_unlock();
1184 1183
1185 p->nmissed = 0; 1184 p->nmissed = 0;
1186 INIT_LIST_HEAD(&p->list); 1185 INIT_LIST_HEAD(&p->list);
1187 mutex_lock(&kprobe_mutex); 1186 mutex_lock(&kprobe_mutex);
1188 1187
1188 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1189
1189 get_online_cpus(); /* For avoiding text_mutex deadlock. */ 1190 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1190 mutex_lock(&text_mutex); 1191 mutex_lock(&text_mutex);
1191 1192
@@ -1213,12 +1214,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1213out: 1214out:
1214 mutex_unlock(&text_mutex); 1215 mutex_unlock(&text_mutex);
1215 put_online_cpus(); 1216 put_online_cpus();
1217 jump_label_unlock();
1216 mutex_unlock(&kprobe_mutex); 1218 mutex_unlock(&kprobe_mutex);
1217 1219
1218 if (probed_mod) 1220 if (probed_mod)
1219 module_put(probed_mod); 1221 module_put(probed_mod);
1220 1222
1221 return ret; 1223 return ret;
1224
1225fail_with_jump_label:
1226 preempt_enable();
1227 jump_label_unlock();
1228 return -EINVAL;
1222} 1229}
1223EXPORT_SYMBOL_GPL(register_kprobe); 1230EXPORT_SYMBOL_GPL(register_kprobe);
1224 1231
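The register_kprobe() hunks above take jump_label_lock() ahead of the text-reservation checks and route every early failure through the single fail_with_jump_label label, so preemption and the lock are always released in reverse order of acquisition instead of in three duplicated exit paths. A generic hedged sketch of that single-unwind-label idiom (placeholder names, not the kprobes code itself):

static int my_check(void)  { return 1; }        /* placeholder validation   */
static int my_commit(void) { return 0; }        /* placeholder registration */

static int my_register(void)
{
        jump_label_lock();
        preempt_disable();

        if (!my_check())
                goto fail;                      /* one cleanup path for all errors */

        preempt_enable();
        jump_label_unlock();
        return my_commit();

fail:
        preempt_enable();                       /* release in reverse order */
        jump_label_unlock();
        return -EINVAL;
}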
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..17110a4a4fc2 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
197 /* 197 for (i = 0; i < tsk->latency_record_count; i++) {
198 * short term hack; if we're > 32 we stop; future we recycle:
199 */
200 tsk->latency_record_count++;
201 if (tsk->latency_record_count >= LT_SAVECOUNT)
202 goto out_unlock;
203
204 for (i = 0; i < LT_SAVECOUNT; i++) {
205 struct latency_record *mylat; 198 struct latency_record *mylat;
206 int same = 1; 199 int same = 1;
207 200
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
227 } 220 }
228 } 221 }
229 222
223 /*
224 * short term hack; if we're > 32 we stop; future we recycle:
225 */
226 if (tsk->latency_record_count >= LT_SAVECOUNT)
227 goto out_unlock;
228
230 /* Allocated a new one: */ 229 /* Allocated a new one: */
231 i = tsk->latency_record_count; 230 i = tsk->latency_record_count++;
232 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
233 232
234out_unlock: 233out_unlock:
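After this change the scan walks only the latency_record_count entries that actually exist, and the counter is bumped only when a genuinely new record is appended while there is still room. A simplified sketch of the corrected flow, using a hypothetical record type rather than the real backtrace comparison:

#define LT_SAVECOUNT 32

struct my_latency_record {
        unsigned long backtrace_hash;   /* stands in for the real backtrace match */
        unsigned long count;
        unsigned long time;
};

/* Hedged sketch of the fixed accounting logic, not the kernel function. */
static void my_account(struct my_latency_record *tbl, int *nr,
                       unsigned long hash, unsigned long usecs)
{
        int i;

        for (i = 0; i < *nr; i++) {             /* only the used slots */
                if (tbl[i].backtrace_hash == hash) {
                        tbl[i].count++;
                        tbl[i].time += usecs;
                        return;
                }
        }

        if (*nr >= LT_SAVECOUNT)                /* table full: drop for now */
                return;

        i = (*nr)++;                            /* append and count it */
        tbl[i].backtrace_hash = hash;
        tbl[i].count = 1;
        tbl[i].time = usecs;
}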
diff --git a/kernel/module.c b/kernel/module.c
index 2df46301a7a4..d190664f25ff 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)
2037{ 2037{
2038} 2038}
2039 2039
2040static void add_kallsyms(struct module *mod, struct load_info *info) 2040static void add_kallsyms(struct module *mod, const struct load_info *info)
2041{ 2041{
2042} 2042}
2043#endif /* CONFIG_KALLSYMS */ 2043#endif /* CONFIG_KALLSYMS */
@@ -2326,6 +2326,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * 2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2327 mod->num_trace_events, GFP_KERNEL); 2327 mod->num_trace_events, GFP_KERNEL);
2328#endif 2328#endif
2329#ifdef CONFIG_TRACING
2330 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2331 sizeof(*mod->trace_bprintk_fmt_start),
2332 &mod->num_trace_bprintk_fmt);
2333 /*
2334 * This section contains pointers to allocated objects in the trace
2335 * code and not scanning it leads to false positives.
2336 */
2337 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2338 sizeof(*mod->trace_bprintk_fmt_start) *
2339 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2340#endif
2329#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2341#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2330 /* sechdrs[0].sh_size is always zero */ 2342 /* sechdrs[0].sh_size is always zero */
2331 mod->ftrace_callsites = section_objs(info, "__mcount_loc", 2343 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec8efe0..2c98ad94ba0e 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
85 return ERR_PTR(-EPERM); 85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current)) 86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM); 87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
88 96
89 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
90 if (!ns_cgroup) 98 if (!ns_cgroup)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f309e8014c78..eac7e3364335 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,6 +31,7 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
34 35
35#include <asm/irq_regs.h> 36#include <asm/irq_regs.h>
36 37
@@ -417,8 +418,8 @@ event_filter_match(struct perf_event *event)
417 return event->cpu == -1 || event->cpu == smp_processor_id(); 418 return event->cpu == -1 || event->cpu == smp_processor_id();
418} 419}
419 420
420static int 421static void
421__event_sched_out(struct perf_event *event, 422event_sched_out(struct perf_event *event,
422 struct perf_cpu_context *cpuctx, 423 struct perf_cpu_context *cpuctx,
423 struct perf_event_context *ctx) 424 struct perf_event_context *ctx)
424{ 425{
@@ -437,13 +438,14 @@ __event_sched_out(struct perf_event *event,
437 } 438 }
438 439
439 if (event->state != PERF_EVENT_STATE_ACTIVE) 440 if (event->state != PERF_EVENT_STATE_ACTIVE)
440 return 0; 441 return;
441 442
442 event->state = PERF_EVENT_STATE_INACTIVE; 443 event->state = PERF_EVENT_STATE_INACTIVE;
443 if (event->pending_disable) { 444 if (event->pending_disable) {
444 event->pending_disable = 0; 445 event->pending_disable = 0;
445 event->state = PERF_EVENT_STATE_OFF; 446 event->state = PERF_EVENT_STATE_OFF;
446 } 447 }
448 event->tstamp_stopped = ctx->time;
447 event->pmu->del(event, 0); 449 event->pmu->del(event, 0);
448 event->oncpu = -1; 450 event->oncpu = -1;
449 451
@@ -452,19 +454,6 @@ __event_sched_out(struct perf_event *event,
452 ctx->nr_active--; 454 ctx->nr_active--;
453 if (event->attr.exclusive || !cpuctx->active_oncpu) 455 if (event->attr.exclusive || !cpuctx->active_oncpu)
454 cpuctx->exclusive = 0; 456 cpuctx->exclusive = 0;
455 return 1;
456}
457
458static void
459event_sched_out(struct perf_event *event,
460 struct perf_cpu_context *cpuctx,
461 struct perf_event_context *ctx)
462{
463 int ret;
464
465 ret = __event_sched_out(event, cpuctx, ctx);
466 if (ret)
467 event->tstamp_stopped = ctx->time;
468} 457}
469 458
470static void 459static void
@@ -664,7 +653,7 @@ retry:
664} 653}
665 654
666static int 655static int
667__event_sched_in(struct perf_event *event, 656event_sched_in(struct perf_event *event,
668 struct perf_cpu_context *cpuctx, 657 struct perf_cpu_context *cpuctx,
669 struct perf_event_context *ctx) 658 struct perf_event_context *ctx)
670{ 659{
@@ -684,6 +673,10 @@ __event_sched_in(struct perf_event *event,
684 return -EAGAIN; 673 return -EAGAIN;
685 } 674 }
686 675
676 event->tstamp_running += ctx->time - event->tstamp_stopped;
677
678 event->shadow_ctx_time = ctx->time - ctx->timestamp;
679
687 if (!is_software_event(event)) 680 if (!is_software_event(event))
688 cpuctx->active_oncpu++; 681 cpuctx->active_oncpu++;
689 ctx->nr_active++; 682 ctx->nr_active++;
@@ -694,35 +687,6 @@ __event_sched_in(struct perf_event *event,
694 return 0; 687 return 0;
695} 688}
696 689
697static inline int
698event_sched_in(struct perf_event *event,
699 struct perf_cpu_context *cpuctx,
700 struct perf_event_context *ctx)
701{
702 int ret = __event_sched_in(event, cpuctx, ctx);
703 if (ret)
704 return ret;
705 event->tstamp_running += ctx->time - event->tstamp_stopped;
706 return 0;
707}
708
709static void
710group_commit_event_sched_in(struct perf_event *group_event,
711 struct perf_cpu_context *cpuctx,
712 struct perf_event_context *ctx)
713{
714 struct perf_event *event;
715 u64 now = ctx->time;
716
717 group_event->tstamp_running += now - group_event->tstamp_stopped;
718 /*
719 * Schedule in siblings as one group (if any):
720 */
721 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
722 event->tstamp_running += now - event->tstamp_stopped;
723 }
724}
725
726static int 690static int
727group_sched_in(struct perf_event *group_event, 691group_sched_in(struct perf_event *group_event,
728 struct perf_cpu_context *cpuctx, 692 struct perf_cpu_context *cpuctx,
@@ -730,19 +694,15 @@ group_sched_in(struct perf_event *group_event,
730{ 694{
731 struct perf_event *event, *partial_group = NULL; 695 struct perf_event *event, *partial_group = NULL;
732 struct pmu *pmu = group_event->pmu; 696 struct pmu *pmu = group_event->pmu;
697 u64 now = ctx->time;
698 bool simulate = false;
733 699
734 if (group_event->state == PERF_EVENT_STATE_OFF) 700 if (group_event->state == PERF_EVENT_STATE_OFF)
735 return 0; 701 return 0;
736 702
737 pmu->start_txn(pmu); 703 pmu->start_txn(pmu);
738 704
739 /* 705 if (event_sched_in(group_event, cpuctx, ctx)) {
740 * use __event_sched_in() to delay updating tstamp_running
741 * until the transaction is committed. In case of failure
742 * we will keep an unmodified tstamp_running which is a
743 * requirement to get correct timing information
744 */
745 if (__event_sched_in(group_event, cpuctx, ctx)) {
746 pmu->cancel_txn(pmu); 706 pmu->cancel_txn(pmu);
747 return -EAGAIN; 707 return -EAGAIN;
748 } 708 }
@@ -751,31 +711,42 @@ group_sched_in(struct perf_event *group_event,
751 * Schedule in siblings as one group (if any): 711 * Schedule in siblings as one group (if any):
752 */ 712 */
753 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 713 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
754 if (__event_sched_in(event, cpuctx, ctx)) { 714 if (event_sched_in(event, cpuctx, ctx)) {
755 partial_group = event; 715 partial_group = event;
756 goto group_error; 716 goto group_error;
757 } 717 }
758 } 718 }
759 719
760 if (!pmu->commit_txn(pmu)) { 720 if (!pmu->commit_txn(pmu))
761 /* commit tstamp_running */
762 group_commit_event_sched_in(group_event, cpuctx, ctx);
763 return 0; 721 return 0;
764 } 722
765group_error: 723group_error:
766 /* 724 /*
767 * Groups can be scheduled in as one unit only, so undo any 725 * Groups can be scheduled in as one unit only, so undo any
768 * partial group before returning: 726 * partial group before returning:
 727 * The events up to the failed event are scheduled out normally;
 728 * their tstamp_stopped will be updated.
769 * 729 *
770 * use __event_sched_out() to avoid updating tstamp_stopped 730 * The failed events and the remaining siblings need to have
771 * because the event never actually ran 731 * their timings updated as if they had gone thru event_sched_in()
732 * and event_sched_out(). This is required to get consistent timings
733 * across the group. This also takes care of the case where the group
734 * could never be scheduled by ensuring tstamp_stopped is set to mark
735 * the time the event was actually stopped, such that time delta
736 * calculation in update_event_times() is correct.
772 */ 737 */
773 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 738 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
774 if (event == partial_group) 739 if (event == partial_group)
775 break; 740 simulate = true;
776 __event_sched_out(event, cpuctx, ctx); 741
742 if (simulate) {
743 event->tstamp_running += now - event->tstamp_stopped;
744 event->tstamp_stopped = now;
745 } else {
746 event_sched_out(event, cpuctx, ctx);
747 }
777 } 748 }
778 __event_sched_out(group_event, cpuctx, ctx); 749 event_sched_out(group_event, cpuctx, ctx);
779 750
780 pmu->cancel_txn(pmu); 751 pmu->cancel_txn(pmu);
781 752
@@ -1316,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
1316{ 1287{
1317 int ctxn; 1288 int ctxn;
1318 1289
1319 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1320
1321 for_each_task_context_nr(ctxn) 1290 for_each_task_context_nr(ctxn)
1322 perf_event_context_sched_out(task, ctxn, next); 1291 perf_event_context_sched_out(task, ctxn, next);
1323} 1292}
@@ -1651,8 +1620,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
1651{ 1620{
1652 raw_spin_lock(&ctx->lock); 1621 raw_spin_lock(&ctx->lock);
1653 1622
1654 /* Rotate the first entry last of non-pinned groups */ 1623 /*
1655 list_rotate_left(&ctx->flexible_groups); 1624 * Rotate the first entry last of non-pinned groups. Rotation might be
1625 * disabled by the inheritance code.
1626 */
1627 if (!ctx->rotate_disable)
1628 list_rotate_left(&ctx->flexible_groups);
1656 1629
1657 raw_spin_unlock(&ctx->lock); 1630 raw_spin_unlock(&ctx->lock);
1658} 1631}
@@ -2264,11 +2237,6 @@ int perf_event_release_kernel(struct perf_event *event)
2264 raw_spin_unlock_irq(&ctx->lock); 2237 raw_spin_unlock_irq(&ctx->lock);
2265 mutex_unlock(&ctx->mutex); 2238 mutex_unlock(&ctx->mutex);
2266 2239
2267 mutex_lock(&event->owner->perf_event_mutex);
2268 list_del_init(&event->owner_entry);
2269 mutex_unlock(&event->owner->perf_event_mutex);
2270 put_task_struct(event->owner);
2271
2272 free_event(event); 2240 free_event(event);
2273 2241
2274 return 0; 2242 return 0;
@@ -2281,9 +2249,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2281static int perf_release(struct inode *inode, struct file *file) 2249static int perf_release(struct inode *inode, struct file *file)
2282{ 2250{
2283 struct perf_event *event = file->private_data; 2251 struct perf_event *event = file->private_data;
2252 struct task_struct *owner;
2284 2253
2285 file->private_data = NULL; 2254 file->private_data = NULL;
2286 2255
2256 rcu_read_lock();
2257 owner = ACCESS_ONCE(event->owner);
2258 /*
2259 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2260 * !owner it means the list deletion is complete and we can indeed
2261 * free this event, otherwise we need to serialize on
2262 * owner->perf_event_mutex.
2263 */
2264 smp_read_barrier_depends();
2265 if (owner) {
2266 /*
2267 * Since delayed_put_task_struct() also drops the last
2268 * task reference we can safely take a new reference
2269 * while holding the rcu_read_lock().
2270 */
2271 get_task_struct(owner);
2272 }
2273 rcu_read_unlock();
2274
2275 if (owner) {
2276 mutex_lock(&owner->perf_event_mutex);
2277 /*
2278 * We have to re-check the event->owner field, if it is cleared
2279 * we raced with perf_event_exit_task(), acquiring the mutex
2280 * ensured they're done, and we can proceed with freeing the
2281 * event.
2282 */
2283 if (event->owner)
2284 list_del_init(&event->owner_entry);
2285 mutex_unlock(&owner->perf_event_mutex);
2286 put_task_struct(owner);
2287 }
2288
2287 return perf_event_release_kernel(event); 2289 return perf_event_release_kernel(event);
2288} 2290}
2289 2291
@@ -3428,7 +3430,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3428} 3430}
3429 3431
3430static void perf_output_read_one(struct perf_output_handle *handle, 3432static void perf_output_read_one(struct perf_output_handle *handle,
3431 struct perf_event *event) 3433 struct perf_event *event,
3434 u64 enabled, u64 running)
3432{ 3435{
3433 u64 read_format = event->attr.read_format; 3436 u64 read_format = event->attr.read_format;
3434 u64 values[4]; 3437 u64 values[4];
@@ -3436,11 +3439,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3436 3439
3437 values[n++] = perf_event_count(event); 3440 values[n++] = perf_event_count(event);
3438 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3441 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3439 values[n++] = event->total_time_enabled + 3442 values[n++] = enabled +
3440 atomic64_read(&event->child_total_time_enabled); 3443 atomic64_read(&event->child_total_time_enabled);
3441 } 3444 }
3442 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 3445 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3443 values[n++] = event->total_time_running + 3446 values[n++] = running +
3444 atomic64_read(&event->child_total_time_running); 3447 atomic64_read(&event->child_total_time_running);
3445 } 3448 }
3446 if (read_format & PERF_FORMAT_ID) 3449 if (read_format & PERF_FORMAT_ID)
@@ -3453,7 +3456,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3453 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 3456 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3454 */ 3457 */
3455static void perf_output_read_group(struct perf_output_handle *handle, 3458static void perf_output_read_group(struct perf_output_handle *handle,
3456 struct perf_event *event) 3459 struct perf_event *event,
3460 u64 enabled, u64 running)
3457{ 3461{
3458 struct perf_event *leader = event->group_leader, *sub; 3462 struct perf_event *leader = event->group_leader, *sub;
3459 u64 read_format = event->attr.read_format; 3463 u64 read_format = event->attr.read_format;
@@ -3463,10 +3467,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3463 values[n++] = 1 + leader->nr_siblings; 3467 values[n++] = 1 + leader->nr_siblings;
3464 3468
3465 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3469 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3466 values[n++] = leader->total_time_enabled; 3470 values[n++] = enabled;
3467 3471
3468 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3472 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3469 values[n++] = leader->total_time_running; 3473 values[n++] = running;
3470 3474
3471 if (leader != event) 3475 if (leader != event)
3472 leader->pmu->read(leader); 3476 leader->pmu->read(leader);
@@ -3491,13 +3495,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3491 } 3495 }
3492} 3496}
3493 3497
3498#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3499 PERF_FORMAT_TOTAL_TIME_RUNNING)
3500
3494static void perf_output_read(struct perf_output_handle *handle, 3501static void perf_output_read(struct perf_output_handle *handle,
3495 struct perf_event *event) 3502 struct perf_event *event)
3496{ 3503{
3504 u64 enabled = 0, running = 0, now, ctx_time;
3505 u64 read_format = event->attr.read_format;
3506
3507 /*
3508 * compute total_time_enabled, total_time_running
3509 * based on snapshot values taken when the event
3510 * was last scheduled in.
3511 *
3512 * we cannot simply call update_context_time()
3513 * because of locking issues, as we are called in
3514 * NMI context
3515 */
3516 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3517 now = perf_clock();
3518 ctx_time = event->shadow_ctx_time + now;
3519 enabled = ctx_time - event->tstamp_enabled;
3520 running = ctx_time - event->tstamp_running;
3521 }
3522
3497 if (event->attr.read_format & PERF_FORMAT_GROUP) 3523 if (event->attr.read_format & PERF_FORMAT_GROUP)
3498 perf_output_read_group(handle, event); 3524 perf_output_read_group(handle, event, enabled, running);
3499 else 3525 else
3500 perf_output_read_one(handle, event); 3526 perf_output_read_one(handle, event, enabled, running);
3501} 3527}
3502 3528
3503void perf_output_sample(struct perf_output_handle *handle, 3529void perf_output_sample(struct perf_output_handle *handle,
@@ -5683,7 +5709,7 @@ SYSCALL_DEFINE5(perf_event_open,
5683 mutex_unlock(&ctx->mutex); 5709 mutex_unlock(&ctx->mutex);
5684 5710
5685 event->owner = current; 5711 event->owner = current;
5686 get_task_struct(current); 5712
5687 mutex_lock(&current->perf_event_mutex); 5713 mutex_lock(&current->perf_event_mutex);
5688 list_add_tail(&event->owner_entry, &current->perf_event_list); 5714 list_add_tail(&event->owner_entry, &current->perf_event_list);
5689 mutex_unlock(&current->perf_event_mutex); 5715 mutex_unlock(&current->perf_event_mutex);
@@ -5751,12 +5777,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5751 ++ctx->generation; 5777 ++ctx->generation;
5752 mutex_unlock(&ctx->mutex); 5778 mutex_unlock(&ctx->mutex);
5753 5779
5754 event->owner = current;
5755 get_task_struct(current);
5756 mutex_lock(&current->perf_event_mutex);
5757 list_add_tail(&event->owner_entry, &current->perf_event_list);
5758 mutex_unlock(&current->perf_event_mutex);
5759
5760 return event; 5780 return event;
5761 5781
5762err_free: 5782err_free:
@@ -5907,8 +5927,24 @@ again:
5907 */ 5927 */
5908void perf_event_exit_task(struct task_struct *child) 5928void perf_event_exit_task(struct task_struct *child)
5909{ 5929{
5930 struct perf_event *event, *tmp;
5910 int ctxn; 5931 int ctxn;
5911 5932
5933 mutex_lock(&child->perf_event_mutex);
5934 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
5935 owner_entry) {
5936 list_del_init(&event->owner_entry);
5937
5938 /*
5939 * Ensure the list deletion is visible before we clear
5940 * the owner; this closes a race against perf_release() where
5941 * we need to serialize on the owner->perf_event_mutex.
5942 */
5943 smp_wmb();
5944 event->owner = NULL;
5945 }
5946 mutex_unlock(&child->perf_event_mutex);
5947
5912 for_each_task_context_nr(ctxn) 5948 for_each_task_context_nr(ctxn)
5913 perf_event_exit_task_context(child, ctxn); 5949 perf_event_exit_task_context(child, ctxn);
5914} 5950}
@@ -6128,6 +6164,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6128 struct perf_event *event; 6164 struct perf_event *event;
6129 struct task_struct *parent = current; 6165 struct task_struct *parent = current;
6130 int inherited_all = 1; 6166 int inherited_all = 1;
6167 unsigned long flags;
6131 int ret = 0; 6168 int ret = 0;
6132 6169
6133 child->perf_event_ctxp[ctxn] = NULL; 6170 child->perf_event_ctxp[ctxn] = NULL;
@@ -6168,6 +6205,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6168 break; 6205 break;
6169 } 6206 }
6170 6207
6208 /*
6209 * We can't hold ctx->lock when iterating the ->flexible_group list due
6210 * to allocations, but we need to prevent rotation because
6211 * rotate_ctx() will change the list from interrupt context.
6212 */
6213 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6214 parent_ctx->rotate_disable = 1;
6215 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6216
6171 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6217 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
6172 ret = inherit_task_group(event, parent, parent_ctx, 6218 ret = inherit_task_group(event, parent, parent_ctx,
6173 child, ctxn, &inherited_all); 6219 child, ctxn, &inherited_all);
@@ -6175,6 +6221,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6175 break; 6221 break;
6176 } 6222 }
6177 6223
6224 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6225 parent_ctx->rotate_disable = 0;
6226 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6227
6178 child_ctx = child->perf_event_ctxp[ctxn]; 6228 child_ctx = child->perf_event_ctxp[ctxn];
6179 6229
6180 if (child_ctx && inherited_all) { 6230 if (child_ctx && inherited_all) {
@@ -6327,6 +6377,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6327 6377
6328void __init perf_event_init(void) 6378void __init perf_event_init(void)
6329{ 6379{
6380 int ret;
6381
6330 perf_event_init_all_cpus(); 6382 perf_event_init_all_cpus();
6331 init_srcu_struct(&pmus_srcu); 6383 init_srcu_struct(&pmus_srcu);
6332 perf_pmu_register(&perf_swevent); 6384 perf_pmu_register(&perf_swevent);
@@ -6334,4 +6386,7 @@ void __init perf_event_init(void)
6334 perf_pmu_register(&perf_task_clock); 6386 perf_pmu_register(&perf_task_clock);
6335 perf_tp_register(); 6387 perf_tp_register();
6336 perf_cpu_notifier(perf_cpu_notify); 6388 perf_cpu_notifier(perf_cpu_notify);
6389
6390 ret = init_hw_breakpoint();
6391 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6337} 6392}
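The perf_release()/perf_event_exit_task() hunks above replace the unconditional owner task reference with a publish/observe handshake: the exiting task unlinks the event and only then clears event->owner behind an smp_wmb(), while the release path samples the owner under RCU, pins it with get_task_struct(), and re-checks event->owner under owner->perf_event_mutex before touching the list. A stripped-down sketch of the two sides of that pairing, using a hypothetical struct my_event (schematic only, not the perf code):

#include <linux/sched.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/compiler.h>

struct my_event {
        struct task_struct *owner;
        struct list_head   owner_entry;
};

/* Exit side: unlink first, then publish "no owner". */
static void my_exit_side(struct task_struct *tsk, struct my_event *ev)
{
        mutex_lock(&tsk->perf_event_mutex);
        list_del_init(&ev->owner_entry);
        smp_wmb();                      /* deletion visible before owner = NULL */
        ev->owner = NULL;
        mutex_unlock(&tsk->perf_event_mutex);
}

/* Release side: observe the owner under RCU, pin it, then re-check. */
static void my_release_side(struct my_event *ev)
{
        struct task_struct *owner;

        rcu_read_lock();
        owner = ACCESS_ONCE(ev->owner);
        smp_read_barrier_depends();
        if (owner)
                get_task_struct(owner); /* safe while rcu_read_lock() is held */
        rcu_read_unlock();

        if (owner) {
                mutex_lock(&owner->perf_event_mutex);
                if (ev->owner)          /* still owned: unlink it ourselves;
                                           if NULL, the exit side already did */
                        list_del_init(&ev->owner_entry);
                mutex_unlock(&owner->perf_event_mutex);
                put_task_struct(owner);
        }
}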
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index c7a8f453919e..aeaa7f846821 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -121,10 +121,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
121 121
122 switch (o->type) { 122 switch (o->type) {
123 case PM_QOS_MIN: 123 case PM_QOS_MIN:
124 return plist_last(&o->requests)->prio; 124 return plist_first(&o->requests)->prio;
125 125
126 case PM_QOS_MAX: 126 case PM_QOS_MAX:
127 return plist_first(&o->requests)->prio; 127 return plist_last(&o->requests)->prio;
128 128
129 default: 129 default:
130 /* runtime check for not using enum */ 130 /* runtime check for not using enum */
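The pm_qos fix hinges on plist ordering: a plist keeps its nodes sorted by ascending prio, so plist_first() yields the smallest value and plist_last() the largest. PM_QOS_MIN, where the effective constraint is the lowest request, therefore wants plist_first(), and PM_QOS_MAX wants plist_last(), which is what the swap above restores. A tiny hedged sketch of reading both extremes from a (non-empty) request plist:

#include <linux/plist.h>

/* Hedged sketch: requests is assumed to be a populated plist_head. */
static int my_extreme_value(struct plist_head *requests, int want_min)
{
        if (want_min)
                return plist_first(requests)->prio;  /* lowest prio value  */
        return plist_last(requests)->prio;           /* highest prio value */
}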
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba5879..05bb7173850e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock)
37 if (pid == 0) 37 if (pid == 0)
38 return 0; 38 return 0;
39 39
40 read_lock(&tasklist_lock); 40 rcu_read_lock();
41 p = find_task_by_vpid(pid); 41 p = find_task_by_vpid(pid);
42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
43 same_thread_group(p, current) : thread_group_leader(p))) { 43 same_thread_group(p, current) : has_group_leader_pid(p))) {
44 error = -EINVAL; 44 error = -EINVAL;
45 } 45 }
46 read_unlock(&tasklist_lock); 46 rcu_read_unlock();
47 47
48 return error; 48 return error;
49} 49}
@@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
390 390
391 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 391 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
392 392
393 read_lock(&tasklist_lock); 393 rcu_read_lock();
394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
395 if (pid == 0) { 395 if (pid == 0) {
396 p = current; 396 p = current;
@@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
404 p = current->group_leader; 404 p = current->group_leader;
405 } else { 405 } else {
406 p = find_task_by_vpid(pid); 406 p = find_task_by_vpid(pid);
407 if (p && !thread_group_leader(p)) 407 if (p && !has_group_leader_pid(p))
408 p = NULL; 408 p = NULL;
409 } 409 }
410 } 410 }
@@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
414 } else { 414 } else {
415 ret = -EINVAL; 415 ret = -EINVAL;
416 } 416 }
417 read_unlock(&tasklist_lock); 417 rcu_read_unlock();
418 418
419 return ret; 419 return ret;
420} 420}
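Both hunks above drop the tasklist_lock read lock in favour of RCU: find_task_by_vpid() only requires rcu_read_lock(), and the result is either used entirely inside the read-side section (as in these hunks) or pinned with get_task_struct() before the section ends. A hedged sketch of the pinned variant of that lookup idiom (hypothetical caller, standard APIs):

#include <linux/sched.h>
#include <linux/rcupdate.h>

/* Look up a task by pid and pin it; caller must put_task_struct() later. */
static struct task_struct *my_get_task(pid_t pid)
{
        struct task_struct *p;

        rcu_read_lock();
        p = find_task_by_vpid(pid);
        if (p)
                get_task_struct(p);     /* keep it alive past rcu_read_unlock() */
        rcu_read_unlock();

        return p;
}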
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 29bff6117abc..a5aff3ebad38 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -246,9 +246,13 @@ config PM_OPS
246 depends on PM_SLEEP || PM_RUNTIME 246 depends on PM_SLEEP || PM_RUNTIME
247 default y 247 default y
248 248
249config ARCH_HAS_OPP
250 bool
251
249config PM_OPP 252config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library" 253 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM 254 depends on PM
255 depends on ARCH_HAS_OPP
252 ---help--- 256 ---help---
253 SOCs have a standard set of tuples consisting of frequency and 257 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This 258 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 657272e91d0a..048d0b514831 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -327,7 +327,6 @@ static int create_image(int platform_mode)
327int hibernation_snapshot(int platform_mode) 327int hibernation_snapshot(int platform_mode)
328{ 328{
329 int error; 329 int error;
330 gfp_t saved_mask;
331 330
332 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
333 if (error) 332 if (error)
@@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode)
339 goto Close; 338 goto Close;
340 339
341 suspend_console(); 340 suspend_console();
342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 341 pm_restrict_gfp_mask();
343 error = dpm_suspend_start(PMSG_FREEZE); 342 error = dpm_suspend_start(PMSG_FREEZE);
344 if (error) 343 if (error)
345 goto Recover_platform; 344 goto Recover_platform;
@@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode)
348 goto Recover_platform; 347 goto Recover_platform;
349 348
350 error = create_image(platform_mode); 349 error = create_image(platform_mode);
351 /* Control returns here after successful restore */ 350 /*
351 * Control returns here (1) after the image has been created or the
352 * image creation has failed and (2) after a successful restore.
353 */
352 354
353 Resume_devices: 355 Resume_devices:
354 /* We may need to release the preallocated image pages here. */ 356 /* We may need to release the preallocated image pages here. */
@@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode)
357 359
358 dpm_resume_end(in_suspend ? 360 dpm_resume_end(in_suspend ?
359 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 361 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
360 set_gfp_allowed_mask(saved_mask); 362
363 if (error || !in_suspend)
364 pm_restore_gfp_mask();
365
361 resume_console(); 366 resume_console();
362 Close: 367 Close:
363 platform_end(platform_mode); 368 platform_end(platform_mode);
@@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode)
452int hibernation_restore(int platform_mode) 457int hibernation_restore(int platform_mode)
453{ 458{
454 int error; 459 int error;
455 gfp_t saved_mask;
456 460
457 pm_prepare_console(); 461 pm_prepare_console();
458 suspend_console(); 462 suspend_console();
459 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 463 pm_restrict_gfp_mask();
460 error = dpm_suspend_start(PMSG_QUIESCE); 464 error = dpm_suspend_start(PMSG_QUIESCE);
461 if (!error) { 465 if (!error) {
462 error = resume_target_kernel(platform_mode); 466 error = resume_target_kernel(platform_mode);
463 dpm_resume_end(PMSG_RECOVER); 467 dpm_resume_end(PMSG_RECOVER);
464 } 468 }
465 set_gfp_allowed_mask(saved_mask); 469 pm_restore_gfp_mask();
466 resume_console(); 470 resume_console();
467 pm_restore_console(); 471 pm_restore_console();
468 return error; 472 return error;
@@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode)
476int hibernation_platform_enter(void) 480int hibernation_platform_enter(void)
477{ 481{
478 int error; 482 int error;
479 gfp_t saved_mask;
480 483
481 if (!hibernation_ops) 484 if (!hibernation_ops)
482 return -ENOSYS; 485 return -ENOSYS;
@@ -492,7 +495,6 @@ int hibernation_platform_enter(void)
492 495
493 entering_platform_hibernation = true; 496 entering_platform_hibernation = true;
494 suspend_console(); 497 suspend_console();
495 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
496 error = dpm_suspend_start(PMSG_HIBERNATE); 498 error = dpm_suspend_start(PMSG_HIBERNATE);
497 if (error) { 499 if (error) {
498 if (hibernation_ops->recover) 500 if (hibernation_ops->recover)
@@ -536,7 +538,6 @@ int hibernation_platform_enter(void)
536 Resume_devices: 538 Resume_devices:
537 entering_platform_hibernation = false; 539 entering_platform_hibernation = false;
538 dpm_resume_end(PMSG_RESTORE); 540 dpm_resume_end(PMSG_RESTORE);
539 set_gfp_allowed_mask(saved_mask);
540 resume_console(); 541 resume_console();
541 542
542 Close: 543 Close:
@@ -646,6 +647,7 @@ int hibernate(void)
646 swsusp_free(); 647 swsusp_free();
647 if (!error) 648 if (!error)
648 power_down(); 649 power_down();
650 pm_restore_gfp_mask();
649 } else { 651 } else {
650 pr_debug("PM: Image restored successfully.\n"); 652 pr_debug("PM: Image restored successfully.\n");
651 } 653 }
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ac7eb109f196..0dac75ea4456 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
984 src = kmap_atomic(s_page, KM_USER0); 984 src = kmap_atomic(s_page, KM_USER0);
985 dst = kmap_atomic(d_page, KM_USER1); 985 dst = kmap_atomic(d_page, KM_USER1);
986 do_copy_page(dst, src); 986 do_copy_page(dst, src);
987 kunmap_atomic(src, KM_USER0);
988 kunmap_atomic(dst, KM_USER1); 987 kunmap_atomic(dst, KM_USER1);
988 kunmap_atomic(src, KM_USER0);
989 } else { 989 } else {
990 if (PageHighMem(d_page)) { 990 if (PageHighMem(d_page)) {
991 /* Page pointed to by src may contain some kernel 991 /* Page pointed to by src may contain some kernel
@@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
993 */ 993 */
994 safe_copy_page(buffer, s_page); 994 safe_copy_page(buffer, s_page);
995 dst = kmap_atomic(d_page, KM_USER0); 995 dst = kmap_atomic(d_page, KM_USER0);
996 memcpy(dst, buffer, PAGE_SIZE); 996 copy_page(dst, buffer);
997 kunmap_atomic(dst, KM_USER0); 997 kunmap_atomic(dst, KM_USER0);
998 } else { 998 } else {
999 safe_copy_page(page_address(d_page), s_page); 999 safe_copy_page(page_address(d_page), s_page);
@@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1687 memory_bm_position_reset(&orig_bm); 1687 memory_bm_position_reset(&orig_bm);
1688 memory_bm_position_reset(&copy_bm); 1688 memory_bm_position_reset(&copy_bm);
1689 } else if (handle->cur <= nr_meta_pages) { 1689 } else if (handle->cur <= nr_meta_pages) {
1690 memset(buffer, 0, PAGE_SIZE); 1690 clear_page(buffer);
1691 pack_pfns(buffer, &orig_bm); 1691 pack_pfns(buffer, &orig_bm);
1692 } else { 1692 } else {
1693 struct page *page; 1693 struct page *page;
@@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1701 void *kaddr; 1701 void *kaddr;
1702 1702
1703 kaddr = kmap_atomic(page, KM_USER0); 1703 kaddr = kmap_atomic(page, KM_USER0);
1704 memcpy(buffer, kaddr, PAGE_SIZE); 1704 copy_page(buffer, kaddr);
1705 kunmap_atomic(kaddr, KM_USER0); 1705 kunmap_atomic(kaddr, KM_USER0);
1706 handle->buffer = buffer; 1706 handle->buffer = buffer;
1707 } else { 1707 } else {
@@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void)
1984 void *dst; 1984 void *dst;
1985 1985
1986 dst = kmap_atomic(last_highmem_page, KM_USER0); 1986 dst = kmap_atomic(last_highmem_page, KM_USER0);
1987 memcpy(dst, buffer, PAGE_SIZE); 1987 copy_page(dst, buffer);
1988 kunmap_atomic(dst, KM_USER0); 1988 kunmap_atomic(dst, KM_USER0);
1989 last_highmem_page = NULL; 1989 last_highmem_page = NULL;
1990 } 1990 }
@@ -2270,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2270 2270
2271 kaddr1 = kmap_atomic(p1, KM_USER0); 2271 kaddr1 = kmap_atomic(p1, KM_USER0);
2272 kaddr2 = kmap_atomic(p2, KM_USER1); 2272 kaddr2 = kmap_atomic(p2, KM_USER1);
2273 memcpy(buf, kaddr1, PAGE_SIZE); 2273 copy_page(buf, kaddr1);
2274 memcpy(kaddr1, kaddr2, PAGE_SIZE); 2274 copy_page(kaddr1, kaddr2);
2275 memcpy(kaddr2, buf, PAGE_SIZE); 2275 copy_page(kaddr2, buf);
2276 kunmap_atomic(kaddr1, KM_USER0);
2277 kunmap_atomic(kaddr2, KM_USER1); 2276 kunmap_atomic(kaddr2, KM_USER1);
2277 kunmap_atomic(kaddr1, KM_USER0);
2278} 2278}
2279 2279
2280/** 2280/**
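Two patterns recur in these snapshot.c hunks: page-sized memset()/memcpy() calls become clear_page()/copy_page(), the arch-optimized whole-page helpers, and paired kunmap_atomic() calls are reordered so the most recently mapped slot is released first, presumably to keep the atomic kmaps strictly last-in, first-out. A hedged sketch of the combined idiom, assuming the old KM_USERn slot API used throughout this file:

#include <linux/highmem.h>
#include <linux/mm.h>

/* Copy one highmem page to another: whole-page helper, LIFO unmapping. */
static void my_copy_highmem_page(struct page *dst_page, struct page *src_page)
{
        void *src, *dst;

        src = kmap_atomic(src_page, KM_USER0);
        dst = kmap_atomic(dst_page, KM_USER1);
        copy_page(dst, src);
        kunmap_atomic(dst, KM_USER1);   /* mapped last, unmapped first */
        kunmap_atomic(src, KM_USER0);
}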
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..ecf770509d0d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -197,7 +197,6 @@ static int suspend_enter(suspend_state_t state)
197int suspend_devices_and_enter(suspend_state_t state) 197int suspend_devices_and_enter(suspend_state_t state)
198{ 198{
199 int error; 199 int error;
200 gfp_t saved_mask;
201 200
202 if (!suspend_ops) 201 if (!suspend_ops)
203 return -ENOSYS; 202 return -ENOSYS;
@@ -208,7 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state)
208 goto Close; 207 goto Close;
209 } 208 }
210 suspend_console(); 209 suspend_console();
211 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 210 pm_restrict_gfp_mask();
212 suspend_test_start(); 211 suspend_test_start();
213 error = dpm_suspend_start(PMSG_SUSPEND); 212 error = dpm_suspend_start(PMSG_SUSPEND);
214 if (error) { 213 if (error) {
@@ -225,7 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state)
225 suspend_test_start(); 224 suspend_test_start();
226 dpm_resume_end(PMSG_RESUME); 225 dpm_resume_end(PMSG_RESUME);
227 suspend_test_finish("resume devices"); 226 suspend_test_finish("resume devices");
228 set_gfp_allowed_mask(saved_mask); 227 pm_restore_gfp_mask();
229 resume_console(); 228 resume_console();
230 Close: 229 Close:
231 if (suspend_ops->end) 230 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 916eaa790399..baf667bb2794 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,6 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
9 * 10 *
10 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
11 * 12 *
@@ -251,7 +252,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
251 if (bio_chain) { 252 if (bio_chain) {
252 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 253 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
253 if (src) { 254 if (src) {
254 memcpy(src, buf, PAGE_SIZE); 255 copy_page(src, buf);
255 } else { 256 } else {
256 WARN_ON_ONCE(1); 257 WARN_ON_ONCE(1);
257 bio_chain = NULL; /* Go synchronous */ 258 bio_chain = NULL; /* Go synchronous */
@@ -325,7 +326,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
325 error = write_page(handle->cur, handle->cur_swap, NULL); 326 error = write_page(handle->cur, handle->cur_swap, NULL);
326 if (error) 327 if (error)
327 goto out; 328 goto out;
328 memset(handle->cur, 0, PAGE_SIZE); 329 clear_page(handle->cur);
329 handle->cur_swap = offset; 330 handle->cur_swap = offset;
330 handle->k = 0; 331 handle->k = 0;
331 } 332 }
@@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle,
753{ 754{
754 unsigned int m; 755 unsigned int m;
755 int error = 0; 756 int error = 0;
757 struct bio *bio;
756 struct timeval start; 758 struct timeval start;
757 struct timeval stop; 759 struct timeval stop;
758 unsigned nr_pages; 760 unsigned nr_pages;
759 size_t off, unc_len, cmp_len; 761 size_t i, off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page; 762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
761 763
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 764 for (i = 0; i < LZO_CMP_PAGES; i++) {
763 if (!page) { 765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 766 if (!page[i]) {
765 return -ENOMEM; 767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
768
769 while (i)
770 free_page((unsigned long)page[--i]);
771
772 return -ENOMEM;
773 }
766 } 774 }
767 775
768 unc = vmalloc(LZO_UNC_SIZE); 776 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) { 777 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page); 779
780 for (i = 0; i < LZO_CMP_PAGES; i++)
781 free_page((unsigned long)page[i]);
782
772 return -ENOMEM; 783 return -ENOMEM;
773 } 784 }
774 785
775 cmp = vmalloc(LZO_CMP_SIZE); 786 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) { 787 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
789
778 vfree(unc); 790 vfree(unc);
779 free_page((unsigned long)page); 791 for (i = 0; i < LZO_CMP_PAGES; i++)
792 free_page((unsigned long)page[i]);
793
780 return -ENOMEM; 794 return -ENOMEM;
781 } 795 }
782 796
@@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
787 if (!m) 801 if (!m)
788 m = 1; 802 m = 1;
789 nr_pages = 0; 803 nr_pages = 0;
804 bio = NULL;
790 do_gettimeofday(&start); 805 do_gettimeofday(&start);
791 806
792 error = snapshot_write_next(snapshot); 807 error = snapshot_write_next(snapshot);
@@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle,
794 goto out_finish; 809 goto out_finish;
795 810
796 for (;;) { 811 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */ 812 error = swap_read_page(handle, page[0], NULL); /* sync */
798 if (error) 813 if (error)
799 break; 814 break;
800 815
801 cmp_len = *(size_t *)page; 816 cmp_len = *(size_t *)page[0];
802 if (unlikely(!cmp_len || 817 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
@@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle,
806 break; 821 break;
807 } 822 }
808 823
809 memcpy(cmp, page, PAGE_SIZE); 824 for (off = PAGE_SIZE, i = 1;
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { 825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
811 error = swap_read_page(handle, page, NULL); /* sync */ 826 error = swap_read_page(handle, page[i], &bio);
812 if (error) 827 if (error)
813 goto out_finish; 828 goto out_finish;
829 }
814 830
815 memcpy(cmp + off, page, PAGE_SIZE); 831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
832 if (error)
833 goto out_finish;
834
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
816 } 838 }
817 839
818 unc_len = LZO_UNC_SIZE; 840 unc_len = LZO_UNC_SIZE;
@@ -857,7 +879,8 @@ out_finish:
857 879
858 vfree(cmp); 880 vfree(cmp);
859 vfree(unc); 881 vfree(unc);
860 free_page((unsigned long)page); 882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]);
861 884
862 return error; 885 return error;
863} 886}
@@ -910,7 +933,7 @@ int swsusp_check(void)
910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
911 if (!IS_ERR(hib_resume_bdev)) { 934 if (!IS_ERR(hib_resume_bdev)) {
912 set_blocksize(hib_resume_bdev, PAGE_SIZE); 935 set_blocksize(hib_resume_bdev, PAGE_SIZE);
913 memset(swsusp_header, 0, PAGE_SIZE); 936 clear_page(swsusp_header);
914 error = hib_bio_read_page(swsusp_resume_block, 937 error = hib_bio_read_page(swsusp_resume_block,
915 swsusp_header, NULL); 938 swsusp_header, NULL);
916 if (error) 939 if (error)
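The load_image_lzo() rework above spreads one compressed block over LZO_CMP_PAGES pages: the header page is read synchronously, the rest are queued on a bio chain, and a single hib_wait_on_bio_chain() gathers them before decompression. A schematic of that read-then-gather loop, reusing only names visible in the hunk (sketch with error handling trimmed, not the full function):

/* Schematic only: gather one LZO block spread over page[] into cmp[]. */
static int my_read_lzo_block(struct swap_map_handle *handle,
                             unsigned char **page, unsigned char *cmp,
                             size_t cmp_len)
{
        struct bio *bio = NULL;
        size_t off;
        int i, error;

        for (off = PAGE_SIZE, i = 1;
             off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
                error = swap_read_page(handle, page[i], &bio);  /* async */
                if (error)
                        return error;
        }

        error = hib_wait_on_bio_chain(&bio);    /* need every page before LZO */
        if (error)
                return error;

        for (off = 0, i = 0;
             off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++)
                memcpy(cmp + off, page[i], PAGE_SIZE);

        return 0;
}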
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..1b2ea31e6bd8 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
263 case SNAPSHOT_UNFREEZE: 263 case SNAPSHOT_UNFREEZE:
264 if (!data->frozen || data->ready) 264 if (!data->frozen || data->ready)
265 break; 265 break;
266 pm_restore_gfp_mask();
266 thaw_processes(); 267 thaw_processes();
267 usermodehelper_enable(); 268 usermodehelper_enable();
268 data->frozen = 0; 269 data->frozen = 0;
@@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 error = -EPERM; 276 error = -EPERM;
276 break; 277 break;
277 } 278 }
279 pm_restore_gfp_mask();
278 error = hibernation_snapshot(data->platform_support); 280 error = hibernation_snapshot(data->platform_support);
279 if (!error) 281 if (!error)
280 error = put_user(in_suspend, (int __user *)arg); 282 error = put_user(in_suspend, (int __user *)arg);
diff --git a/kernel/printk.c b/kernel/printk.c
index 2531017795f6..a23315dc4498 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup);
210 210
211#ifdef CONFIG_BOOT_PRINTK_DELAY 211#ifdef CONFIG_BOOT_PRINTK_DELAY
212 212
213static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 213static int boot_delay; /* msecs delay after each printk during bootup */
214static unsigned long long loops_per_msec; /* based on boot_delay */ 214static unsigned long long loops_per_msec; /* based on boot_delay */
215 215
216static int __init boot_delay_setup(char *str) 216static int __init boot_delay_setup(char *str)
@@ -261,6 +261,12 @@ static inline void boot_delay_msec(void)
261} 261}
262#endif 262#endif
263 263
264#ifdef CONFIG_SECURITY_DMESG_RESTRICT
265int dmesg_restrict = 1;
266#else
267int dmesg_restrict;
268#endif
269
264int do_syslog(int type, char __user *buf, int len, bool from_file) 270int do_syslog(int type, char __user *buf, int len, bool from_file)
265{ 271{
266 unsigned i, j, limit, count; 272 unsigned i, j, limit, count;
@@ -268,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
268 char c; 274 char c;
269 int error = 0; 275 int error = 0;
270 276
271 error = security_syslog(type, from_file); 277 /*
278 * If this is from /proc/kmsg we only do the capabilities checks
279 * at open time.
280 */
281 if (type == SYSLOG_ACTION_OPEN || !from_file) {
282 if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
283 return -EPERM;
284 if ((type != SYSLOG_ACTION_READ_ALL &&
285 type != SYSLOG_ACTION_SIZE_BUFFER) &&
286 !capable(CAP_SYS_ADMIN))
287 return -EPERM;
288 }
289
290 error = security_syslog(type);
272 if (error) 291 if (error)
273 return error; 292 return error;
274 293
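The do_syslog() hunk front-loads the permission checks: calls arriving through an already-opened /proc/kmsg skip them (they were done at open time), dmesg_restrict gates everything else behind CAP_SYS_ADMIN, and any action beyond reading the buffer or querying its size needs CAP_SYS_ADMIN regardless. A hedged distillation of that gate with a hypothetical helper name:

/* Hedged sketch of the access gate added above, not the kernel function. */
static int my_syslog_permission(int type, bool from_file)
{
        if (type != SYSLOG_ACTION_OPEN && from_file)
                return 0;       /* /proc/kmsg readers were checked at open time */

        if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (type != SYSLOG_ACTION_READ_ALL &&
            type != SYSLOG_ACTION_SIZE_BUFFER &&
            !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}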
@@ -647,6 +666,7 @@ static inline int can_use_console(unsigned int cpu)
647 * released but interrupts still disabled. 666 * released but interrupts still disabled.
648 */ 667 */
649static int acquire_console_semaphore_for_printk(unsigned int cpu) 668static int acquire_console_semaphore_for_printk(unsigned int cpu)
669 __releases(&logbuf_lock)
650{ 670{
651 int retval = 0; 671 int retval = 0;
652 672
@@ -1062,13 +1082,15 @@ void printk_tick(void)
1062 1082
1063int printk_needs_cpu(int cpu) 1083int printk_needs_cpu(int cpu)
1064{ 1084{
1085 if (unlikely(cpu_is_offline(cpu)))
1086 printk_tick();
1065 return per_cpu(printk_pending, cpu); 1087 return per_cpu(printk_pending, cpu);
1066} 1088}
1067 1089
1068void wake_up_klogd(void) 1090void wake_up_klogd(void)
1069{ 1091{
1070 if (waitqueue_active(&log_wait)) 1092 if (waitqueue_active(&log_wait))
1071 __raw_get_cpu_var(printk_pending) = 1; 1093 this_cpu_write(printk_pending, 1);
1072} 1094}
1073 1095
1074/** 1096/**
@@ -1511,7 +1533,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1511} 1533}
1512EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1534EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1513 1535
1514static const char const *kmsg_reasons[] = { 1536static const char * const kmsg_reasons[] = {
1515 [KMSG_DUMP_OOPS] = "oops", 1537 [KMSG_DUMP_OOPS] = "oops",
1516 [KMSG_DUMP_PANIC] = "panic", 1538 [KMSG_DUMP_PANIC] = "panic",
1517 [KMSG_DUMP_KEXEC] = "kexec", 1539 [KMSG_DUMP_KEXEC] = "kexec",
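The kmsg_reasons change is a type fix: "const char const *" merely repeats the same qualifier on the pointee and leaves the pointer slots themselves writable, while the intended "const char * const" makes both the strings and the array of pointers read-only. A two-line illustration with hypothetical names:

static const char *my_writable_slots[]     = { "oops", "panic" }; /* pointers may be reassigned */
static const char * const my_fixed_table[] = { "oops", "panic" }; /* pointers are const too     */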
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
181 * under ptrace. 181 * under ptrace.
182 */ 182 */
183 retval = -ERESTARTNOINTR; 183 retval = -ERESTARTNOINTR;
184 if (mutex_lock_interruptible(&task->cred_guard_mutex)) 184 if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
185 goto out; 185 goto out;
186 186
187 task_lock(task); 187 task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
208unlock_tasklist: 208unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 209 write_unlock_irq(&tasklist_lock);
210unlock_creds: 210unlock_creds:
211 mutex_unlock(&task->cred_guard_mutex); 211 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 212out:
213 return retval; 213 return retval;
214} 214}
@@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
329 * and reacquire the lock. 329 * and reacquire the lock.
330 */ 330 */
331void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
332 __releases(&tasklist_lock)
333 __acquires(&tasklist_lock)
332{ 334{
333 struct task_struct *p, *n; 335 struct task_struct *p, *n;
334 LIST_HEAD(ptrace_dead); 336 LIST_HEAD(ptrace_dead);
@@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
402 return copied; 404 return copied;
403} 405}
404 406
405static int ptrace_setoptions(struct task_struct *child, long data) 407static int ptrace_setoptions(struct task_struct *child, unsigned long data)
406{ 408{
407 child->ptrace &= ~PT_TRACE_MASK; 409 child->ptrace &= ~PT_TRACE_MASK;
408 410
@@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
481#define is_sysemu_singlestep(request) 0 483#define is_sysemu_singlestep(request) 0
482#endif 484#endif
483 485
484static int ptrace_resume(struct task_struct *child, long request, long data) 486static int ptrace_resume(struct task_struct *child, long request,
487 unsigned long data)
485{ 488{
486 if (!valid_signal(data)) 489 if (!valid_signal(data))
487 return -EIO; 490 return -EIO;
@@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
558#endif 561#endif
559 562
560int ptrace_request(struct task_struct *child, long request, 563int ptrace_request(struct task_struct *child, long request,
561 long addr, long data) 564 unsigned long addr, unsigned long data)
562{ 565{
563 int ret = -EIO; 566 int ret = -EIO;
564 siginfo_t siginfo; 567 siginfo_t siginfo;
568 void __user *datavp = (void __user *) data;
569 unsigned long __user *datalp = datavp;
565 570
566 switch (request) { 571 switch (request) {
567 case PTRACE_PEEKTEXT: 572 case PTRACE_PEEKTEXT:
@@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,
578 ret = ptrace_setoptions(child, data); 583 ret = ptrace_setoptions(child, data);
579 break; 584 break;
580 case PTRACE_GETEVENTMSG: 585 case PTRACE_GETEVENTMSG:
581 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 586 ret = put_user(child->ptrace_message, datalp);
582 break; 587 break;
583 588
584 case PTRACE_GETSIGINFO: 589 case PTRACE_GETSIGINFO:
585 ret = ptrace_getsiginfo(child, &siginfo); 590 ret = ptrace_getsiginfo(child, &siginfo);
586 if (!ret) 591 if (!ret)
587 ret = copy_siginfo_to_user((siginfo_t __user *) data, 592 ret = copy_siginfo_to_user(datavp, &siginfo);
588 &siginfo);
589 break; 593 break;
590 594
591 case PTRACE_SETSIGINFO: 595 case PTRACE_SETSIGINFO:
592 if (copy_from_user(&siginfo, (siginfo_t __user *) data, 596 if (copy_from_user(&siginfo, datavp, sizeof siginfo))
593 sizeof siginfo))
594 ret = -EFAULT; 597 ret = -EFAULT;
595 else 598 else
596 ret = ptrace_setsiginfo(child, &siginfo); 599 ret = ptrace_setsiginfo(child, &siginfo);
@@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,
621 } 624 }
622 mmput(mm); 625 mmput(mm);
623 626
624 ret = put_user(tmp, (unsigned long __user *) data); 627 ret = put_user(tmp, datalp);
625 break; 628 break;
626 } 629 }
627#endif 630#endif
@@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,
650 case PTRACE_SETREGSET: 653 case PTRACE_SETREGSET:
651 { 654 {
652 struct iovec kiov; 655 struct iovec kiov;
653 struct iovec __user *uiov = (struct iovec __user *) data; 656 struct iovec __user *uiov = datavp;
654 657
655 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) 658 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
656 return -EFAULT; 659 return -EFAULT;
@@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
691#define arch_ptrace_attach(child) do { } while (0) 694#define arch_ptrace_attach(child) do { } while (0)
692#endif 695#endif
693 696
694SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) 697SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
698 unsigned long, data)
695{ 699{
696 struct task_struct *child; 700 struct task_struct *child;
697 long ret; 701 long ret;
@@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
732 return ret; 736 return ret;
733} 737}
734 738
735int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 739int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
740 unsigned long data)
736{ 741{
737 unsigned long tmp; 742 unsigned long tmp;
738 int copied; 743 int copied;
@@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
743 return put_user(tmp, (unsigned long __user *)data); 748 return put_user(tmp, (unsigned long __user *)data);
744} 749}
745 750
746int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) 751int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
752 unsigned long data)
747{ 753{
748 int copied; 754 int copied;
749 755
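
The ptrace.c hunks above switch addr/data from long to unsigned long and funnel user pointers through datavp/datalp. As a rough illustration of what the kernel now receives, here is a minimal user-space tracer sketch (not part of the patch; the target address 0x1000 is made up) that passes addr and data as pointer-sized values:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
	long word;
	pid_t child = fork();

	if (child == 0) {
		/* Child: ask to be traced, then stop at execve(). */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		execlp("true", "true", (char *)NULL);
		return 1;
	}

	waitpid(child, NULL, 0);	/* wait for the exec stop */

	/* addr and data reach ptrace_request() as unsigned long now; an
	 * unmapped address simply fails with errno set. */
	errno = 0;
	word = ptrace(PTRACE_PEEKDATA, child, (void *)0x1000UL, NULL);
	if (errno)
		perror("PTRACE_PEEKDATA");
	else
		printf("word at 0x1000: %#lx\n", word);

	ptrace(PTRACE_DETACH, child, NULL, NULL);
	return 0;
}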
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
119 119
120int clean_sort_range(struct range *range, int az) 120int clean_sort_range(struct range *range, int az)
121{ 121{
122 int i, j, k = az - 1, nr_range = 0; 122 int i, j, k = az - 1, nr_range = az;
123 123
124 for (i = 0; i < k; i++) { 124 for (i = 0; i < k; i++) {
125 if (range[i].end) 125 if (range[i].end)
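
The one-line range.c change matters when the array has no empty slots at all: the counting step at the end of clean_sort_range() only overwrites nr_range when it finds a hole. A small stand-alone model of that step (hypothetical count_ranges() helper and sample data, not the kernel code):

#include <stdio.h>

struct range { unsigned long long start, end; };

/* Count the leading non-empty entries the way the tail of
 * clean_sort_range() does. */
static int count_ranges(const struct range *range, int az)
{
	int i, nr_range = az;		/* was 0 before the fix */

	for (i = 0; i < az; i++) {
		if (!range[i].end) {
			nr_range = i;
			break;
		}
	}
	return nr_range;
}

int main(void)
{
	struct range full[2] = { { 0x0, 0xfff }, { 0x1000, 0x1fff } };

	/* A fully populated array now reports 2; with the old
	 * initializer of 0 it would have reported 0 entries. */
	printf("nr_range = %d\n", count_ranges(full, 2));
	return 0;
}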
diff --git a/kernel/relay.c b/kernel/relay.c
index c7cf397fb929..859ea5a9605f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
70 */ 70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages) 71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{ 72{
73 struct page **array; 73 const size_t pa_size = n_pages * sizeof(struct page *);
74 size_t pa_size = n_pages * sizeof(struct page *); 74 if (pa_size > PAGE_SIZE)
75 75 return vzalloc(pa_size);
76 if (pa_size > PAGE_SIZE) { 76 return kzalloc(pa_size, GFP_KERNEL);
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84} 77}
85 78
86/* 79/*
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..9fad33efd0db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 60static void *r_next(struct seq_file *m, void *v, loff_t *pos)
44{ 61{
45 struct resource *p = v; 62 struct resource *p = v;
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn)
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358} 375}
359 376
377static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail,
379 resource_size_t size,
380 resource_size_t align)
381{
382 return avail->start;
383}
384
385static void resource_clip(struct resource *res, resource_size_t min,
386 resource_size_t max)
387{
388 if (res->start < min)
389 res->start = min;
390 if (res->end > max)
391 res->end = max;
392}
393
394static bool resource_contains(struct resource *res1, struct resource *res2)
395{
396 return res1->start <= res2->start && res1->end >= res2->end;
397}
398
399/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
360/* 465/*
361 * Find empty slot in the resource tree given range and alignment. 466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
362 */ 468 */
363static int find_resource(struct resource *root, struct resource *new, 469static int find_resource(struct resource *root, struct resource *new,
364 resource_size_t size, resource_size_t min, 470 resource_size_t size, resource_size_t min,
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new,
370 void *alignf_data) 476 void *alignf_data)
371{ 477{
372 struct resource *this = root->child; 478 struct resource *this = root->child;
373 struct resource tmp = *new; 479 struct resource tmp = *new, avail, alloc;
374 480
375 tmp.start = root->start; 481 tmp.start = root->start;
376 /* 482 /*
377 * Skip past an allocated resource that starts at 0, since the assignment 483 * Skip past an allocated resource that starts at 0, since the
378 * of this->start - 1 to tmp->end below would cause an underflow. 484 * assignment of this->start - 1 to tmp->end below would cause an
485 * underflow.
379 */ 486 */
380 if (this && this->start == 0) { 487 if (this && this->start == 0) {
381 tmp.start = this->end + 1; 488 tmp.start = this->end + 1;
382 this = this->sibling; 489 this = this->sibling;
383 } 490 }
384 for(;;) { 491 for (;;) {
385 if (this) 492 if (this)
386 tmp.end = this->start - 1; 493 tmp.end = this->start - 1;
387 else 494 else
388 tmp.end = root->end; 495 tmp.end = root->end;
389 if (tmp.start < min) 496
390 tmp.start = min; 497 resource_clip(&tmp, min, max);
391 if (tmp.end > max) 498
392 tmp.end = max; 499 /* Check for overflow after ALIGN() */
393 tmp.start = ALIGN(tmp.start, align); 500 avail = *new;
394 if (alignf) 501 avail.start = ALIGN(tmp.start, align);
395 tmp.start = alignf(alignf_data, &tmp, size, align); 502 avail.end = tmp.end;
396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 503 if (avail.start >= tmp.start) {
397 new->start = tmp.start; 504 alloc.start = alignf(alignf_data, &avail, size, align);
398 new->end = tmp.start + size - 1; 505 alloc.end = alloc.start + size - 1;
399 return 0; 506 if (resource_contains(&avail, &alloc)) {
507 new->start = alloc.start;
508 new->end = alloc.end;
509 return 0;
510 }
400 } 511 }
512
401 if (!this) 513 if (!this)
402 break; 514 break;
515
403 tmp.start = this->end + 1; 516 tmp.start = this->end + 1;
404 this = this->sibling; 517 this = this->sibling;
405 } 518 }
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new,
428{ 541{
429 int err; 542 int err;
430 543
544 if (!alignf)
545 alignf = simple_align_resource;
546
431 write_lock(&resource_lock); 547 write_lock(&resource_lock);
432 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 548 if (resource_alloc_from_bottom)
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
433 if (err >= 0 && __request_resource(root, new)) 552 if (err >= 0 && __request_resource(root, new))
434 err = -EBUSY; 553 err = -EBUSY;
435 write_unlock(&resource_lock); 554 write_unlock(&resource_lock);
@@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
453 572
454 if (first == parent) 573 if (first == parent)
455 return first; 574 return first;
575 if (WARN_ON(first == new)) /* duplicated insertion */
576 return first;
456 577
457 if ((first->start > new->start) || (first->end < new->end)) 578 if ((first->start > new->start) || (first->end < new->end))
458 break; 579 break;
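
find_resource_from_top() above walks the gaps between a root resource's children from the highest address downwards, clipping to [min, max] and honouring the alignf callback. A simplified, self-contained model of just the top-down gap walk (hypothetical find_from_top() helper, an array instead of a sibling list, no clipping or alignment):

#include <stdio.h>

struct res { unsigned long start, end; };	/* inclusive bounds */

/*
 * Return the start of the highest free gap of at least "size" bytes
 * inside [root_start, root_end], given "n" allocated children sorted
 * by address, or ~0UL when nothing fits.
 */
static unsigned long find_from_top(unsigned long root_start,
				   unsigned long root_end,
				   const struct res *child, int n,
				   unsigned long size)
{
	unsigned long gap_end = root_end;
	int i;

	for (i = n - 1; i >= -1; i--) {
		unsigned long gap_start = (i >= 0) ? child[i].end + 1
						   : root_start;

		if (gap_start <= gap_end && gap_end - gap_start + 1 >= size)
			return gap_end - size + 1;	/* highest fit */

		if (i < 0 || child[i].start == root_start)
			break;			/* no space left below */
		gap_end = child[i].start - 1;
	}
	return ~0UL;
}

int main(void)
{
	/* Two children inside a 0x0000-0xffff root resource. */
	struct res kids[] = { { 0x1000, 0x1fff }, { 0x8000, 0x8fff } };

	/* A 0x100-byte request lands at the top of the root: 0xff00. */
	printf("%#lx\n", find_from_top(0x0, 0xffff, kids, 2, 0x100));
	return 0;
}

In the patch itself, allocate_resource() chooses between this walk and the original bottom-up find_resource() based on resource_alloc_from_bottom.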
diff --git a/kernel/sched.c b/kernel/sched.c
index d42992bccdfa..dc91a4d09ac3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -560,18 +560,8 @@ struct rq {
560 560
561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562 562
563static inline
564void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
565{
566 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
567 563
568 /* 564static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 * A queue event has occurred, and we're going to schedule. In
570 * this case, we can save a useless back to back clock update.
571 */
572 if (test_tsk_need_resched(p))
573 rq->skip_clock_update = 1;
574}
575 565
576static inline int cpu_of(struct rq *rq) 566static inline int cpu_of(struct rq *rq)
577{ 567{
@@ -2118,6 +2108,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2118 p->sched_class->prio_changed(rq, p, oldprio, running); 2108 p->sched_class->prio_changed(rq, p, oldprio, running);
2119} 2109}
2120 2110
2111static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2112{
2113 const struct sched_class *class;
2114
2115 if (p->sched_class == rq->curr->sched_class) {
2116 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2117 } else {
2118 for_each_class(class) {
2119 if (class == rq->curr->sched_class)
2120 break;
2121 if (class == p->sched_class) {
2122 resched_task(rq->curr);
2123 break;
2124 }
2125 }
2126 }
2127
2128 /*
2129 * A queue event has occurred, and we're going to schedule. In
2130 * this case, we can save a useless back to back clock update.
2131 */
2132 if (test_tsk_need_resched(rq->curr))
2133 rq->skip_clock_update = 1;
2134}
2135
2121#ifdef CONFIG_SMP 2136#ifdef CONFIG_SMP
2122/* 2137/*
2123 * Is this task likely cache-hot: 2138 * Is this task likely cache-hot:
@@ -6960,6 +6975,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960 if (cpu != group_first_cpu(sd->groups)) 6975 if (cpu != group_first_cpu(sd->groups))
6961 return; 6976 return;
6962 6977
6978 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6979
6963 child = sd->child; 6980 child = sd->child;
6964 6981
6965 sd->groups->cpu_power = 0; 6982 sd->groups->cpu_power = 0;
@@ -8510,12 +8527,12 @@ void sched_move_task(struct task_struct *tsk)
8510 if (unlikely(running)) 8527 if (unlikely(running))
8511 tsk->sched_class->put_prev_task(rq, tsk); 8528 tsk->sched_class->put_prev_task(rq, tsk);
8512 8529
8513 set_task_rq(tsk, task_cpu(tsk));
8514
8515#ifdef CONFIG_FAIR_GROUP_SCHED 8530#ifdef CONFIG_FAIR_GROUP_SCHED
8516 if (tsk->sched_class->moved_group) 8531 if (tsk->sched_class->task_move_group)
8517 tsk->sched_class->moved_group(tsk, on_rq); 8532 tsk->sched_class->task_move_group(tsk, on_rq);
8533 else
8518#endif 8534#endif
8535 set_task_rq(tsk, task_cpu(tsk));
8519 8536
8520 if (unlikely(running)) 8537 if (unlikely(running))
8521 tsk->sched_class->set_curr_task(rq); 8538 tsk->sched_class->set_curr_task(rq);
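
The reworked check_preempt_curr() only delegates to the class's own callback when the woken task and the current task share a scheduling class; otherwise it rescheds purely on class priority, walking the classes highest-first. A toy model of that ordering decision (class names as strings are an illustration only; the real code compares struct sched_class pointers via for_each_class()):

#include <stdio.h>
#include <string.h>

/* Scheduling classes in (roughly) descending priority order. */
static const char *const classes[] = { "stop", "rt", "fair", "idle" };

/* Return 1 if a woken task of class "wakee" preempts a current task of
 * class "curr" on class priority alone; same-class cases return 0 and
 * are left to the class's own check_preempt_curr callback. */
static int class_preempts(const char *wakee, const char *curr)
{
	size_t i;

	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		if (!strcmp(classes[i], curr))
			return 0;	/* hit current's class first */
		if (!strcmp(classes[i], wakee))
			return 1;	/* wakee's class ranks higher */
	}
	return 0;
}

int main(void)
{
	printf("rt wakes on a fair cpu:  preempt=%d\n",
	       class_preempts("rt", "fair"));
	printf("fair wakes on an rt cpu: preempt=%d\n",
	       class_preempts("fair", "rt"));
	return 0;
}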
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 933f3d1b62ea..00ebd7686676 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1654,12 +1654,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1655 int scale = cfs_rq->nr_running >= sched_nr_latency;
1656 1656
1657 if (unlikely(rt_prio(p->prio)))
1658 goto preempt;
1659
1660 if (unlikely(p->sched_class != &fair_sched_class))
1661 return;
1662
1663 if (unlikely(se == pse)) 1657 if (unlikely(se == pse))
1664 return; 1658 return;
1665 1659
@@ -1764,10 +1758,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1764 set_task_cpu(p, this_cpu); 1758 set_task_cpu(p, this_cpu);
1765 activate_task(this_rq, p, 0); 1759 activate_task(this_rq, p, 0);
1766 check_preempt_curr(this_rq, p, 0); 1760 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1771} 1761}
1772 1762
1773/* 1763/*
@@ -2035,13 +2025,16 @@ struct sd_lb_stats {
2035 unsigned long this_load_per_task; 2025 unsigned long this_load_per_task;
2036 unsigned long this_nr_running; 2026 unsigned long this_nr_running;
2037 unsigned long this_has_capacity; 2027 unsigned long this_has_capacity;
2028 unsigned int this_idle_cpus;
2038 2029
2039 /* Statistics of the busiest group */ 2030 /* Statistics of the busiest group */
2031 unsigned int busiest_idle_cpus;
2040 unsigned long max_load; 2032 unsigned long max_load;
2041 unsigned long busiest_load_per_task; 2033 unsigned long busiest_load_per_task;
2042 unsigned long busiest_nr_running; 2034 unsigned long busiest_nr_running;
2043 unsigned long busiest_group_capacity; 2035 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity; 2036 unsigned long busiest_has_capacity;
2037 unsigned int busiest_group_weight;
2045 2038
2046 int group_imb; /* Is there imbalance in this sd */ 2039 int group_imb; /* Is there imbalance in this sd */
2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2040#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2056,8 @@ struct sg_lb_stats {
2063 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2056 unsigned long sum_nr_running; /* Nr tasks running in the group */
2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2057 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2065 unsigned long group_capacity; 2058 unsigned long group_capacity;
2059 unsigned long idle_cpus;
2060 unsigned long group_weight;
2066 int group_imb; /* Is there an imbalance in the group ? */ 2061 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */ 2062 int group_has_capacity; /* Is there extra capacity in the group? */
2068}; 2063};
@@ -2431,7 +2426,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2431 sgs->group_load += load; 2426 sgs->group_load += load;
2432 sgs->sum_nr_running += rq->nr_running; 2427 sgs->sum_nr_running += rq->nr_running;
2433 sgs->sum_weighted_load += weighted_cpuload(i); 2428 sgs->sum_weighted_load += weighted_cpuload(i);
2434 2429 if (idle_cpu(i))
2430 sgs->idle_cpus++;
2435 } 2431 }
2436 2432
2437 /* 2433 /*
@@ -2469,6 +2465,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2465 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2470 if (!sgs->group_capacity) 2466 if (!sgs->group_capacity)
2471 sgs->group_capacity = fix_small_capacity(sd, group); 2467 sgs->group_capacity = fix_small_capacity(sd, group);
2468 sgs->group_weight = group->group_weight;
2472 2469
2473 if (sgs->group_capacity > sgs->sum_nr_running) 2470 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1; 2471 sgs->group_has_capacity = 1;
@@ -2576,13 +2573,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2576 sds->this_nr_running = sgs.sum_nr_running; 2573 sds->this_nr_running = sgs.sum_nr_running;
2577 sds->this_load_per_task = sgs.sum_weighted_load; 2574 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity; 2575 sds->this_has_capacity = sgs.group_has_capacity;
2576 sds->this_idle_cpus = sgs.idle_cpus;
2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2577 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2580 sds->max_load = sgs.avg_load; 2578 sds->max_load = sgs.avg_load;
2581 sds->busiest = sg; 2579 sds->busiest = sg;
2582 sds->busiest_nr_running = sgs.sum_nr_running; 2580 sds->busiest_nr_running = sgs.sum_nr_running;
2581 sds->busiest_idle_cpus = sgs.idle_cpus;
2583 sds->busiest_group_capacity = sgs.group_capacity; 2582 sds->busiest_group_capacity = sgs.group_capacity;
2584 sds->busiest_load_per_task = sgs.sum_weighted_load; 2583 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity; 2584 sds->busiest_has_capacity = sgs.group_has_capacity;
2585 sds->busiest_group_weight = sgs.group_weight;
2586 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2587 } 2587 }
2588 2588
@@ -2860,8 +2860,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2860 if (sds.this_load >= sds.avg_load) 2860 if (sds.this_load >= sds.avg_load)
2861 goto out_balanced; 2861 goto out_balanced;
2862 2862
2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 /*
2864 goto out_balanced; 2864 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
2865 * And to check for busy balance use !idle_cpu instead of
2866 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
2867 * even when they are idle.
2868 */
2869 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
2870 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2871 goto out_balanced;
2872 } else {
2873 /*
2874 * This cpu is idle. If the busiest group load doesn't
2875 * have more tasks than the number of available cpu's and
2876 * there is no imbalance between this and busiest group
2877 * wrt to idle cpu's, it is balanced.
2878 */
2879 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
2880 sds.busiest_nr_running <= sds.busiest_group_weight)
2881 goto out_balanced;
2882 }
2865 2883
2866force_balance: 2884force_balance:
2867 /* Looks like there is an imbalance. Compute it */ 2885 /* Looks like there is an imbalance. Compute it */
@@ -3197,8 +3215,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3197 interval = msecs_to_jiffies(sd->balance_interval); 3215 interval = msecs_to_jiffies(sd->balance_interval);
3198 if (time_after(next_balance, sd->last_balance + interval)) 3216 if (time_after(next_balance, sd->last_balance + interval))
3199 next_balance = sd->last_balance + interval; 3217 next_balance = sd->last_balance + interval;
3200 if (pulled_task) 3218 if (pulled_task) {
3219 this_rq->idle_stamp = 0;
3201 break; 3220 break;
3221 }
3202 } 3222 }
3203 3223
3204 raw_spin_lock(&this_rq->lock); 3224 raw_spin_lock(&this_rq->lock);
@@ -3869,13 +3889,26 @@ static void set_curr_task_fair(struct rq *rq)
3869} 3889}
3870 3890
3871#ifdef CONFIG_FAIR_GROUP_SCHED 3891#ifdef CONFIG_FAIR_GROUP_SCHED
3872static void moved_group_fair(struct task_struct *p, int on_rq) 3892static void task_move_group_fair(struct task_struct *p, int on_rq)
3873{ 3893{
3874 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3894 /*
3875 3895 * If the task was not on the rq at the time of this cgroup movement
3876 update_curr(cfs_rq); 3896 * it must have been asleep, sleeping tasks keep their ->vruntime
3897 * absolute on their old rq until wakeup (needed for the fair sleeper
3898 * bonus in place_entity()).
3899 *
3900 * If it was on the rq, we've just 'preempted' it, which does convert
3901 * ->vruntime to a relative base.
3902 *
3903 * Make sure both cases convert their relative position when migrating
3904 * to another cgroup's rq. This does somewhat interfere with the
3905 * fair sleeper stuff for the first placement, but who cares.
3906 */
3907 if (!on_rq)
3908 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
3909 set_task_rq(p, task_cpu(p));
3877 if (!on_rq) 3910 if (!on_rq)
3878 place_entity(cfs_rq, &p->se, 1); 3911 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
3879} 3912}
3880#endif 3913#endif
3881 3914
@@ -3927,7 +3960,7 @@ static const struct sched_class fair_sched_class = {
3927 .get_rr_interval = get_rr_interval_fair, 3960 .get_rr_interval = get_rr_interval_fair,
3928 3961
3929#ifdef CONFIG_FAIR_GROUP_SCHED 3962#ifdef CONFIG_FAIR_GROUP_SCHED
3930 .moved_group = moved_group_fair, 3963 .task_move_group = task_move_group_fair,
3931#endif 3964#endif
3932}; 3965};
3933 3966
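
task_move_group_fair() keeps a task's CFS lag intact across a cgroup move by converting vruntime to a value relative to the old cfs_rq, switching runqueues, and re-basing it on the new one. The arithmetic is just this (made-up nanosecond values for illustration):

#include <stdio.h>

int main(void)
{
	/* A sleeping task keeps an absolute vruntime from its old cfs_rq. */
	unsigned long long vruntime = 1000000;	/* task vruntime (ns) */
	unsigned long long old_min  =  900000;	/* old cfs_rq min_vruntime */
	unsigned long long new_min  = 5000000;	/* new cfs_rq min_vruntime */

	vruntime -= old_min;	/* make it relative to the old queue */
	/* ... set_task_rq() switches the task to the new cfs_rq ... */
	vruntime += new_min;	/* re-base on the new queue */

	/* The 100000 ns of accumulated lag survives the move: 5100000. */
	printf("vruntime on new cfs_rq: %llu\n", vruntime);
	return 0;
}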
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 25c2f962f6fc..48ddf431db0e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
157} 157}
158 158
159/* 159/*
160 * Called when a process is dequeued from the active array and given 160 * We are interested in knowing how long it was from the *first* time a
161 * the cpu. We should note that with the exception of interactive
162 * tasks, the expired queue will become the active queue after the active
163 * queue is empty, without explicitly dequeuing and requeuing tasks in the
164 * expired queue. (Interactive tasks may be requeued directly to the
165 * active queue, thus delaying tasks in the expired queue from running;
166 * see scheduler_tick()).
167 *
168 * Though we are interested in knowing how long it was from the *first* time a
169 * task was queued to the time that it finally hit a cpu, we call this routine 161 * task was queued to the time that it finally hit a cpu, we call this routine
170 * from dequeue_task() to account for possible rq->clock skew across cpus. The 162 * from dequeue_task() to account for possible rq->clock skew across cpus. The
171 * delta taken on each cpu would annul the skew. 163 * delta taken on each cpu would annul the skew.
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)
203} 195}
204 196
205/* 197/*
206 * Called when a process is queued into either the active or expired
207 * array. The time is noted and later used to determine how long we
208 * had to wait for us to reach the cpu. Since the expired queue will
209 * become the active queue after active queue is empty, without dequeuing
210 * and requeuing any tasks, we are interested in queuing to either. It
211 * is unusual but not impossible for tasks to be dequeued and immediately
212 * requeued in the same or another array: this can happen in sched_yield(),
213 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
214 * to runqueue.
215 *
216 * This function is only called from enqueue_task(), but also only updates 198 * This function is only called from enqueue_task(), but also only updates
217 * the timestamp if it is already not set. It's assumed that 199 * the timestamp if it is already not set. It's assumed that
218 * sched_info_dequeued() will clear that stamp when appropriate. 200 * sched_info_dequeued() will clear that stamp when appropriate.
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 45bddc0c1048..2bf6b47058c1 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p,
19static void 19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) 20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{ 21{
22 resched_task(rq->curr); /* we preempt everything */ 22 /* we're never preempted */
23} 23}
24 24
25static struct task_struct *pick_next_task_stop(struct rq *rq) 25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 26{
27 struct task_struct *stop = rq->stop; 27 struct task_struct *stop = rq->stop;
28 28
29 if (stop && stop->state == TASK_RUNNING) 29 if (stop && stop->se.on_rq)
30 return stop; 30 return stop;
31 31
32 return NULL; 32 return NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index 919562c3d6b7..4e3cff10fdce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)
1105 return count; 1105 return count;
1106} 1106}
1107 1107
1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1109 unsigned long *flags)
1109{ 1110{
1110 struct sighand_struct *sighand; 1111 struct sighand_struct *sighand;
1111 1112
@@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)
1617 * is gone, we keep current->exit_code unless clear_code. 1618 * is gone, we keep current->exit_code unless clear_code.
1618 */ 1619 */
1619static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1620static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1621 __releases(&current->sighand->siglock)
1622 __acquires(&current->sighand->siglock)
1620{ 1623{
1621 if (arch_ptrace_stop_needed(exit_code, info)) { 1624 if (arch_ptrace_stop_needed(exit_code, info)) {
1622 /* 1625 /*
diff --git a/kernel/smp.c b/kernel/smp.c
index ed6aacfcb7ef..12ed8b013e2d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
267 * 267 *
268 * Returns 0 on success, else a negative status code. 268 * Returns 0 on success, else a negative status code.
269 */ 269 */
270int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
271 int wait) 271 int wait)
272{ 272{
273 struct call_single_data d = { 273 struct call_single_data d = {
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single);
336 * 3) any other online cpu in @mask 336 * 3) any other online cpu in @mask
337 */ 337 */
338int smp_call_function_any(const struct cpumask *mask, 338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait) 339 smp_call_func_t func, void *info, int wait)
340{ 340{
341 unsigned int cpu; 341 unsigned int cpu;
342 const struct cpumask *nodemask; 342 const struct cpumask *nodemask;
@@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
416 * must be disabled when calling this function. 416 * must be disabled when calling this function.
417 */ 417 */
418void smp_call_function_many(const struct cpumask *mask, 418void smp_call_function_many(const struct cpumask *mask,
419 void (*func)(void *), void *info, bool wait) 419 smp_call_func_t func, void *info, bool wait)
420{ 420{
421 struct call_function_data *data; 421 struct call_function_data *data;
422 unsigned long flags; 422 unsigned long flags;
@@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many);
500 * You must not call this function with disabled interrupts or from a 500 * You must not call this function with disabled interrupts or from a
501 * hardware interrupt handler or from a bottom half handler. 501 * hardware interrupt handler or from a bottom half handler.
502 */ 502 */
503int smp_call_function(void (*func)(void *), void *info, int wait) 503int smp_call_function(smp_call_func_t func, void *info, int wait)
504{ 504{
505 preempt_disable(); 505 preempt_disable();
506 smp_call_function_many(cpu_online_mask, func, info, wait); 506 smp_call_function_many(cpu_online_mask, func, info, wait);
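
The smp.c hunks are a pure type cleanup: the repeated void (*func)(void *info) parameter becomes the smp_call_func_t typedef. A minimal sketch of the pattern (call_on_cpu() is a made-up stand-in, not a kernel API):

#include <stdio.h>

/* The typedef simply names the existing cross-call signature. */
typedef void (*smp_call_func_t)(void *info);

static void say_hello(void *info)
{
	printf("hello from %s\n", (const char *)info);
}

/* Stand-in for smp_call_function_single(): run the callback locally. */
static int call_on_cpu(int cpu, smp_call_func_t func, void *info, int wait)
{
	(void)cpu;
	(void)wait;
	func(info);
	return 0;
}

int main(void)
{
	return call_on_cpu(0, say_hello, (void *)"cpu 0", 1);
}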
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f02a9dfa19bc..18f4be0d5fe0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -229,18 +229,20 @@ restart:
229 229
230 do { 230 do {
231 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
232 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
233 kstat_incr_softirqs_this_cpu(h - softirq_vec);
234 234
235 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
236 h->action(h); 238 h->action(h);
237 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
238 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
239 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
240 "with preempt_count %08x," 242 "with preempt_count %08x,"
241 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
242 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
243 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
244 preempt_count() = prev_count; 246 preempt_count() = prev_count;
245 } 247 }
246 248
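
The softirq.c change hoists the repeated h - softirq_vec subtraction into a single vec_nr and switches the printk format from %td to %u accordingly. The index computation itself is plain pointer arithmetic, shown here in isolation (array size and index are made up):

#include <stdio.h>

struct softirq_action {
	void (*action)(struct softirq_action *);
};

int main(void)
{
	struct softirq_action softirq_vec[10];
	struct softirq_action *h = &softirq_vec[3];

	/* The same element index the patch computes once as vec_nr and
	 * then reuses for the stats, the tracepoints and the printk. */
	unsigned int vec_nr = h - softirq_vec;

	printf("vec_nr = %u\n", vec_nr);
	return 0;
}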
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 090c28812ce1..2df820b03beb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -262,7 +262,7 @@ repeat:
262 cpu_stop_fn_t fn = work->fn; 262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg; 263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done; 264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN]; 265 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
266 266
267 __set_current_state(TASK_RUNNING); 267 __set_current_state(TASK_RUNNING);
268 268
@@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
305 cpu); 305 cpu);
306 if (IS_ERR(p)) 306 if (IS_ERR(p))
307 return NOTIFY_BAD; 307 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu); 309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p); 310 sched_set_stop_task(cpu, p);
@@ -372,7 +372,7 @@ static int __init cpu_stop_init(void)
372 /* start one for the boot cpu */ 372 /* start one for the boot cpu */
373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, 373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
374 bcpu); 374 bcpu);
375 BUG_ON(err == NOTIFY_BAD); 375 BUG_ON(err != NOTIFY_OK);
376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
377 register_cpu_notifier(&cpu_stop_cpu_notifier); 377 register_cpu_notifier(&cpu_stop_cpu_notifier);
378 378
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a45c224770f..5abfa1518554 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -161,8 +161,6 @@ extern int no_unaligned_warning;
161extern int unaligned_dump_stack; 161extern int unaligned_dump_stack;
162#endif 162#endif
163 163
164extern struct ratelimit_state printk_ratelimit_state;
165
166#ifdef CONFIG_PROC_SYSCTL 164#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 165static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 166 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -704,6 +702,15 @@ static struct ctl_table kern_table[] = {
704 .extra1 = &zero, 702 .extra1 = &zero,
705 .extra2 = &ten_thousand, 703 .extra2 = &ten_thousand,
706 }, 704 },
705 {
706 .procname = "dmesg_restrict",
707 .data = &dmesg_restrict,
708 .maxlen = sizeof(int),
709 .mode = 0644,
710 .proc_handler = proc_dointvec_minmax,
711 .extra1 = &zero,
712 .extra2 = &one,
713 },
707#endif 714#endif
708 { 715 {
709 .procname = "ngroups_max", 716 .procname = "ngroups_max",
@@ -1340,28 +1347,28 @@ static struct ctl_table fs_table[] = {
1340 .data = &inodes_stat, 1347 .data = &inodes_stat,
1341 .maxlen = 2*sizeof(int), 1348 .maxlen = 2*sizeof(int),
1342 .mode = 0444, 1349 .mode = 0444,
1343 .proc_handler = proc_dointvec, 1350 .proc_handler = proc_nr_inodes,
1344 }, 1351 },
1345 { 1352 {
1346 .procname = "inode-state", 1353 .procname = "inode-state",
1347 .data = &inodes_stat, 1354 .data = &inodes_stat,
1348 .maxlen = 7*sizeof(int), 1355 .maxlen = 7*sizeof(int),
1349 .mode = 0444, 1356 .mode = 0444,
1350 .proc_handler = proc_dointvec, 1357 .proc_handler = proc_nr_inodes,
1351 }, 1358 },
1352 { 1359 {
1353 .procname = "file-nr", 1360 .procname = "file-nr",
1354 .data = &files_stat, 1361 .data = &files_stat,
1355 .maxlen = 3*sizeof(int), 1362 .maxlen = sizeof(files_stat),
1356 .mode = 0444, 1363 .mode = 0444,
1357 .proc_handler = proc_nr_files, 1364 .proc_handler = proc_nr_files,
1358 }, 1365 },
1359 { 1366 {
1360 .procname = "file-max", 1367 .procname = "file-max",
1361 .data = &files_stat.max_files, 1368 .data = &files_stat.max_files,
1362 .maxlen = sizeof(int), 1369 .maxlen = sizeof(files_stat.max_files),
1363 .mode = 0644, 1370 .mode = 0644,
1364 .proc_handler = proc_dointvec, 1371 .proc_handler = proc_doulongvec_minmax,
1365 }, 1372 },
1366 { 1373 {
1367 .procname = "nr_open", 1374 .procname = "nr_open",
@@ -1377,7 +1384,7 @@ static struct ctl_table fs_table[] = {
1377 .data = &dentry_stat, 1384 .data = &dentry_stat,
1378 .maxlen = 6*sizeof(int), 1385 .maxlen = 6*sizeof(int),
1379 .mode = 0444, 1386 .mode = 0444,
1380 .proc_handler = proc_dointvec, 1387 .proc_handler = proc_nr_dentry,
1381 }, 1388 },
1382 { 1389 {
1383 .procname = "overflowuid", 1390 .procname = "overflowuid",
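
The new dmesg_restrict entry in kern_table shows up as /proc/sys/kernel/dmesg_restrict, clamped to 0 or 1 by proc_dointvec_minmax. A small user-space check of the current setting (read-only here; changing it needs the appropriate privileges):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/dmesg_restrict", "r");
	int val;

	if (!f) {
		perror("dmesg_restrict");	/* kernels without this sysctl */
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("dmesg_restrict = %d\n", val);
	fclose(f);
	return 0;
}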
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..c8231fb15708 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 175 up_write(&listeners->sem);
176} 176}
177 177
178static int fill_pid(pid_t pid, struct task_struct *tsk, 178static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct taskstats *stats)
180{ 179{
181 int rc = 0;
182
183 if (!tsk) {
184 rcu_read_lock();
185 tsk = find_task_by_vpid(pid);
186 if (tsk)
187 get_task_struct(tsk);
188 rcu_read_unlock();
189 if (!tsk)
190 return -ESRCH;
191 } else
192 get_task_struct(tsk);
193
194 memset(stats, 0, sizeof(*stats)); 180 memset(stats, 0, sizeof(*stats));
195 /* 181 /*
196 * Each accounting subsystem adds calls to its functions to 182 * Each accounting subsystem adds calls to its functions to
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
209 195
210 /* fill in extended acct fields */ 196 /* fill in extended acct fields */
211 xacct_add_tsk(stats, tsk); 197 xacct_add_tsk(stats, tsk);
198}
212 199
213 /* Define err: label here if needed */ 200static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
214 put_task_struct(tsk); 201{
215 return rc; 202 struct task_struct *tsk;
216 203
204 rcu_read_lock();
205 tsk = find_task_by_vpid(pid);
206 if (tsk)
207 get_task_struct(tsk);
208 rcu_read_unlock();
209 if (!tsk)
210 return -ESRCH;
211 fill_stats(tsk, stats);
212 put_task_struct(tsk);
213 return 0;
217} 214}
218 215
219static int fill_tgid(pid_t tgid, struct task_struct *first, 216static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
220 struct taskstats *stats)
221{ 217{
222 struct task_struct *tsk; 218 struct task_struct *tsk, *first;
223 unsigned long flags; 219 unsigned long flags;
224 int rc = -ESRCH; 220 int rc = -ESRCH;
225 221
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 * leaders who are already counted with the dead tasks 224 * leaders who are already counted with the dead tasks
229 */ 225 */
230 rcu_read_lock(); 226 rcu_read_lock();
231 if (!first) 227 first = find_task_by_vpid(tgid);
232 first = find_task_by_vpid(tgid);
233 228
234 if (!first || !lock_task_sighand(first, &flags)) 229 if (!first || !lock_task_sighand(first, &flags))
235 goto out; 230 goto out;
@@ -268,7 +263,6 @@ out:
268 return rc; 263 return rc;
269} 264}
270 265
271
272static void fill_tgid_exit(struct task_struct *tsk) 266static void fill_tgid_exit(struct task_struct *tsk)
273{ 267{
274 unsigned long flags; 268 unsigned long flags;
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
360 struct nlattr *na, *ret; 354 struct nlattr *na, *ret;
361 int aggr; 355 int aggr;
362 356
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 363 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 364 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 365 : TASKSTATS_TYPE_AGGR_TGID;
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
367 na = nla_nest_start(skb, aggr); 367 na = nla_nest_start(skb, aggr);
368 if (!na) 368 if (!na)
369 goto err; 369 goto err;
370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 370 if (nla_put(skb, type, pid_size, pids) < 0)
371 goto err; 371 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 373 if (!ret)
@@ -424,39 +424,46 @@ err:
424 return rc; 424 return rc;
425} 425}
426 426
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 427static int cmd_attr_register_cpumask(struct genl_info *info)
428{ 428{
429 int rc;
430 struct sk_buff *rep_skb;
431 struct taskstats *stats;
432 size_t size;
433 cpumask_var_t mask; 429 cpumask_var_t mask;
430 int rc;
434 431
435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 432 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436 return -ENOMEM; 433 return -ENOMEM;
437
438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 434 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439 if (rc < 0) 435 if (rc < 0)
440 goto free_return_rc; 436 goto out;
441 if (rc == 0) { 437 rc = add_del_listener(info->snd_pid, mask, REGISTER);
442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 438out:
443 goto free_return_rc; 439 free_cpumask_var(mask);
444 } 440 return rc;
441}
442
443static int cmd_attr_deregister_cpumask(struct genl_info *info)
444{
445 cpumask_var_t mask;
446 int rc;
445 447
448 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
449 return -ENOMEM;
446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 450 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447 if (rc < 0) 451 if (rc < 0)
448 goto free_return_rc; 452 goto out;
449 if (rc == 0) { 453 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 454out:
451free_return_rc:
452 free_cpumask_var(mask);
453 return rc;
454 }
455 free_cpumask_var(mask); 455 free_cpumask_var(mask);
456 return rc;
457}
458
459static int cmd_attr_pid(struct genl_info *info)
460{
461 struct taskstats *stats;
462 struct sk_buff *rep_skb;
463 size_t size;
464 u32 pid;
465 int rc;
456 466
457 /*
458 * Size includes space for nested attributes
459 */
460 size = nla_total_size(sizeof(u32)) + 467 size = nla_total_size(sizeof(u32)) +
461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
462 469
@@ -465,33 +472,64 @@ free_return_rc:
465 return rc; 472 return rc;
466 473
467 rc = -EINVAL; 474 rc = -EINVAL;
468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 475 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 476 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 477 if (!stats)
471 if (!stats) 478 goto err;
472 goto err; 479
473 480 rc = fill_stats_for_pid(pid, stats);
474 rc = fill_pid(pid, NULL, stats); 481 if (rc < 0)
475 if (rc < 0) 482 goto err;
476 goto err; 483 return send_reply(rep_skb, info);
477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 484err:
478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 485 nlmsg_free(rep_skb);
479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 486 return rc;
480 if (!stats) 487}
481 goto err; 488
482 489static int cmd_attr_tgid(struct genl_info *info)
483 rc = fill_tgid(tgid, NULL, stats); 490{
484 if (rc < 0) 491 struct taskstats *stats;
485 goto err; 492 struct sk_buff *rep_skb;
486 } else 493 size_t size;
494 u32 tgid;
495 int rc;
496
497 size = nla_total_size(sizeof(u32)) +
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0)
502 return rc;
503
504 rc = -EINVAL;
505 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
506 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
507 if (!stats)
487 goto err; 508 goto err;
488 509
510 rc = fill_stats_for_tgid(tgid, stats);
511 if (rc < 0)
512 goto err;
489 return send_reply(rep_skb, info); 513 return send_reply(rep_skb, info);
490err: 514err:
491 nlmsg_free(rep_skb); 515 nlmsg_free(rep_skb);
492 return rc; 516 return rc;
493} 517}
494 518
519static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
520{
521 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
522 return cmd_attr_register_cpumask(info);
523 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
524 return cmd_attr_deregister_cpumask(info);
525 else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
526 return cmd_attr_pid(info);
527 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
528 return cmd_attr_tgid(info);
529 else
530 return -EINVAL;
531}
532
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 533static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{ 534{
497 struct signal_struct *sig = tsk->signal; 535 struct signal_struct *sig = tsk->signal;
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
555 if (!stats) 593 if (!stats)
556 goto err; 594 goto err;
557 595
558 rc = fill_pid(-1, tsk, stats); 596 fill_stats(tsk, stats);
559 if (rc < 0)
560 goto err;
561 597
562 /* 598 /*
563 * Doesn't matter if tsk is the leader or the last group member leaving 599 * Doesn't matter if tsk is the leader or the last group member leaving
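
The mk_reply() padding change exists because the u32 pid/tgid attribute is followed by the large taskstats struct, which wants natural alignment; padding the pid payload to sizeof(long) keeps that struct 8-byte aligned on 64-bit builds. A quick check of the ALIGN() arithmetic (macro reproduced here for illustration; it assumes a power-of-two alignment):

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* On an LP64 build the 4-byte pid attribute is padded to 8 bytes;
	 * on a 32-bit build it stays 4, so nothing changes there. */
	size_t pid_size = ALIGN(sizeof(unsigned int), sizeof(long));

	printf("pid attribute payload: %zu bytes\n", pid_size);
	return 0;
}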
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e04b8bcdef88..ea37e2ff4164 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -126,7 +126,7 @@ if FTRACE
126config FUNCTION_TRACER 126config FUNCTION_TRACER
127 bool "Kernel Function Tracer" 127 bool "Kernel Function Tracer"
128 depends on HAVE_FUNCTION_TRACER 128 depends on HAVE_FUNCTION_TRACER
129 select FRAME_POINTER if (!ARM_UNWIND) 129 select FRAME_POINTER if !ARM_UNWIND && !S390
130 select KALLSYMS 130 select KALLSYMS
131 select GENERIC_TRACER 131 select GENERIC_TRACER
132 select CONTEXT_SWITCH_TRACER 132 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc251ed66724..7b8ec0281548 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
169 BLK_TC_ACT(BLK_TC_WRITE) }; 169 BLK_TC_ACT(BLK_TC_WRITE) };
170 170
171#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
172#define BLK_TC_RAHEAD BLK_TC_AHEAD 171#define BLK_TC_RAHEAD BLK_TC_AHEAD
173 172
174/* The ilog2() calls fall out because they're constant */ 173/* The ilog2() calls fall out because they're constant */
@@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
196 return; 195 return;
197 196
198 what |= ddir_act[rw & WRITE]; 197 what |= ddir_act[rw & WRITE];
199 what |= MASK_TC_BIT(rw, HARDBARRIER);
200 what |= MASK_TC_BIT(rw, SYNC); 198 what |= MASK_TC_BIT(rw, SYNC);
201 what |= MASK_TC_BIT(rw, RAHEAD); 199 what |= MASK_TC_BIT(rw, RAHEAD);
202 what |= MASK_TC_BIT(rw, META); 200 what |= MASK_TC_BIT(rw, META);
@@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1807 1805
1808 if (rw & REQ_RAHEAD) 1806 if (rw & REQ_RAHEAD)
1809 rwbs[i++] = 'A'; 1807 rwbs[i++] = 'A';
1810 if (rw & REQ_HARDBARRIER)
1811 rwbs[i++] = 'B';
1812 if (rw & REQ_SYNC) 1808 if (rw & REQ_SYNC)
1813 rwbs[i++] = 'S'; 1809 rwbs[i++] = 'S';
1814 if (rw & REQ_META) 1810 if (rw & REQ_META)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c3dab054d18e..9ed509a015d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 224 RB_LEN_TIME_STAMP = 16,
225}; 225};
226 226
227#define skip_time_extend(event) \
228 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
229
227static inline int rb_null_event(struct ring_buffer_event *event) 230static inline int rb_null_event(struct ring_buffer_event *event)
228{ 231{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 232 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 251 return length + RB_EVNT_HDR_SIZE;
249} 252}
250 253
251/* inline for ring buffer fast paths */ 254/*
252static unsigned 255 * Return the length of the given event. Will return
256 * the length of the time extend if the event is a
257 * time extend.
258 */
259static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 260rb_event_length(struct ring_buffer_event *event)
254{ 261{
255 switch (event->type_len) { 262 switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 281 return 0;
275} 282}
276 283
284/*
285 * Return total length of time extend and data,
286 * or just the event length for all other events.
287 */
288static inline unsigned
289rb_event_ts_length(struct ring_buffer_event *event)
290{
291 unsigned len = 0;
292
293 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
294 /* time extends include the data event after it */
295 len = RB_LEN_TIME_EXTEND;
296 event = skip_time_extend(event);
297 }
298 return len + rb_event_length(event);
299}
300
277/** 301/**
278 * ring_buffer_event_length - return the length of the event 302 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 303 * @event: the event to get the length of
304 *
305 * Returns the size of the data load of a data event.
306 * If the event is something other than a data event, it
307 * returns the size of the event itself. With the exception
308 * of a TIME EXTEND, where it still returns the size of the
309 * data load of the data event after it.
280 */ 310 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 311unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 312{
283 unsigned length = rb_event_length(event); 313 unsigned length;
314
315 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
316 event = skip_time_extend(event);
317
318 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 319 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 320 return length;
286 length -= RB_EVNT_HDR_SIZE; 321 length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 329static void *
295rb_event_data(struct ring_buffer_event *event) 330rb_event_data(struct ring_buffer_event *event)
296{ 331{
332 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
333 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 334 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 335 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 336 if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 441/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 442#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 443
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 444int ring_buffer_print_page_header(struct trace_seq *s)
411{ 445{
412 struct buffer_data_page field; 446 struct buffer_data_page field;
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1580 iter->head = 0;
1547} 1581}
1548 1582
1583/* Slow path, do not inline */
1584static noinline struct ring_buffer_event *
1585rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1586{
1587 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1588
1589 /* Not the first event on the page? */
1590 if (rb_event_index(event)) {
1591 event->time_delta = delta & TS_MASK;
1592 event->array[0] = delta >> TS_SHIFT;
1593 } else {
1594 /* nope, just zero it */
1595 event->time_delta = 0;
1596 event->array[0] = 0;
1597 }
1598
1599 return skip_time_extend(event);
1600}
1601
1549/** 1602/**
1550 * ring_buffer_update_event - update event type and data 1603 * ring_buffer_update_event - update event type and data
1551 * @event: the even to update 1604 * @event: the even to update
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1611 * data field.
1559 */ 1612 */
1560static void 1613static void
1561rb_update_event(struct ring_buffer_event *event, 1614rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1615 struct ring_buffer_event *event, unsigned length,
1616 int add_timestamp, u64 delta)
1563{ 1617{
1564 event->type_len = type; 1618 /* Only a commit updates the timestamp */
1565 1619 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1620 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1621
1573 case 0: 1622 /*
1574 length -= RB_EVNT_HDR_SIZE; 1623 * If we need to add a timestamp, then we
1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1624 * add it to the start of the resevered space.
1576 event->array[0] = length; 1625 */
1577 else 1626 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1627 event = rb_add_time_stamp(event, delta);
1579 break; 1628 length -= RB_LEN_TIME_EXTEND;
1580 default: 1629 delta = 0;
1581 BUG();
1582 } 1630 }
1631
1632 event->time_delta = delta;
1633 length -= RB_EVNT_HDR_SIZE;
1634 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1635 event->type_len = 0;
1636 event->array[0] = length;
1637 } else
1638 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1639}
1584 1640
1585/* 1641/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1879 local_sub(length, &tail_page->write);
1824} 1880}
1825 1881
1826static struct ring_buffer_event * 1882/*
1883 * This is the slow path, force gcc not to inline it.
1884 */
1885static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1886rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1887 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1888 struct buffer_page *tail_page, u64 ts)
1830{ 1889{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1890 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1891 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1968 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1969 * just reread the time stamp
1911 */ 1970 */
1912 *ts = rb_time_stamp(buffer); 1971 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1972 next_page->page->time_stamp = ts;
1914 } 1973 }
1915 1974
1916 out_again: 1975 out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1988
1930static struct ring_buffer_event * 1989static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1990__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 1991 unsigned long length, u64 ts,
1992 u64 delta, int add_timestamp)
1933{ 1993{
1934 struct buffer_page *tail_page; 1994 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 1995 struct ring_buffer_event *event;
1936 unsigned long tail, write; 1996 unsigned long tail, write;
1937 1997
1998 /*
1999 * If the time delta since the last event is too big to
2000 * hold in the time field of the event, then we append a
2001 * TIME EXTEND event ahead of the data event.
2002 */
2003 if (unlikely(add_timestamp))
2004 length += RB_LEN_TIME_EXTEND;
2005
1938 tail_page = cpu_buffer->tail_page; 2006 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2007 write = local_add_return(length, &tail_page->write);
1940 2008
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2011 tail = write - length;
1944 2012
1945 /* See if we shot pass the end of this buffer page */ 2013 /* See if we shot pass the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2014 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2015 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2016 tail_page, ts);
1949 2017
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2019
1952 event = __rb_page_index(tail_page, tail); 2020 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2021 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2022 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2023
1956 /* The passed in type is zero for DATA */ 2024 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2025
1960 /* 2026 /*
1961 * If this is the first commit on the page, then update 2027 * If this is the first commit on the page, then update
1962 * its timestamp. 2028 * its timestamp.
1963 */ 2029 */
1964 if (!tail) 2030 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2031 tail_page->page->time_stamp = ts;
1966 2032
1967 return event; 2033 return event;
1968} 2034}
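With this rewrite __rb_reserve_next() no longer reserves a stand-alone timestamp event; when add_timestamp is set it simply grows the reservation by RB_LEN_TIME_EXTEND so the time-extend header and its data event always land next to each other. A hedged sketch of the extend encoding itself, restating what the removed rb_add_time_stamp() wrote and what the new read side further down reassembles; encode_time_extend() is an illustrative name, not a function in this patch:

static void encode_time_extend(struct ring_buffer_event *event, u64 delta)
{
	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
	event->time_delta = delta & TS_MASK;	/* low TS_SHIFT (27) bits */
	event->array[0] = delta >> TS_SHIFT;	/* remaining high bits */
}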
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2043 unsigned long addr;
1978 2044
1979 new_index = rb_event_index(event); 2045 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2046 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2047 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2048 addr &= PAGE_MASK;
1983 2049
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2069 return 0;
2004} 2070}
2005 2071
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2072static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2073{
2071 local_inc(&cpu_buffer->committing); 2074 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2075 local_inc(&cpu_buffer->commits);
2073} 2076}
2074 2077
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2078static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2079{
2077 unsigned long commits; 2080 unsigned long commits;
2078 2081
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2113 unsigned long length)
2111{ 2114{
2112 struct ring_buffer_event *event; 2115 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2116 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2117 int nr_loops = 0;
2118 int add_timestamp;
2119 u64 diff;
2116 2120
2117 rb_start_commit(cpu_buffer); 2121 rb_start_commit(cpu_buffer);
2118 2122
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2137
2134 length = rb_calculate_event_length(length); 2138 length = rb_calculate_event_length(length);
2135 again: 2139 again:
2140 add_timestamp = 0;
2141 delta = 0;
2142
2136 /* 2143 /*
2137 * We allow for interrupts to reenter here and do a trace. 2144 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2145 * If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2153 goto out_fail;
2147 2154
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2155 ts = rb_time_stamp(cpu_buffer->buffer);
2156 diff = ts - cpu_buffer->write_stamp;
2149 2157
2150 /* 2158 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2159 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2160
2161 /* Did the write stamp get updated already? */
2162 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2163 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2164 if (unlikely(test_time_stamp(delta))) {
2174 2165 WARN_ONCE(delta > (1ULL << 59),
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2176 if (commit == -EBUSY) 2167 (unsigned long long)delta,
2177 goto out_fail; 2168 (unsigned long long)ts,
2178 2169 (unsigned long long)cpu_buffer->write_stamp);
2179 if (commit == -EAGAIN) 2170 add_timestamp = 1;
2180 goto again;
2181
2182 RB_WARN_ON(cpu_buffer, commit < 0);
2183 } 2171 }
2184 } 2172 }
2185 2173
2186 get_event: 2174 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2175 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2176 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2177 goto again;
2190 2178
2191 if (!event) 2179 if (!event)
2192 goto out_fail; 2180 goto out_fail;
2193 2181
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2182 return event;
2200 2183
2201 out_fail: 2184 out_fail:
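The simplified rb_reserve_next_event() now always computes diff = ts - write_stamp and, when that delta cannot fit in an event's 27-bit time_delta field, just sets add_timestamp rather than reserving a separate timestamp event. Restating the overflow check with the constants this file uses (a sketch, not new code in the patch):

#define TS_SHIFT	27
#define TS_MASK		((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST	(~TS_MASK)

/* A delta of 2^27 ns (~134 ms since the last event on this CPU)
 * or more no longer fits in time_delta and forces a time extend. */
static inline int test_time_stamp(u64 delta)
{
	return (delta & TS_DELTA_TEST) != 0;
}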
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2190
2208#define TRACE_RECURSIVE_DEPTH 16 2191#define TRACE_RECURSIVE_DEPTH 16
2209 2192
2210static int trace_recursive_lock(void) 2193/* Keep this code out of the fast path cache */
2194static noinline void trace_recursive_fail(void)
2211{ 2195{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2196 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2197 tracing_off_permanent();
2219 2198
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
2225 in_nmi()); 2204 in_nmi());
2226 2205
2227 WARN_ON_ONCE(1); 2206 WARN_ON_ONCE(1);
2207}
2208
2209static inline int trace_recursive_lock(void)
2210{
2211 current->trace_recursion++;
2212
2213 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2214 return 0;
2215
2216 trace_recursive_fail();
2217
2228 return -1; 2218 return -1;
2229} 2219}
2230 2220
2231static void trace_recursive_unlock(void) 2221static inline void trace_recursive_unlock(void)
2232{ 2222{
2233 WARN_ON_ONCE(!current->trace_recursion); 2223 WARN_ON_ONCE(!current->trace_recursion);
2234 2224
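Splitting the failure reporting into a separate noinline trace_recursive_fail() keeps the WARN text and printk arguments out of the inlined fast path, so trace_recursive_lock() stays small in the caller's instruction cache. The same shape in isolation, with hypothetical names:

/* cold path: never inlined, so its code stays off the hot path */
static noinline void report_recursion(void)
{
	WARN_ON_ONCE(1);
}

/* hot path: cheap enough to inline at every call site */
static inline int recursion_enter(int *depth, int max)
{
	if (likely(++(*depth) < max))
		return 0;
	report_recursion();
	return -1;
}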
@@ -2308,12 +2298,28 @@ static void
2308rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2298rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2309 struct ring_buffer_event *event) 2299 struct ring_buffer_event *event)
2310{ 2300{
2301 u64 delta;
2302
2311 /* 2303 /*
2312 * The event first in the commit queue updates the 2304 * The event first in the commit queue updates the
2313 * time stamp. 2305 * time stamp.
2314 */ 2306 */
2315 if (rb_event_is_commit(cpu_buffer, event)) 2307 if (rb_event_is_commit(cpu_buffer, event)) {
2316 cpu_buffer->write_stamp += event->time_delta; 2308 /*
2309 * A commit event that is first on a page
2310 * updates the write timestamp with the page stamp
2311 */
2312 if (!rb_event_index(event))
2313 cpu_buffer->write_stamp =
2314 cpu_buffer->commit_page->page->time_stamp;
2315 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2316 delta = event->array[0];
2317 delta <<= TS_SHIFT;
2318 delta += event->time_delta;
2319 cpu_buffer->write_stamp += delta;
2320 } else
2321 cpu_buffer->write_stamp += event->time_delta;
2322 }
2317} 2323}
2318 2324
2319static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2325static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
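Because a time extend can now precede a data event, rb_update_write_stamp() has to reassemble the full delta from the two halves written at reserve time before advancing write_stamp. The decode in isolation (illustrative helper name; the field layout is the one used above):

/* rebuild the up-to-59-bit delta carried by a time-extend event */
static u64 time_extend_delta(const struct ring_buffer_event *event)
{
	u64 delta = event->array[0];	/* high bits */

	delta <<= TS_SHIFT;
	delta += event->time_delta;	/* low 27 bits */
	return delta;
}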
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2353 2359
2354static inline void rb_event_discard(struct ring_buffer_event *event) 2360static inline void rb_event_discard(struct ring_buffer_event *event)
2355{ 2361{
2362 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2363 event = skip_time_extend(event);
2364
2356 /* array[0] holds the actual length for the discarded event */ 2365 /* array[0] holds the actual length for the discarded event */
2357 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2366 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2358 event->type_len = RINGBUF_TYPE_PADDING; 2367 event->type_len = RINGBUF_TYPE_PADDING;
@@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3049 3058
3050 again: 3059 again:
3051 /* 3060 /*
3052 * We repeat when a timestamp is encountered. It is possible 3061 * We repeat when a time extend is encountered.
3053 * to get multiple timestamps from an interrupt entering just 3062 * Since the time extend is always attached to a data event,
3054 * as one timestamp is about to be written, or from discarded 3063 * we should never loop more than once.
3055 * commits. The most that we can have is the number on a single page. 3064 * (We never hit the following condition more than twice).
3056 */ 3065 */
3057 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3066 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3058 return NULL; 3067 return NULL;
3059 3068
3060 reader = rb_get_reader_page(cpu_buffer); 3069 reader = rb_get_reader_page(cpu_buffer);
@@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3130 return NULL; 3139 return NULL;
3131 3140
3132 /* 3141 /*
3133 * We repeat when a timestamp is encountered. 3142 * We repeat when a time extend is encountered.
3134 * We can get multiple timestamps by nested interrupts or also 3143 * Since the time extend is always attached to a data event,
3135 * if filtering is on (discarding commits). Since discarding 3144 * we should never loop more than once.
3136 * commits can be frequent we can get a lot of timestamps. 3145 * (We never hit the following condition more than twice).
3137 * But we limit them by not adding timestamps if they begin
3138 * at the start of a page.
3139 */ 3146 */
3140 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3147 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3141 return NULL; 3148 return NULL;
3142 3149
3143 if (rb_per_cpu_empty(cpu_buffer)) 3150 if (rb_per_cpu_empty(cpu_buffer))
@@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3835 if (len > (commit - read)) 3842 if (len > (commit - read))
3836 len = (commit - read); 3843 len = (commit - read);
3837 3844
3838 size = rb_event_length(event); 3845 /* Always keep the time extend and data together */
3846 size = rb_event_ts_length(event);
3839 3847
3840 if (len < size) 3848 if (len < size)
3841 goto out_unlock; 3849 goto out_unlock;
@@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3857 break; 3865 break;
3858 3866
3859 event = rb_reader_event(cpu_buffer); 3867 event = rb_reader_event(cpu_buffer);
3860 size = rb_event_length(event); 3868 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event);
3861 } while (len > size); 3870 } while (len > size);
3862 3871
3863 /* update bpage */ 3872 /* update bpage */
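ring_buffer_read_page() now sizes its copies with rb_event_ts_length() so a time extend is never split from the data event bound to it. A sketch of what such a helper has to do, assuming skip_time_extend() steps past the RB_LEN_TIME_EXTEND header to the attached data event (as the rb_event_discard() change above implies); the real helper lives earlier in this file:

static unsigned rb_event_ts_length(struct ring_buffer_event *event)
{
	unsigned len = 0;

	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
		/* count the extend header and look at the data event behind it */
		len = RB_LEN_TIME_EXTEND;
		event = skip_time_extend(event);
	}
	return len + rb_event_length(event);
}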
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 001bcd2ccf4a..c380612273bf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,7 +17,6 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
21#include <linux/notifier.h> 20#include <linux/notifier.h>
22#include <linux/irqflags.h> 21#include <linux/irqflags.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
@@ -1284,6 +1283,8 @@ void trace_dump_stack(void)
1284 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1283 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1285} 1284}
1286 1285
1286static DEFINE_PER_CPU(int, user_stack_count);
1287
1287void 1288void
1288ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1289ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1289{ 1290{
@@ -1302,6 +1303,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1302 if (unlikely(in_nmi())) 1303 if (unlikely(in_nmi()))
1303 return; 1304 return;
1304 1305
1306 /*
1307 * prevent recursion, since the user stack tracing may
1308 * trigger other kernel events.
1309 */
1310 preempt_disable();
1311 if (__this_cpu_read(user_stack_count))
1312 goto out;
1313
1314 __this_cpu_inc(user_stack_count);
1315
1316
1317
1305 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1318 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1306 sizeof(*entry), flags, pc); 1319 sizeof(*entry), flags, pc);
1307 if (!event) 1320 if (!event)
@@ -1319,6 +1332,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1319 save_stack_trace_user(&trace); 1332 save_stack_trace_user(&trace);
1320 if (!filter_check_discard(call, entry, buffer, event)) 1333 if (!filter_check_discard(call, entry, buffer, event))
1321 ring_buffer_unlock_commit(buffer, event); 1334 ring_buffer_unlock_commit(buffer, event);
1335
1336 __this_cpu_dec(user_stack_count);
1337
1338 out:
1339 preempt_enable();
1322} 1340}
1323 1341
1324#ifdef UNUSED 1342#ifdef UNUSED
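The user_stack_count additions above are a per-CPU recursion fence: preemption is disabled so the counter really belongs to this CPU, and a nested entry (say, a fault taken while reading the user stack that itself gets traced) bails out instead of recursing. The bare pattern, with hypothetical names (kernel context, <linux/percpu.h>):

static DEFINE_PER_CPU(int, in_user_stack_trace);

static void trace_user_stack_guarded(void)
{
	preempt_disable();			/* pin to this CPU's counter */
	if (__this_cpu_read(in_user_stack_trace))
		goto out;			/* already tracing here: bail */
	__this_cpu_inc(in_user_stack_trace);

	/* ... reserve, fill and commit the user stack event ... */

	__this_cpu_dec(in_user_stack_trace);
out:
	preempt_enable();
}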
@@ -3996,13 +4014,9 @@ static void tracing_init_debugfs_percpu(long cpu)
3996{ 4014{
3997 struct dentry *d_percpu = tracing_dentry_percpu(); 4015 struct dentry *d_percpu = tracing_dentry_percpu();
3998 struct dentry *d_cpu; 4016 struct dentry *d_cpu;
3999 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 4017 char cpu_dir[30]; /* 30 characters should be more than enough */
4000 char cpu_dir[7];
4001
4002 if (cpu > 999 || cpu < 0)
4003 return;
4004 4018
4005 sprintf(cpu_dir, "cpu%ld", cpu); 4019 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4006 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4020 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4007 if (!d_cpu) { 4021 if (!d_cpu) {
4008 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4022 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
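The old cpu_dir buffer held exactly "cpu" plus three digits and a terminating NUL, which is why CPUs above 999 had to be skipped. With a 30-byte buffer and snprintf() the cap goes away: even the largest 64-bit cpu value needs at most 20 digits, so 3 + 20 + 1 = 24 bytes still fit with room to spare.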
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b8d2852baa4a..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h> 33#include <linux/limits.h>
34#include <linux/uaccess.h>
35#include <asm/bitsperlong.h> 34#include <asm/bitsperlong.h>
36 35
37#include "trace.h" 36#include "trace.h"
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
63 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
65 rcu_read_unlock(); 65 rcu_read_unlock();
66 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
68 stats->ac_utimescaled = 68 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
69 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; 69 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
70 stats->ac_stimescaled =
71 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
72 stats->ac_minflt = tsk->min_flt; 70 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 71 stats->ac_majflt = tsk->maj_flt;
74 72
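The old conversions rounded CPU time down to milliseconds and then scaled back up, so the taskstats fields lost sub-millisecond resolution on architectures that account cputime that precisely. As a worked example (hedged, assuming microsecond-granular cputime accounting): a utime of 1234567 µs used to be reported as 1234 ms × 1000 = 1234000 µs, whereas cputime_to_usecs() reports 1234567 µs directly.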
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..2c7d8d5914b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
91 * upon function exit. 91 * upon function exit.
92 */ 92 */
93static void free_user(struct user_struct *up, unsigned long flags) 93static void free_user(struct user_struct *up, unsigned long flags)
94 __releases(&uidhash_lock)
94{ 95{
95 uid_hash_remove(up); 96 uid_hash_remove(up);
96 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
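The __releases() annotation records for sparse that free_user() is entered with uidhash_lock held and returns with it dropped, silencing the context-imbalance warning. The counterpart idiom, with hypothetical names (the annotations compile away outside sparse):

static void table_lock(spinlock_t *lock)
	__acquires(lock)
{
	spin_lock(lock);	/* taken here, released by table_unlock() */
}

static void table_unlock(spinlock_t *lock)
	__releases(lock)
{
	spin_unlock(lock);
}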
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..b0310eb6cc1e 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95/* 95/**
96 * finish_wait - clean up after waiting in a queue 96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 97 * @q: waitqueue waited on
98 * @wait: wait descriptor 98 * @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
127} 127}
128EXPORT_SYMBOL(finish_wait); 128EXPORT_SYMBOL(finish_wait);
129 129
130/* 130/**
131 * abort_exclusive_wait - abort exclusive waiting in a queue 131 * abort_exclusive_wait - abort exclusive waiting in a queue
132 * @q: waitqueue waited on 132 * @q: waitqueue waited on
133 * @wait: wait descriptor 133 * @wait: wait descriptor
134 * @state: runstate of the waiter to be woken 134 * @mode: runstate of the waiter to be woken
135 * @key: key to identify a wait bit queue or %NULL 135 * @key: key to identify a wait bit queue or %NULL
136 * 136 *
137 * Sets current thread back to running state and removes 137 * Sets current thread back to running state and removes
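The /* → /** change matters because scripts/kernel-doc only extracts comments that open with two asterisks, and it warns when an @parameter name does not match the prototype, which is what the @state → @mode rename fixes. A minimal kernel-doc block for a hypothetical helper:

/**
 * example_wait_helper - one-line summary shown in the generated docs
 * @q:    waitqueue being waited on
 * @mode: runstate of the waiter to be woken
 *
 * Names after '@' must match the function's parameter names exactly.
 */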
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index bafba687a6d8..6e3c41a4024c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __initdata no_watchdog; 46static int no_watchdog;
47 47
48 48
49/* boot commands */ 49/* boot commands */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 30acdb74cc23..90db1bd1a978 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2064,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2064 * checks and call back into the fixup functions where we 2064 * checks and call back into the fixup functions where we
2065 * might deadlock. 2065 * might deadlock.
2066 */ 2066 */
2067 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2067 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2068 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2068 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2069 init_completion(&barr->done); 2069 init_completion(&barr->done);
2070 2070
@@ -2791,7 +2791,9 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2791 } 2791 }
2792 } 2792 }
2793 2793
2794 /* just in case, make sure it's actually aligned */ 2794 /* just in case, make sure it's actually aligned
2795 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2796 */
2795 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 2797 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2796 return wq->cpu_wq.v ? 0 : -ENOMEM; 2798 return wq->cpu_wq.v ? 0 : -ENOMEM;
2797} 2799}