Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 12
-rw-r--r--  kernel/audit_tree.c | 16
-rw-r--r--  kernel/auditfilter.c | 23
-rw-r--r--  kernel/auditsc.c | 56
-rw-r--r--  kernel/bpf/Makefile | 2
-rw-r--r--  kernel/bpf/arraymap.c | 156
-rw-r--r--  kernel/bpf/core.c | 2
-rw-r--r--  kernel/bpf/hashtab.c | 367
-rw-r--r--  kernel/bpf/helpers.c | 89
-rw-r--r--  kernel/bpf/syscall.c | 6
-rw-r--r--  kernel/bpf/test_stub.c | 56
-rw-r--r--  kernel/bpf/verifier.c | 171
-rw-r--r--  kernel/cgroup.c | 175
-rw-r--r--  kernel/cpuset.c | 162
-rw-r--r--  kernel/debug/debug_core.c | 52
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 37
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 269
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/events/core.c | 25
-rw-r--r--  kernel/events/uprobes.c | 8
-rw-r--r--  kernel/exit.c | 278
-rw-r--r--  kernel/extable.c | 7
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/gcov/Kconfig | 5
-rw-r--r--  kernel/groups.c | 11
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/irqdesc.c | 52
-rw-r--r--  kernel/irq/proc.c | 22
-rw-r--r--  kernel/irq_work.c | 4
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 43
-rw-r--r--  kernel/kprobes.c | 22
-rw-r--r--  kernel/locking/mutex-debug.c | 2
-rw-r--r--  kernel/module.c | 251
-rw-r--r--  kernel/nsproxy.c | 10
-rw-r--r--  kernel/panic.c | 13
-rw-r--r--  kernel/params.c | 100
-rw-r--r--  kernel/pid.c | 7
-rw-r--r--  kernel/pid_namespace.c | 57
-rw-r--r--  kernel/power/Kconfig | 21
-rw-r--r--  kernel/power/hibernate.c | 14
-rw-r--r--  kernel/power/power.h | 3
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/swap.c | 43
-rw-r--r--  kernel/printk/printk.c | 96
-rw-r--r--  kernel/ptrace.c | 23
-rw-r--r--  kernel/range.c | 10
-rw-r--r--  kernel/res_counter.c | 211
-rw-r--r--  kernel/sched/core.c | 17
-rw-r--r--  kernel/sched/deadline.c | 25
-rw-r--r--  kernel/sched/fair.c | 6
-rw-r--r--  kernel/stacktrace.c | 32
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 16
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/time.c | 1
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 151
-rw-r--r--  kernel/trace/ftrace.c | 383
-rw-r--r--  kernel/trace/ring_buffer.c | 75
-rw-r--r--  kernel/trace/trace.c | 254
-rw-r--r--  kernel/trace/trace.h | 31
-rw-r--r--  kernel/trace/trace_branch.c | 47
-rw-r--r--  kernel/trace/trace_events.c | 126
-rw-r--r--  kernel/trace/trace_events_filter.c | 29
-rw-r--r--  kernel/trace/trace_events_trigger.c | 6
-rw-r--r--  kernel/trace/trace_functions.c | 119
-rw-r--r--  kernel/trace/trace_functions_graph.c | 423
-rw-r--r--  kernel/trace/trace_kdb.c | 25
-rw-r--r--  kernel/trace/trace_kprobe.c | 46
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 52
-rw-r--r--  kernel/trace/trace_output.c | 446
-rw-r--r--  kernel/trace/trace_output.h | 16
-rw-r--r--  kernel/trace/trace_printk.c | 2
-rw-r--r--  kernel/trace/trace_probe.c | 10
-rw-r--r--  kernel/trace/trace_sched_switch.c | 144
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 56
-rw-r--r--  kernel/trace/trace_seq.c | 253
-rw-r--r--  kernel/trace/trace_syscalls.c | 54
-rw-r--r--  kernel/trace/trace_uprobe.c | 28
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user.c | 6
-rw-r--r--  kernel/user_namespace.c | 153
-rw-r--r--  kernel/utsname.c | 31
-rw-r--r--  kernel/workqueue.c | 55
90 files changed, 3554 insertions(+), 2596 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 17ea6d4a9a24..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 1f37f15117e5..72ab759a0b43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
429 * This function doesn't consume an skb as might be expected since it has to 429 * This function doesn't consume an skb as might be expected since it has to
430 * copy it anyways. 430 * copy it anyways.
431 */ 431 */
432static void kauditd_send_multicast_skb(struct sk_buff *skb) 432static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
433{ 433{
434 struct sk_buff *copy; 434 struct sk_buff *copy;
435 struct audit_net *aunet = net_generic(&init_net, audit_net_id); 435 struct audit_net *aunet = net_generic(&init_net, audit_net_id);
@@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)
448 * no reason for new multicast clients to continue with this 448 * no reason for new multicast clients to continue with this
449 * non-compliance. 449 * non-compliance.
450 */ 450 */
451 copy = skb_copy(skb, GFP_KERNEL); 451 copy = skb_copy(skb, gfp_mask);
452 if (!copy) 452 if (!copy)
453 return; 453 return;
454 454
455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); 455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
456} 456}
457 457
458/* 458/*
@@ -833,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
833 s.backlog_limit = audit_backlog_limit; 833 s.backlog_limit = audit_backlog_limit;
834 s.lost = atomic_read(&audit_lost); 834 s.lost = atomic_read(&audit_lost);
835 s.backlog = skb_queue_len(&audit_skb_queue); 835 s.backlog = skb_queue_len(&audit_skb_queue);
836 s.version = AUDIT_VERSION_LATEST; 836 s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
837 s.backlog_wait_time = audit_backlog_wait_time; 837 s.backlog_wait_time = audit_backlog_wait_time;
838 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); 838 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
839 break; 839 break;
@@ -1100,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb)
1100} 1100}
1101 1101
1102/* Run custom bind function on netlink socket group connect or bind requests. */ 1102/* Run custom bind function on netlink socket group connect or bind requests. */
1103static int audit_bind(int group) 1103static int audit_bind(struct net *net, int group)
1104{ 1104{
1105 if (!capable(CAP_AUDIT_READ)) 1105 if (!capable(CAP_AUDIT_READ))
1106 return -EPERM; 1106 return -EPERM;
@@ -1940,7 +1940,7 @@ void audit_log_end(struct audit_buffer *ab)
1940 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1940 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1941 1941
1942 nlh->nlmsg_len = ab->skb->len; 1942 nlh->nlmsg_len = ab->skb->len;
1943 kauditd_send_multicast_skb(ab->skb); 1943 kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
1944 1944
1945 /* 1945 /*
1946 * The original kaudit unicast socket sends up messages with 1946 * The original kaudit unicast socket sends up messages with
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 80f29e015570..2e0c97427b33 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk)
174 struct fsnotify_mark *entry = &chunk->mark; 174 struct fsnotify_mark *entry = &chunk->mark;
175 struct list_head *list; 175 struct list_head *list;
176 176
177 if (!entry->i.inode) 177 if (!entry->inode)
178 return; 178 return;
179 list = chunk_hash(entry->i.inode); 179 list = chunk_hash(entry->inode);
180 list_add_rcu(&chunk->hash, list); 180 list_add_rcu(&chunk->hash, list);
181} 181}
182 182
@@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
188 188
189 list_for_each_entry_rcu(p, list, hash) { 189 list_for_each_entry_rcu(p, list, hash) {
190 /* mark.inode may have gone NULL, but who cares? */ 190 /* mark.inode may have gone NULL, but who cares? */
191 if (p->mark.i.inode == inode) { 191 if (p->mark.inode == inode) {
192 atomic_long_inc(&p->refs); 192 atomic_long_inc(&p->refs);
193 return p; 193 return p;
194 } 194 }
@@ -231,7 +231,7 @@ static void untag_chunk(struct node *p)
231 new = alloc_chunk(size); 231 new = alloc_chunk(size);
232 232
233 spin_lock(&entry->lock); 233 spin_lock(&entry->lock);
234 if (chunk->dead || !entry->i.inode) { 234 if (chunk->dead || !entry->inode) {
235 spin_unlock(&entry->lock); 235 spin_unlock(&entry->lock);
236 if (new) 236 if (new)
237 free_chunk(new); 237 free_chunk(new);
@@ -258,7 +258,7 @@ static void untag_chunk(struct node *p)
258 goto Fallback; 258 goto Fallback;
259 259
260 fsnotify_duplicate_mark(&new->mark, entry); 260 fsnotify_duplicate_mark(&new->mark, entry);
261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
262 fsnotify_put_mark(&new->mark); 262 fsnotify_put_mark(&new->mark);
263 goto Fallback; 263 goto Fallback;
264 } 264 }
@@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
386 chunk_entry = &chunk->mark; 386 chunk_entry = &chunk->mark;
387 387
388 spin_lock(&old_entry->lock); 388 spin_lock(&old_entry->lock);
389 if (!old_entry->i.inode) { 389 if (!old_entry->inode) {
390 /* old_entry is being shot, lets just lie */ 390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock); 391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry); 392 fsnotify_put_mark(old_entry);
@@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
395 } 395 }
396 396
397 fsnotify_duplicate_mark(chunk_entry, old_entry); 397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { 398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock); 399 spin_unlock(&old_entry->lock);
400 fsnotify_put_mark(chunk_entry); 400 fsnotify_put_mark(chunk_entry);
401 fsnotify_put_mark(old_entry); 401 fsnotify_put_mark(old_entry);
@@ -611,7 +611,7 @@ void audit_trim_trees(void)
611 list_for_each_entry(node, &tree->chunks, list) { 611 list_for_each_entry(node, &tree->chunks, list) {
612 struct audit_chunk *chunk = find_chunk(node); 612 struct audit_chunk *chunk = find_chunk(node);
613 /* this could be NULL if the watch is dying else where... */ 613 /* this could be NULL if the watch is dying else where... */
614 struct inode *inode = chunk->mark.i.inode; 614 struct inode *inode = chunk->mark.inode;
615 node->index |= 1U<<31; 615 node->index |= 1U<<31;
616 if (iterate_mounts(compare_root, inode, root_mnt)) 616 if (iterate_mounts(compare_root, inode, root_mnt))
617 node->index &= ~(1U<<31); 617 node->index &= ~(1U<<31);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 3598e13f2a65..4f68a326d92e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -442,19 +442,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
442 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { 442 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
443 f->type = AUDIT_LOGINUID_SET; 443 f->type = AUDIT_LOGINUID_SET;
444 f->val = 0; 444 f->val = 0;
445 } 445 entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
446
447 if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
448 struct pid *pid;
449 rcu_read_lock();
450 pid = find_vpid(f->val);
451 if (!pid) {
452 rcu_read_unlock();
453 err = -ESRCH;
454 goto exit_free;
455 }
456 f->val = pid_nr(pid);
457 rcu_read_unlock();
458 } 446 }
459 447
460 err = audit_field_valid(entry, f); 448 err = audit_field_valid(entry, f);
@@ -630,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
630 data->buflen += data->values[i] = 618 data->buflen += data->values[i] =
631 audit_pack_string(&bufp, krule->filterkey); 619 audit_pack_string(&bufp, krule->filterkey);
632 break; 620 break;
621 case AUDIT_LOGINUID_SET:
622 if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
623 data->fields[i] = AUDIT_LOGINUID;
624 data->values[i] = AUDIT_UID_UNSET;
625 break;
626 }
627 /* fallthrough if set */
633 default: 628 default:
634 data->values[i] = f->val; 629 data->values[i] = f->val;
635 } 630 }
@@ -646,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
646 int i; 641 int i;
647 642
648 if (a->flags != b->flags || 643 if (a->flags != b->flags ||
644 a->pflags != b->pflags ||
649 a->listnr != b->listnr || 645 a->listnr != b->listnr ||
650 a->action != b->action || 646 a->action != b->action ||
651 a->field_count != b->field_count) 647 a->field_count != b->field_count)
@@ -764,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
764 new = &entry->rule; 760 new = &entry->rule;
765 new->vers_ops = old->vers_ops; 761 new->vers_ops = old->vers_ops;
766 new->flags = old->flags; 762 new->flags = old->flags;
763 new->pflags = old->pflags;
767 new->listnr = old->listnr; 764 new->listnr = old->listnr;
768 new->action = old->action; 765 new->action = old->action;
769 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 766 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e420a0c41b5f..072566dd0caf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -72,6 +72,8 @@
72#include <linux/fs_struct.h> 72#include <linux/fs_struct.h>
73#include <linux/compat.h> 73#include <linux/compat.h>
74#include <linux/ctype.h> 74#include <linux/ctype.h>
75#include <linux/string.h>
76#include <uapi/linux/limits.h>
75 77
76#include "audit.h" 78#include "audit.h"
77 79
@@ -1861,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1861 } 1863 }
1862 1864
1863 list_for_each_entry_reverse(n, &context->names_list, list) { 1865 list_for_each_entry_reverse(n, &context->names_list, list) {
1864 /* does the name pointer match? */ 1866 if (!n->name || strcmp(n->name->name, name->name))
1865 if (!n->name || n->name->name != name->name)
1866 continue; 1867 continue;
1867 1868
1868 /* match the correct record type */ 1869 /* match the correct record type */
@@ -1877,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1877 } 1878 }
1878 1879
1879out_alloc: 1880out_alloc:
1880 /* unable to find the name from a previous getname(). Allocate a new 1881 /* unable to find an entry with both a matching name and type */
1881 * anonymous entry. 1882 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
1882 */
1883 n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
1884 if (!n) 1883 if (!n)
1885 return; 1884 return;
1885 /* unfortunately, while we may have a path name to record with the
1886 * inode, we can't always rely on the string lasting until the end of
1887 * the syscall so we need to create our own copy, it may fail due to
1888 * memory allocation issues, but we do our best */
1889 if (name) {
1890 /* we can't use getname_kernel() due to size limits */
1891 size_t len = strlen(name->name) + 1;
1892 struct filename *new = __getname();
1893
1894 if (unlikely(!new))
1895 goto out;
1896
1897 if (len <= (PATH_MAX - sizeof(*new))) {
1898 new->name = (char *)(new) + sizeof(*new);
1899 new->separate = false;
1900 } else if (len <= PATH_MAX) {
1901 /* this looks odd, but is due to final_putname() */
1902 struct filename *new2;
1903
1904 new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
1905 if (unlikely(!new2)) {
1906 __putname(new);
1907 goto out;
1908 }
1909 new2->name = (char *)new;
1910 new2->separate = true;
1911 new = new2;
1912 } else {
1913 /* we should never get here, but let's be safe */
1914 __putname(new);
1915 goto out;
1916 }
1917 strlcpy((char *)new->name, name->name, len);
1918 new->uptr = NULL;
1919 new->aname = n;
1920 n->name = new;
1921 n->name_put = true;
1922 }
1886out: 1923out:
1887 if (parent) { 1924 if (parent) {
1888 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; 1925 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1897,6 +1934,11 @@ out:
1897 audit_copy_inode(n, dentry, inode); 1934 audit_copy_inode(n, dentry, inode);
1898} 1935}
1899 1936
1937void __audit_file(const struct file *file)
1938{
1939 __audit_inode(NULL, file->f_path.dentry, 0);
1940}
1941
1900/** 1942/**
1901 * __audit_inode_child - collect inode info for created/removed objects 1943 * __audit_inode_child - collect inode info for created/removed objects
1902 * @parent: inode of dentry parent 1944 * @parent: inode of dentry parent
@@ -2373,7 +2415,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2373 ax->d.next = context->aux; 2415 ax->d.next = context->aux;
2374 context->aux = (void *)ax; 2416 context->aux = (void *)ax;
2375 2417
2376 dentry = dget(bprm->file->f_dentry); 2418 dentry = dget(bprm->file->f_path.dentry);
2377 get_vfs_caps_from_disk(dentry, &vcaps); 2419 get_vfs_caps_from_disk(dentry, &vcaps);
2378 dput(dentry); 2420 dput(dentry);
2379 2421
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/err.h>
14#include <linux/vmalloc.h>
15#include <linux/slab.h>
16#include <linux/mm.h>
17
18struct bpf_array {
19 struct bpf_map map;
20 u32 elem_size;
21 char value[0] __aligned(8);
22};
23
24/* Called from syscall */
25static struct bpf_map *array_map_alloc(union bpf_attr *attr)
26{
27 struct bpf_array *array;
28 u32 elem_size, array_size;
29
30 /* check sanity of attributes */
31 if (attr->max_entries == 0 || attr->key_size != 4 ||
32 attr->value_size == 0)
33 return ERR_PTR(-EINVAL);
34
35 elem_size = round_up(attr->value_size, 8);
36
37 /* check round_up into zero and u32 overflow */
38 if (elem_size == 0 ||
39 attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
40 return ERR_PTR(-ENOMEM);
41
42 array_size = sizeof(*array) + attr->max_entries * elem_size;
43
44 /* allocate all map elements and zero-initialize them */
45 array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
46 if (!array) {
47 array = vzalloc(array_size);
48 if (!array)
49 return ERR_PTR(-ENOMEM);
50 }
51
52 /* copy mandatory map attributes */
53 array->map.key_size = attr->key_size;
54 array->map.value_size = attr->value_size;
55 array->map.max_entries = attr->max_entries;
56
57 array->elem_size = elem_size;
58
59 return &array->map;
60}
61
62/* Called from syscall or from eBPF program */
63static void *array_map_lookup_elem(struct bpf_map *map, void *key)
64{
65 struct bpf_array *array = container_of(map, struct bpf_array, map);
66 u32 index = *(u32 *)key;
67
68 if (index >= array->map.max_entries)
69 return NULL;
70
71 return array->value + array->elem_size * index;
72}
73
74/* Called from syscall */
75static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
76{
77 struct bpf_array *array = container_of(map, struct bpf_array, map);
78 u32 index = *(u32 *)key;
79 u32 *next = (u32 *)next_key;
80
81 if (index >= array->map.max_entries) {
82 *next = 0;
83 return 0;
84 }
85
86 if (index == array->map.max_entries - 1)
87 return -ENOENT;
88
89 *next = index + 1;
90 return 0;
91}
92
93/* Called from syscall or from eBPF program */
94static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
95 u64 map_flags)
96{
97 struct bpf_array *array = container_of(map, struct bpf_array, map);
98 u32 index = *(u32 *)key;
99
100 if (map_flags > BPF_EXIST)
101 /* unknown flags */
102 return -EINVAL;
103
104 if (index >= array->map.max_entries)
105 /* all elements were pre-allocated, cannot insert a new one */
106 return -E2BIG;
107
108 if (map_flags == BPF_NOEXIST)
109 /* all elements already exist */
110 return -EEXIST;
111
112 memcpy(array->value + array->elem_size * index, value, array->elem_size);
113 return 0;
114}
115
116/* Called from syscall or from eBPF program */
117static int array_map_delete_elem(struct bpf_map *map, void *key)
118{
119 return -EINVAL;
120}
121
122/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
123static void array_map_free(struct bpf_map *map)
124{
125 struct bpf_array *array = container_of(map, struct bpf_array, map);
126
127 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
128 * so the programs (can be more than one that used this map) were
129 * disconnected from events. Wait for outstanding programs to complete
130 * and free the array
131 */
132 synchronize_rcu();
133
134 kvfree(array);
135}
136
137static struct bpf_map_ops array_ops = {
138 .map_alloc = array_map_alloc,
139 .map_free = array_map_free,
140 .map_get_next_key = array_map_get_next_key,
141 .map_lookup_elem = array_map_lookup_elem,
142 .map_update_elem = array_map_update_elem,
143 .map_delete_elem = array_map_delete_elem,
144};
145
146static struct bpf_map_type_list tl = {
147 .ops = &array_ops,
148 .type = BPF_MAP_TYPE_ARRAY,
149};
150
151static int __init register_array_map(void)
152{
153 bpf_register_map_type(&tl);
154 return 0;
155}
156late_initcall(register_array_map);
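
The array map above is driven entirely through the bpf(2) syscall. Below is a minimal user-space sketch (not part of this patch) that creates a BPF_MAP_TYPE_ARRAY, updates one slot and reads it back; it assumes the union bpf_attr layout and BPF_* constants from the uapi header this series provides, and the raw-syscall wrapper plus the absence of error reporting are illustrative shortcuts only.

/* Illustrative only -- not part of the patch. Uses the uapi <linux/bpf.h>
 * definitions (union bpf_attr, BPF_MAP_CREATE, BPF_ANY, ...) and a raw
 * syscall wrapper, since no user-space library existed at the time.
 */
#include <linux/bpf.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
        union bpf_attr attr;
        uint32_t key = 3;
        uint64_t val = 42, out = 0;
        int map_fd;

        /* array maps require key_size == 4 (see array_map_alloc above) */
        memset(&attr, 0, sizeof(attr));
        attr.map_type = BPF_MAP_TYPE_ARRAY;
        attr.key_size = sizeof(key);
        attr.value_size = sizeof(val);
        attr.max_entries = 16;
        map_fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
        if (map_fd < 0)
                return 1;

        /* all array elements pre-exist, so BPF_NOEXIST would fail with -EEXIST */
        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.key = (uint64_t)(unsigned long)&key;
        attr.value = (uint64_t)(unsigned long)&val;
        attr.flags = BPF_ANY;
        if (bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
                return 1;

        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.key = (uint64_t)(unsigned long)&key;
        attr.value = (uint64_t)(unsigned long)&out;
        if (bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)))
                return 1;

        printf("value[%u] = %llu\n", key, (unsigned long long)out);
        close(map_fd);
        return 0;
}

The same syscall flow applies to the hash map added later in this series; only map_type (and the key size restriction) differs.
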
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d6594e457a25..a64e7a207d2b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,

 void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
-	module_free(NULL, hdr);
+	module_memfree(hdr);
 }
 #endif /* CONFIG_BPF_JIT */

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/jhash.h>
14#include <linux/filter.h>
15#include <linux/vmalloc.h>
16
17struct bpf_htab {
18 struct bpf_map map;
19 struct hlist_head *buckets;
20 spinlock_t lock;
21 u32 count; /* number of elements in this hashtable */
22 u32 n_buckets; /* number of hash buckets */
23 u32 elem_size; /* size of each element in bytes */
24};
25
26/* each htab element is struct htab_elem + key + value */
27struct htab_elem {
28 struct hlist_node hash_node;
29 struct rcu_head rcu;
30 u32 hash;
31 char key[0] __aligned(8);
32};
33
34/* Called from syscall */
35static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
36{
37 struct bpf_htab *htab;
38 int err, i;
39
40 htab = kzalloc(sizeof(*htab), GFP_USER);
41 if (!htab)
42 return ERR_PTR(-ENOMEM);
43
44 /* mandatory map attributes */
45 htab->map.key_size = attr->key_size;
46 htab->map.value_size = attr->value_size;
47 htab->map.max_entries = attr->max_entries;
48
49 /* check sanity of attributes.
50 * value_size == 0 may be allowed in the future to use map as a set
51 */
52 err = -EINVAL;
53 if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
54 htab->map.value_size == 0)
55 goto free_htab;
56
57 /* hash table size must be power of 2 */
58 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
59
60 err = -E2BIG;
61 if (htab->map.key_size > MAX_BPF_STACK)
62 /* eBPF programs initialize keys on stack, so they cannot be
63 * larger than max stack size
64 */
65 goto free_htab;
66
67 err = -ENOMEM;
68 /* prevent zero size kmalloc and check for u32 overflow */
69 if (htab->n_buckets == 0 ||
70 htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
71 goto free_htab;
72
73 htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
74 GFP_USER | __GFP_NOWARN);
75
76 if (!htab->buckets) {
77 htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
78 if (!htab->buckets)
79 goto free_htab;
80 }
81
82 for (i = 0; i < htab->n_buckets; i++)
83 INIT_HLIST_HEAD(&htab->buckets[i]);
84
85 spin_lock_init(&htab->lock);
86 htab->count = 0;
87
88 htab->elem_size = sizeof(struct htab_elem) +
89 round_up(htab->map.key_size, 8) +
90 htab->map.value_size;
91 return &htab->map;
92
93free_htab:
94 kfree(htab);
95 return ERR_PTR(err);
96}
97
98static inline u32 htab_map_hash(const void *key, u32 key_len)
99{
100 return jhash(key, key_len, 0);
101}
102
103static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
104{
105 return &htab->buckets[hash & (htab->n_buckets - 1)];
106}
107
108static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
109 void *key, u32 key_size)
110{
111 struct htab_elem *l;
112
113 hlist_for_each_entry_rcu(l, head, hash_node)
114 if (l->hash == hash && !memcmp(&l->key, key, key_size))
115 return l;
116
117 return NULL;
118}
119
120/* Called from syscall or from eBPF program */
121static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
122{
123 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
124 struct hlist_head *head;
125 struct htab_elem *l;
126 u32 hash, key_size;
127
128 /* Must be called with rcu_read_lock. */
129 WARN_ON_ONCE(!rcu_read_lock_held());
130
131 key_size = map->key_size;
132
133 hash = htab_map_hash(key, key_size);
134
135 head = select_bucket(htab, hash);
136
137 l = lookup_elem_raw(head, hash, key, key_size);
138
139 if (l)
140 return l->key + round_up(map->key_size, 8);
141
142 return NULL;
143}
144
145/* Called from syscall */
146static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
147{
148 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
149 struct hlist_head *head;
150 struct htab_elem *l, *next_l;
151 u32 hash, key_size;
152 int i;
153
154 WARN_ON_ONCE(!rcu_read_lock_held());
155
156 key_size = map->key_size;
157
158 hash = htab_map_hash(key, key_size);
159
160 head = select_bucket(htab, hash);
161
162 /* lookup the key */
163 l = lookup_elem_raw(head, hash, key, key_size);
164
165 if (!l) {
166 i = 0;
167 goto find_first_elem;
168 }
169
170 /* key was found, get next key in the same bucket */
171 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
172 struct htab_elem, hash_node);
173
174 if (next_l) {
175 /* if next elem in this hash list is non-zero, just return it */
176 memcpy(next_key, next_l->key, key_size);
177 return 0;
178 }
179
180 /* no more elements in this hash list, go to the next bucket */
181 i = hash & (htab->n_buckets - 1);
182 i++;
183
184find_first_elem:
185 /* iterate over buckets */
186 for (; i < htab->n_buckets; i++) {
187 head = select_bucket(htab, i);
188
189 /* pick first element in the bucket */
190 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
191 struct htab_elem, hash_node);
192 if (next_l) {
193 /* if it's not empty, just return it */
194 memcpy(next_key, next_l->key, key_size);
195 return 0;
196 }
197 }
198
199 /* itereated over all buckets and all elements */
200 return -ENOENT;
201}
202
203/* Called from syscall or from eBPF program */
204static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
205 u64 map_flags)
206{
207 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
208 struct htab_elem *l_new, *l_old;
209 struct hlist_head *head;
210 unsigned long flags;
211 u32 key_size;
212 int ret;
213
214 if (map_flags > BPF_EXIST)
215 /* unknown flags */
216 return -EINVAL;
217
218 WARN_ON_ONCE(!rcu_read_lock_held());
219
220 /* allocate new element outside of lock */
221 l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
222 if (!l_new)
223 return -ENOMEM;
224
225 key_size = map->key_size;
226
227 memcpy(l_new->key, key, key_size);
228 memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
229
230 l_new->hash = htab_map_hash(l_new->key, key_size);
231
232 /* bpf_map_update_elem() can be called in_irq() */
233 spin_lock_irqsave(&htab->lock, flags);
234
235 head = select_bucket(htab, l_new->hash);
236
237 l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
238
239 if (!l_old && unlikely(htab->count >= map->max_entries)) {
240 /* if elem with this 'key' doesn't exist and we've reached
241 * max_entries limit, fail insertion of new elem
242 */
243 ret = -E2BIG;
244 goto err;
245 }
246
247 if (l_old && map_flags == BPF_NOEXIST) {
248 /* elem already exists */
249 ret = -EEXIST;
250 goto err;
251 }
252
253 if (!l_old && map_flags == BPF_EXIST) {
254 /* elem doesn't exist, cannot update it */
255 ret = -ENOENT;
256 goto err;
257 }
258
259 /* add new element to the head of the list, so that concurrent
260 * search will find it before old elem
261 */
262 hlist_add_head_rcu(&l_new->hash_node, head);
263 if (l_old) {
264 hlist_del_rcu(&l_old->hash_node);
265 kfree_rcu(l_old, rcu);
266 } else {
267 htab->count++;
268 }
269 spin_unlock_irqrestore(&htab->lock, flags);
270
271 return 0;
272err:
273 spin_unlock_irqrestore(&htab->lock, flags);
274 kfree(l_new);
275 return ret;
276}
277
278/* Called from syscall or from eBPF program */
279static int htab_map_delete_elem(struct bpf_map *map, void *key)
280{
281 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
282 struct hlist_head *head;
283 struct htab_elem *l;
284 unsigned long flags;
285 u32 hash, key_size;
286 int ret = -ENOENT;
287
288 WARN_ON_ONCE(!rcu_read_lock_held());
289
290 key_size = map->key_size;
291
292 hash = htab_map_hash(key, key_size);
293
294 spin_lock_irqsave(&htab->lock, flags);
295
296 head = select_bucket(htab, hash);
297
298 l = lookup_elem_raw(head, hash, key, key_size);
299
300 if (l) {
301 hlist_del_rcu(&l->hash_node);
302 htab->count--;
303 kfree_rcu(l, rcu);
304 ret = 0;
305 }
306
307 spin_unlock_irqrestore(&htab->lock, flags);
308 return ret;
309}
310
311static void delete_all_elements(struct bpf_htab *htab)
312{
313 int i;
314
315 for (i = 0; i < htab->n_buckets; i++) {
316 struct hlist_head *head = select_bucket(htab, i);
317 struct hlist_node *n;
318 struct htab_elem *l;
319
320 hlist_for_each_entry_safe(l, n, head, hash_node) {
321 hlist_del_rcu(&l->hash_node);
322 htab->count--;
323 kfree(l);
324 }
325 }
326}
327
328/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
329static void htab_map_free(struct bpf_map *map)
330{
331 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
332
333 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
334 * so the programs (can be more than one that used this map) were
335 * disconnected from events. Wait for outstanding critical sections in
336 * these programs to complete
337 */
338 synchronize_rcu();
339
340 /* some of kfree_rcu() callbacks for elements of this map may not have
341 * executed. It's ok. Proceed to free residual elements and map itself
342 */
343 delete_all_elements(htab);
344 kvfree(htab->buckets);
345 kfree(htab);
346}
347
348static struct bpf_map_ops htab_ops = {
349 .map_alloc = htab_map_alloc,
350 .map_free = htab_map_free,
351 .map_get_next_key = htab_map_get_next_key,
352 .map_lookup_elem = htab_map_lookup_elem,
353 .map_update_elem = htab_map_update_elem,
354 .map_delete_elem = htab_map_delete_elem,
355};
356
357static struct bpf_map_type_list tl = {
358 .ops = &htab_ops,
359 .type = BPF_MAP_TYPE_HASH,
360};
361
362static int __init register_htab_map(void)
363{
364 bpf_register_map_type(&tl);
365 return 0;
366}
367late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/rcupdate.h>
14
15/* If kernel subsystem is allowing eBPF programs to call this function,
16 * inside its own verifier_ops->get_func_proto() callback it should return
17 * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
18 *
19 * Different map implementations will rely on rcu in map methods
20 * lookup/update/delete, therefore eBPF programs must run under rcu lock
21 * if program is allowed to access maps, so check rcu_read_lock_held in
22 * all three functions.
23 */
24static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
25{
26 /* verifier checked that R1 contains a valid pointer to bpf_map
27 * and R2 points to a program stack and map->key_size bytes were
28 * initialized
29 */
30 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
31 void *key = (void *) (unsigned long) r2;
32 void *value;
33
34 WARN_ON_ONCE(!rcu_read_lock_held());
35
36 value = map->ops->map_lookup_elem(map, key);
37
38 /* lookup() returns either pointer to element value or NULL
39 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
40 */
41 return (unsigned long) value;
42}
43
44struct bpf_func_proto bpf_map_lookup_elem_proto = {
45 .func = bpf_map_lookup_elem,
46 .gpl_only = false,
47 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
48 .arg1_type = ARG_CONST_MAP_PTR,
49 .arg2_type = ARG_PTR_TO_MAP_KEY,
50};
51
52static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
53{
54 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
55 void *key = (void *) (unsigned long) r2;
56 void *value = (void *) (unsigned long) r3;
57
58 WARN_ON_ONCE(!rcu_read_lock_held());
59
60 return map->ops->map_update_elem(map, key, value, r4);
61}
62
63struct bpf_func_proto bpf_map_update_elem_proto = {
64 .func = bpf_map_update_elem,
65 .gpl_only = false,
66 .ret_type = RET_INTEGER,
67 .arg1_type = ARG_CONST_MAP_PTR,
68 .arg2_type = ARG_PTR_TO_MAP_KEY,
69 .arg3_type = ARG_PTR_TO_MAP_VALUE,
70 .arg4_type = ARG_ANYTHING,
71};
72
73static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
74{
75 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
76 void *key = (void *) (unsigned long) r2;
77
78 WARN_ON_ONCE(!rcu_read_lock_held());
79
80 return map->ops->map_delete_elem(map, key);
81}
82
83struct bpf_func_proto bpf_map_delete_elem_proto = {
84 .func = bpf_map_delete_elem,
85 .gpl_only = false,
86 .ret_type = RET_INTEGER,
87 .arg1_type = ARG_CONST_MAP_PTR,
88 .arg2_type = ARG_PTR_TO_MAP_KEY,
89};
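
These three protos back the BPF_FUNC_map_* helper IDs that eBPF programs call. A rough program-side sketch in restricted C follows, using the function-pointer-from-helper-ID idiom of the era's in-tree samples; the section names, struct bpf_map_def layout, kernel-tree include path and loader wiring are assumptions of that convention, not anything defined by this patch.

/* Illustrative sketch, compiled with clang -target bpf inside a kernel
 * tree; everything below mirrors the samples/bpf conventions and is not
 * part of this patch.
 */
#include <uapi/linux/bpf.h>

static void *(*bpf_map_lookup_elem)(void *map, void *key) =
        (void *) BPF_FUNC_map_lookup_elem;
static int (*bpf_map_update_elem)(void *map, void *key, void *value,
                                  unsigned long long flags) =
        (void *) BPF_FUNC_map_update_elem;

struct bpf_map_def {
        unsigned int type;
        unsigned int key_size;
        unsigned int value_size;
        unsigned int max_entries;
};

/* the loader creates the map from this section and patches in the fd */
struct bpf_map_def counters __attribute__((section("maps"), used)) = {
        .type = BPF_MAP_TYPE_HASH,
        .key_size = sizeof(unsigned int),
        .value_size = sizeof(long),
        .max_entries = 1024,
};

__attribute__((section("socket1"), used))
int count_example(void *ctx)
{
        unsigned int key = 0;   /* e.g. a protocol number parsed from the packet */
        long one = 1, *value;

        /* attached programs run under rcu_read_lock(), as the comment above requires */
        value = bpf_map_lookup_elem(&counters, &key);
        if (value)
                __sync_fetch_and_add(value, 1);
        else
                bpf_map_update_elem(&counters, &key, &one, BPF_ANY);
        return 0;
}
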
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
169 if (copy_from_user(key, ukey, map->key_size) != 0) 169 if (copy_from_user(key, ukey, map->key_size) != 0)
170 goto free_key; 170 goto free_key;
171 171
172 err = -ESRCH; 172 err = -ENOENT;
173 rcu_read_lock(); 173 rcu_read_lock();
174 value = map->ops->map_lookup_elem(map, key); 174 value = map->ops->map_lookup_elem(map, key);
175 if (!value) 175 if (!value)
@@ -190,7 +190,7 @@ err_put:
190 return err; 190 return err;
191} 191}
192 192
193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value 193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
194 194
195static int map_update_elem(union bpf_attr *attr) 195static int map_update_elem(union bpf_attr *attr)
196{ 196{
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
231 * therefore all map accessors rely on this fact, so do the same here 231 * therefore all map accessors rely on this fact, so do the same here
232 */ 232 */
233 rcu_read_lock(); 233 rcu_read_lock();
234 err = map->ops->map_update_elem(map, key, value); 234 err = map->ops->map_update_elem(map, key, value, attr->flags);
235 rcu_read_unlock(); 235 rcu_read_unlock();
236 236
237free_value: 237free_value:
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
18 u64 arg2; 18 u64 arg2;
19}; 19};
20 20
21static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
22{
23 return 0;
24}
25
26static struct bpf_func_proto test_funcs[] = {
27 [BPF_FUNC_unspec] = {
28 .func = test_func,
29 .gpl_only = true,
30 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
31 .arg1_type = ARG_CONST_MAP_PTR,
32 .arg2_type = ARG_PTR_TO_MAP_KEY,
33 },
34};
35
36static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) 21static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
37{ 22{
38 if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) 23 switch (func_id) {
24 case BPF_FUNC_map_lookup_elem:
25 return &bpf_map_lookup_elem_proto;
26 case BPF_FUNC_map_update_elem:
27 return &bpf_map_update_elem_proto;
28 case BPF_FUNC_map_delete_elem:
29 return &bpf_map_delete_elem_proto;
30 default:
39 return NULL; 31 return NULL;
40 return &test_funcs[func_id]; 32 }
41} 33}
42 34
43static const struct bpf_context_access { 35static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
78 .type = BPF_PROG_TYPE_UNSPEC, 70 .type = BPF_PROG_TYPE_UNSPEC,
79}; 71};
80 72
81static struct bpf_map *test_map_alloc(union bpf_attr *attr)
82{
83 struct bpf_map *map;
84
85 map = kzalloc(sizeof(*map), GFP_USER);
86 if (!map)
87 return ERR_PTR(-ENOMEM);
88
89 map->key_size = attr->key_size;
90 map->value_size = attr->value_size;
91 map->max_entries = attr->max_entries;
92 return map;
93}
94
95static void test_map_free(struct bpf_map *map)
96{
97 kfree(map);
98}
99
100static struct bpf_map_ops test_map_ops = {
101 .map_alloc = test_map_alloc,
102 .map_free = test_map_free,
103};
104
105static struct bpf_map_type_list tl_map = {
106 .ops = &test_map_ops,
107 .type = BPF_MAP_TYPE_UNSPEC,
108};
109
110static int __init register_test_ops(void) 73static int __init register_test_ops(void)
111{ 74{
112 bpf_register_map_type(&tl_map);
113 bpf_register_prog_type(&tl_prog); 75 bpf_register_prog_type(&tl_prog);
114 return 0; 76 return 0;
115} 77}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9f81818f2941..a28e09c7825d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,22 +153,19 @@ struct reg_state {
153 153
154enum bpf_stack_slot_type { 154enum bpf_stack_slot_type {
155 STACK_INVALID, /* nothing was stored in this stack slot */ 155 STACK_INVALID, /* nothing was stored in this stack slot */
156 STACK_SPILL, /* 1st byte of register spilled into stack */ 156 STACK_SPILL, /* register spilled into stack */
157 STACK_SPILL_PART, /* other 7 bytes of register spill */
158 STACK_MISC /* BPF program wrote some data into this slot */ 157 STACK_MISC /* BPF program wrote some data into this slot */
159}; 158};
160 159
161struct bpf_stack_slot { 160#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
162 enum bpf_stack_slot_type stype;
163 struct reg_state reg_st;
164};
165 161
166/* state of the program: 162/* state of the program:
167 * type of all registers and stack info 163 * type of all registers and stack info
168 */ 164 */
169struct verifier_state { 165struct verifier_state {
170 struct reg_state regs[MAX_BPF_REG]; 166 struct reg_state regs[MAX_BPF_REG];
171 struct bpf_stack_slot stack[MAX_BPF_STACK]; 167 u8 stack_slot_type[MAX_BPF_STACK];
168 struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
172}; 169};
173 170
174/* linked list of verifier states used to prune search */ 171/* linked list of verifier states used to prune search */
@@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env)
259 env->cur_state.regs[i].map_ptr->key_size, 256 env->cur_state.regs[i].map_ptr->key_size,
260 env->cur_state.regs[i].map_ptr->value_size); 257 env->cur_state.regs[i].map_ptr->value_size);
261 } 258 }
262 for (i = 0; i < MAX_BPF_STACK; i++) { 259 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
263 if (env->cur_state.stack[i].stype == STACK_SPILL) 260 if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
264 verbose(" fp%d=%s", -MAX_BPF_STACK + i, 261 verbose(" fp%d=%s", -MAX_BPF_STACK + i,
265 reg_type_str[env->cur_state.stack[i].reg_st.type]); 262 reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
266 } 263 }
267 verbose("\n"); 264 verbose("\n");
268} 265}
@@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size)
539static int check_stack_write(struct verifier_state *state, int off, int size, 536static int check_stack_write(struct verifier_state *state, int off, int size,
540 int value_regno) 537 int value_regno)
541{ 538{
542 struct bpf_stack_slot *slot;
543 int i; 539 int i;
540 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
541 * so it's aligned access and [off, off + size) are within stack limits
542 */
544 543
545 if (value_regno >= 0 && 544 if (value_regno >= 0 &&
546 (state->regs[value_regno].type == PTR_TO_MAP_VALUE || 545 (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
@@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
548 state->regs[value_regno].type == PTR_TO_CTX)) { 547 state->regs[value_regno].type == PTR_TO_CTX)) {
549 548
550 /* register containing pointer is being spilled into stack */ 549 /* register containing pointer is being spilled into stack */
551 if (size != 8) { 550 if (size != BPF_REG_SIZE) {
552 verbose("invalid size of register spill\n"); 551 verbose("invalid size of register spill\n");
553 return -EACCES; 552 return -EACCES;
554 } 553 }
555 554
556 slot = &state->stack[MAX_BPF_STACK + off];
557 slot->stype = STACK_SPILL;
558 /* save register state */ 555 /* save register state */
559 slot->reg_st = state->regs[value_regno]; 556 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
560 for (i = 1; i < 8; i++) { 557 state->regs[value_regno];
561 slot = &state->stack[MAX_BPF_STACK + off + i];
562 slot->stype = STACK_SPILL_PART;
563 slot->reg_st.type = UNKNOWN_VALUE;
564 slot->reg_st.map_ptr = NULL;
565 }
566 } else {
567 558
559 for (i = 0; i < BPF_REG_SIZE; i++)
560 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
561 } else {
568 /* regular write of data into stack */ 562 /* regular write of data into stack */
569 for (i = 0; i < size; i++) { 563 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
570 slot = &state->stack[MAX_BPF_STACK + off + i]; 564 (struct reg_state) {};
571 slot->stype = STACK_MISC; 565
572 slot->reg_st.type = UNKNOWN_VALUE; 566 for (i = 0; i < size; i++)
573 slot->reg_st.map_ptr = NULL; 567 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
574 }
575 } 568 }
576 return 0; 569 return 0;
577} 570}
@@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
579static int check_stack_read(struct verifier_state *state, int off, int size, 572static int check_stack_read(struct verifier_state *state, int off, int size,
580 int value_regno) 573 int value_regno)
581{ 574{
575 u8 *slot_type;
582 int i; 576 int i;
583 struct bpf_stack_slot *slot;
584 577
585 slot = &state->stack[MAX_BPF_STACK + off]; 578 slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
586 579
587 if (slot->stype == STACK_SPILL) { 580 if (slot_type[0] == STACK_SPILL) {
588 if (size != 8) { 581 if (size != BPF_REG_SIZE) {
589 verbose("invalid size of register spill\n"); 582 verbose("invalid size of register spill\n");
590 return -EACCES; 583 return -EACCES;
591 } 584 }
592 for (i = 1; i < 8; i++) { 585 for (i = 1; i < BPF_REG_SIZE; i++) {
593 if (state->stack[MAX_BPF_STACK + off + i].stype != 586 if (slot_type[i] != STACK_SPILL) {
594 STACK_SPILL_PART) {
595 verbose("corrupted spill memory\n"); 587 verbose("corrupted spill memory\n");
596 return -EACCES; 588 return -EACCES;
597 } 589 }
@@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
599 591
600 if (value_regno >= 0) 592 if (value_regno >= 0)
601 /* restore register state from stack */ 593 /* restore register state from stack */
602 state->regs[value_regno] = slot->reg_st; 594 state->regs[value_regno] =
595 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
603 return 0; 596 return 0;
604 } else { 597 } else {
605 for (i = 0; i < size; i++) { 598 for (i = 0; i < size; i++) {
606 if (state->stack[MAX_BPF_STACK + off + i].stype != 599 if (slot_type[i] != STACK_MISC) {
607 STACK_MISC) {
608 verbose("invalid read from stack off %d+%d size %d\n", 600 verbose("invalid read from stack off %d+%d size %d\n",
609 off, i, size); 601 off, i, size);
610 return -EACCES; 602 return -EACCES;
@@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env,
747 } 739 }
748 740
749 for (i = 0; i < access_size; i++) { 741 for (i = 0; i < access_size; i++) {
750 if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { 742 if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
751 verbose("invalid indirect read from stack off %d+%d size %d\n", 743 verbose("invalid indirect read from stack off %d+%d size %d\n",
752 off, i, access_size); 744 off, i, access_size);
753 return -EACCES; 745 return -EACCES;
@@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
1180 return 0; 1172 return 0;
1181} 1173}
1182 1174
1175/* verify safety of LD_ABS|LD_IND instructions:
1176 * - they can only appear in the programs where ctx == skb
1177 * - since they are wrappers of function calls, they scratch R1-R5 registers,
1178 * preserve R6-R9, and store return value into R0
1179 *
1180 * Implicit input:
1181 * ctx == skb == R6 == CTX
1182 *
1183 * Explicit input:
1184 * SRC == any register
1185 * IMM == 32-bit immediate
1186 *
1187 * Output:
1188 * R0 - 8/16/32-bit skb data converted to cpu endianness
1189 */
1190static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
1191{
1192 struct reg_state *regs = env->cur_state.regs;
1193 u8 mode = BPF_MODE(insn->code);
1194 struct reg_state *reg;
1195 int i, err;
1196
1197 if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
1198 verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
1199 return -EINVAL;
1200 }
1201
1202 if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
1203 (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
1204 verbose("BPF_LD_ABS uses reserved fields\n");
1205 return -EINVAL;
1206 }
1207
1208 /* check whether implicit source operand (register R6) is readable */
1209 err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
1210 if (err)
1211 return err;
1212
1213 if (regs[BPF_REG_6].type != PTR_TO_CTX) {
1214 verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
1215 return -EINVAL;
1216 }
1217
1218 if (mode == BPF_IND) {
1219 /* check explicit source operand */
1220 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1221 if (err)
1222 return err;
1223 }
1224
1225 /* reset caller saved regs to unreadable */
1226 for (i = 0; i < CALLER_SAVED_REGS; i++) {
1227 reg = regs + caller_saved[i];
1228 reg->type = NOT_INIT;
1229 reg->imm = 0;
1230 }
1231
1232 /* mark destination R0 register as readable, since it contains
1233 * the value fetched from the packet
1234 */
1235 regs[BPF_REG_0].type = UNKNOWN_VALUE;
1236 return 0;
1237}
1238
1183/* non-recursive DFS pseudo code 1239/* non-recursive DFS pseudo code
1184 * 1 procedure DFS-iterative(G,v): 1240 * 1 procedure DFS-iterative(G,v):
1185 * 2 label v as discovered 1241 * 2 label v as discovered
@@ -1417,12 +1473,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
1417 } 1473 }
1418 1474
1419 for (i = 0; i < MAX_BPF_STACK; i++) { 1475 for (i = 0; i < MAX_BPF_STACK; i++) {
1420 if (memcmp(&old->stack[i], &cur->stack[i], 1476 if (old->stack_slot_type[i] == STACK_INVALID)
1421 sizeof(old->stack[0])) != 0) { 1477 continue;
1422 if (old->stack[i].stype == STACK_INVALID) 1478 if (old->stack_slot_type[i] != cur->stack_slot_type[i])
1423 continue; 1479 /* Ex: old explored (safe) state has STACK_SPILL in
1480 * this stack slot, but current has has STACK_MISC ->
1481 * this verifier states are not equivalent,
1482 * return false to continue verification of this path
1483 */
1424 return false; 1484 return false;
1425 } 1485 if (i % BPF_REG_SIZE)
1486 continue;
1487 if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
1488 &cur->spilled_regs[i / BPF_REG_SIZE],
1489 sizeof(old->spilled_regs[0])))
1490 /* when explored and current stack slot types are
1491 * the same, check that stored pointers types
1492 * are the same as well.
1493 * Ex: explored safe path could have stored
1494 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
1495 * but current path has stored:
1496 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
1497 * such verifier states are not equivalent.
1498 * return false to continue verification of this path
1499 */
1500 return false;
1501 else
1502 continue;
1426 } 1503 }
1427 return true; 1504 return true;
1428} 1505}
@@ -1664,8 +1741,10 @@ process_bpf_exit:
1664 u8 mode = BPF_MODE(insn->code); 1741 u8 mode = BPF_MODE(insn->code);
1665 1742
1666 if (mode == BPF_ABS || mode == BPF_IND) { 1743 if (mode == BPF_ABS || mode == BPF_IND) {
1667 verbose("LD_ABS is not supported yet\n"); 1744 err = check_ld_abs(env, insn);
1668 return -EINVAL; 1745 if (err)
1746 return err;
1747
1669 } else if (mode == BPF_IMM) { 1748 } else if (mode == BPF_IMM) {
1670 err = check_ld_imm(env, insn); 1749 err = check_ld_imm(env, insn);
1671 if (err) 1750 if (err)
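
To make the reworked spill tracking concrete, the fragment below (not from the patch) uses the BPF_* instruction macros from the kernel's <linux/filter.h>, in the style of lib/test_bpf, to spill a pointer register and fill it back. This is exactly the pattern the new stack_slot_type/spilled_regs arrays validate: the 8-byte store marks all eight slot bytes STACK_SPILL and records that the saved value was PTR_TO_CTX, while a store narrower than BPF_REG_SIZE would still be rejected with "invalid size of register spill".

#include <linux/filter.h>

/* hypothetical, verifier-accepted spill/fill sequence (kernel-side fragment) */
static const struct bpf_insn spill_fill_example[] = {
        BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),            /* R6 = ctx (PTR_TO_CTX) */
        BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), /* spill R6 to fp[-8], 8 bytes */
        BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8), /* fill: R1 regains PTR_TO_CTX */
        BPF_MOV64_IMM(BPF_REG_0, 0),
        BPF_EXIT_INSN(),
};
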
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 136eceadeed1..bb263d0caab3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
277 if (!(cgrp->root->subsys_mask & (1 << ss->id))) 277 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
278 return NULL; 278 return NULL;
279 279
280 /*
281 * This function is used while updating css associations and thus
282 * can't test the csses directly. Use ->child_subsys_mask.
283 */
280 while (cgroup_parent(cgrp) && 284 while (cgroup_parent(cgrp) &&
281 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) 285 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
282 cgrp = cgroup_parent(cgrp); 286 cgrp = cgroup_parent(cgrp);
@@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
284 return cgroup_css(cgrp, ss); 288 return cgroup_css(cgrp, ss);
285} 289}
286 290
291/**
292 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
293 * @cgrp: the cgroup of interest
294 * @ss: the subsystem of interest
295 *
296 * Find and get the effective css of @cgrp for @ss. The effective css is
297 * defined as the matching css of the nearest ancestor including self which
298 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
299 * the root css is returned, so this function always returns a valid css.
300 * The returned css must be put using css_put().
301 */
302struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
303 struct cgroup_subsys *ss)
304{
305 struct cgroup_subsys_state *css;
306
307 rcu_read_lock();
308
309 do {
310 css = cgroup_css(cgrp, ss);
311
312 if (css && css_tryget_online(css))
313 goto out_unlock;
314 cgrp = cgroup_parent(cgrp);
315 } while (cgrp);
316
317 css = init_css_set.subsys[ss->id];
318 css_get(css);
319out_unlock:
320 rcu_read_unlock();
321 return css;
322}
323
287/* convenient tests for these bits */ 324/* convenient tests for these bits */
288static inline bool cgroup_is_dead(const struct cgroup *cgrp) 325static inline bool cgroup_is_dead(const struct cgroup *cgrp)
289{ 326{
@@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp)
1019} 1056}
1020 1057
1021/** 1058/**
1022 * cgroup_refresh_child_subsys_mask - update child_subsys_mask 1059 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
1023 * @cgrp: the target cgroup 1060 * @cgrp: the target cgroup
1061 * @subtree_control: the new subtree_control mask to consider
1024 * 1062 *
1025 * On the default hierarchy, a subsystem may request other subsystems to be 1063 * On the default hierarchy, a subsystem may request other subsystems to be
1026 * enabled together through its ->depends_on mask. In such cases, more 1064 * enabled together through its ->depends_on mask. In such cases, more
1027 * subsystems than specified in "cgroup.subtree_control" may be enabled. 1065 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1028 * 1066 *
1029 * This function determines which subsystems need to be enabled given the 1067 * This function calculates which subsystems need to be enabled if
1030 * current @cgrp->subtree_control and records it in 1068 * @subtree_control is to be applied to @cgrp. The returned mask is always
1031 * @cgrp->child_subsys_mask. The resulting mask is always a superset of 1069 * a superset of @subtree_control and follows the usual hierarchy rules.
1032 * @cgrp->subtree_control and follows the usual hierarchy rules.
1033 */ 1070 */
1034static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) 1071static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
1072 unsigned int subtree_control)
1035{ 1073{
1036 struct cgroup *parent = cgroup_parent(cgrp); 1074 struct cgroup *parent = cgroup_parent(cgrp);
1037 unsigned int cur_ss_mask = cgrp->subtree_control; 1075 unsigned int cur_ss_mask = subtree_control;
1038 struct cgroup_subsys *ss; 1076 struct cgroup_subsys *ss;
1039 int ssid; 1077 int ssid;
1040 1078
1041 lockdep_assert_held(&cgroup_mutex); 1079 lockdep_assert_held(&cgroup_mutex);
1042 1080
1043 if (!cgroup_on_dfl(cgrp)) { 1081 if (!cgroup_on_dfl(cgrp))
1044 cgrp->child_subsys_mask = cur_ss_mask; 1082 return cur_ss_mask;
1045 return;
1046 }
1047 1083
1048 while (true) { 1084 while (true) {
1049 unsigned int new_ss_mask = cur_ss_mask; 1085 unsigned int new_ss_mask = cur_ss_mask;
@@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1067 cur_ss_mask = new_ss_mask; 1103 cur_ss_mask = new_ss_mask;
1068 } 1104 }
1069 1105
1070 cgrp->child_subsys_mask = cur_ss_mask; 1106 return cur_ss_mask;
1107}
1108
1109/**
1110 * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1111 * @cgrp: the target cgroup
1112 *
1113 * Update @cgrp->child_subsys_mask according to the current
1114 * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
1115 */
1116static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1117{
1118 cgrp->child_subsys_mask =
1119 cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
1071} 1120}
1072 1121
1073/** 1122/**
@@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2641 loff_t off) 2690 loff_t off)
2642{ 2691{
2643 unsigned int enable = 0, disable = 0; 2692 unsigned int enable = 0, disable = 0;
2644 unsigned int css_enable, css_disable, old_ctrl, new_ctrl; 2693 unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
2645 struct cgroup *cgrp, *child; 2694 struct cgroup *cgrp, *child;
2646 struct cgroup_subsys *ss; 2695 struct cgroup_subsys *ss;
2647 char *tok; 2696 char *tok;
@@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2693 ret = -ENOENT; 2742 ret = -ENOENT;
2694 goto out_unlock; 2743 goto out_unlock;
2695 } 2744 }
2696
2697 /*
2698 * @ss is already enabled through dependency and
2699 * we'll just make it visible. Skip draining.
2700 */
2701 if (cgrp->child_subsys_mask & (1 << ssid))
2702 continue;
2703
2704 /*
2705 * Because css offlining is asynchronous, userland
2706 * might try to re-enable the same controller while
2707 * the previous instance is still around. In such
2708 * cases, wait till it's gone using offline_waitq.
2709 */
2710 cgroup_for_each_live_child(child, cgrp) {
2711 DEFINE_WAIT(wait);
2712
2713 if (!cgroup_css(child, ss))
2714 continue;
2715
2716 cgroup_get(child);
2717 prepare_to_wait(&child->offline_waitq, &wait,
2718 TASK_UNINTERRUPTIBLE);
2719 cgroup_kn_unlock(of->kn);
2720 schedule();
2721 finish_wait(&child->offline_waitq, &wait);
2722 cgroup_put(child);
2723
2724 return restart_syscall();
2725 }
2726 } else if (disable & (1 << ssid)) { 2745 } else if (disable & (1 << ssid)) {
2727 if (!(cgrp->subtree_control & (1 << ssid))) { 2746 if (!(cgrp->subtree_control & (1 << ssid))) {
2728 disable &= ~(1 << ssid); 2747 disable &= ~(1 << ssid);
@@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2758 * subsystems than specified may need to be enabled or disabled 2777 * subsystems than specified may need to be enabled or disabled
2759 * depending on subsystem dependencies. 2778 * depending on subsystem dependencies.
2760 */ 2779 */
2761 cgrp->subtree_control |= enable; 2780 old_sc = cgrp->subtree_control;
2762 cgrp->subtree_control &= ~disable; 2781 old_ss = cgrp->child_subsys_mask;
2782 new_sc = (old_sc | enable) & ~disable;
2783 new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
2763 2784
2764 old_ctrl = cgrp->child_subsys_mask; 2785 css_enable = ~old_ss & new_ss;
2765 cgroup_refresh_child_subsys_mask(cgrp); 2786 css_disable = old_ss & ~new_ss;
2766 new_ctrl = cgrp->child_subsys_mask;
2767
2768 css_enable = ~old_ctrl & new_ctrl;
2769 css_disable = old_ctrl & ~new_ctrl;
2770 enable |= css_enable; 2787 enable |= css_enable;
2771 disable |= css_disable; 2788 disable |= css_disable;
2772 2789
2773 /* 2790 /*
2791 * Because css offlining is asynchronous, userland might try to
2792 * re-enable the same controller while the previous instance is
2793 * still around. In such cases, wait till it's gone using
2794 * offline_waitq.
2795 */
2796 for_each_subsys(ss, ssid) {
2797 if (!(css_enable & (1 << ssid)))
2798 continue;
2799
2800 cgroup_for_each_live_child(child, cgrp) {
2801 DEFINE_WAIT(wait);
2802
2803 if (!cgroup_css(child, ss))
2804 continue;
2805
2806 cgroup_get(child);
2807 prepare_to_wait(&child->offline_waitq, &wait,
2808 TASK_UNINTERRUPTIBLE);
2809 cgroup_kn_unlock(of->kn);
2810 schedule();
2811 finish_wait(&child->offline_waitq, &wait);
2812 cgroup_put(child);
2813
2814 return restart_syscall();
2815 }
2816 }
2817
2818 cgrp->subtree_control = new_sc;
2819 cgrp->child_subsys_mask = new_ss;
2820
2821 /*
2774 * Create new csses or make the existing ones visible. A css is 2822 * Create new csses or make the existing ones visible. A css is
2775 * created invisible if it's being implicitly enabled through 2823 * created invisible if it's being implicitly enabled through
2776 * dependency. An invisible css is made visible when the userland 2824 * dependency. An invisible css is made visible when the userland
@@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2825 } 2873 }
2826 } 2874 }
2827 2875
2876 /*
2877 * The effective csses of all the descendants (excluding @cgrp) may
2878 * have changed. Subsystems can optionally subscribe to this event
2879 * by implementing ->css_e_css_changed() which is invoked if any of
2880 * the effective csses seen from the css's cgroup may have changed.
2881 */
2882 for_each_subsys(ss, ssid) {
2883 struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
2884 struct cgroup_subsys_state *css;
2885
2886 if (!ss->css_e_css_changed || !this_css)
2887 continue;
2888
2889 css_for_each_descendant_pre(css, this_css)
2890 if (css != this_css)
2891 ss->css_e_css_changed(css);
2892 }
2893
2828 kernfs_activate(cgrp->kn); 2894 kernfs_activate(cgrp->kn);
2829 ret = 0; 2895 ret = 0;
2830out_unlock: 2896out_unlock:
@@ -2832,9 +2898,8 @@ out_unlock:
2832 return ret ?: nbytes; 2898 return ret ?: nbytes;
2833 2899
2834err_undo_css: 2900err_undo_css:
2835 cgrp->subtree_control &= ~enable; 2901 cgrp->subtree_control = old_sc;
2836 cgrp->subtree_control |= disable; 2902 cgrp->child_subsys_mask = old_ss;
2837 cgroup_refresh_child_subsys_mask(cgrp);
2838 2903
2839 for_each_subsys(ss, ssid) { 2904 for_each_subsys(ss, ssid) {
2840 if (!(enable & (1 << ssid))) 2905 if (!(enable & (1 << ssid)))
@@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work)
4370 if (ss) { 4435 if (ss) {
4371 /* css release path */ 4436 /* css release path */
4372 cgroup_idr_remove(&ss->css_idr, css->id); 4437 cgroup_idr_remove(&ss->css_idr, css->id);
4438 if (ss->css_released)
4439 ss->css_released(css);
4373 } else { 4440 } else {
4374 /* cgroup release path */ 4441 /* cgroup release path */
4375 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 4442 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
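
The kernel/cgroup.c change splits the mask computation out of cgroup_refresh_child_subsys_mask() into cgroup_calc_child_subsys_mask(), so cgroup_subtree_control_write() can compute the would-be child_subsys_mask for a proposed subtree_control before committing anything. The core of the helper is a fixpoint loop that keeps OR-ing in each enabled subsystem's ->depends_on mask until nothing new is added. Below is a minimal userspace sketch of that loop; the subsystem names and dependency table are invented for illustration, and the kernel version additionally clamps the result to what the parent cgroup can provide, which is omitted here.

#include <stdio.h>

/* Illustrative only: real dependency masks come from cgroup_subsys->depends_on
 * and real subsystem IDs from cgroup_subsys.h; these four are invented. */
enum { SS_CPU, SS_CPUACCT, SS_MEMORY, SS_HUGETLB, NR_SS };

static const unsigned int depends_on[NR_SS] = {
    [SS_CPU]    = 1u << SS_CPUACCT,   /* pretend "cpu" pulls in "cpuacct" */
    [SS_MEMORY] = 1u << SS_HUGETLB,   /* pretend "memory" pulls in "hugetlb" */
};

/* Mirrors the loop in cgroup_calc_child_subsys_mask(): keep OR-ing in the
 * dependencies of every enabled subsystem until the mask stops growing. */
static unsigned int calc_child_subsys_mask(unsigned int subtree_control)
{
    unsigned int cur = subtree_control;

    for (;;) {
        unsigned int next = cur;
        int ssid;

        for (ssid = 0; ssid < NR_SS; ssid++)
            if (cur & (1u << ssid))
                next |= depends_on[ssid];

        if (next == cur)
            return cur;               /* reached the fixpoint */
        cur = next;
    }
}

int main(void)
{
    unsigned int sc = 1u << SS_MEMORY;    /* user enabled only "memory" */

    printf("subtree_control=%#x -> child_subsys_mask=%#x\n",
           sc, calc_child_subsys_mask(sc));
    return 0;
}

Because each iteration can only add bits to a fixed-width mask, the loop terminates after at most one pass per subsystem, which is why the open-coded while (true) in the kernel is safe.
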
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 723cfc9d0ad7..64b257f6bca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
248 if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) 248 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
249 249
250/* 250/*
251 * There are two global mutexes guarding cpuset structures - cpuset_mutex 251 * There are two global locks guarding cpuset structures - cpuset_mutex and
252 * and callback_mutex. The latter may nest inside the former. We also 252 * callback_lock. We also require taking task_lock() when dereferencing a
253 * require taking task_lock() when dereferencing a task's cpuset pointer. 253 * task's cpuset pointer. See "The task_lock() exception", at the end of this
254 * See "The task_lock() exception", at the end of this comment. 254 * comment.
255 * 255 *
256 * A task must hold both mutexes to modify cpusets. If a task holds 256 * A task must hold both locks to modify cpusets. If a task holds
257 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it 257 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
258 * is the only task able to also acquire callback_mutex and be able to 258 * is the only task able to also acquire callback_lock and be able to
259 * modify cpusets. It can perform various checks on the cpuset structure 259 * modify cpusets. It can perform various checks on the cpuset structure
260 * first, knowing nothing will change. It can also allocate memory while 260 * first, knowing nothing will change. It can also allocate memory while
261 * just holding cpuset_mutex. While it is performing these checks, various 261 * just holding cpuset_mutex. While it is performing these checks, various
262 * callback routines can briefly acquire callback_mutex to query cpusets. 262 * callback routines can briefly acquire callback_lock to query cpusets.
263 * Once it is ready to make the changes, it takes callback_mutex, blocking 263 * Once it is ready to make the changes, it takes callback_lock, blocking
264 * everyone else. 264 * everyone else.
265 * 265 *
266 * Calls to the kernel memory allocator can not be made while holding 266 * Calls to the kernel memory allocator can not be made while holding
267 * callback_mutex, as that would risk double tripping on callback_mutex 267 * callback_lock, as that would risk double tripping on callback_lock
268 * from one of the callbacks into the cpuset code from within 268 * from one of the callbacks into the cpuset code from within
269 * __alloc_pages(). 269 * __alloc_pages().
270 * 270 *
271 * If a task is only holding callback_mutex, then it has read-only 271 * If a task is only holding callback_lock, then it has read-only
272 * access to cpusets. 272 * access to cpusets.
273 * 273 *
274 * Now, the task_struct fields mems_allowed and mempolicy may be changed 274 * Now, the task_struct fields mems_allowed and mempolicy may be changed
275 * by other task, we use alloc_lock in the task_struct fields to protect 275 * by other task, we use alloc_lock in the task_struct fields to protect
276 * them. 276 * them.
277 * 277 *
278 * The cpuset_common_file_read() handlers only hold callback_mutex across 278 * The cpuset_common_file_read() handlers only hold callback_lock across
279 * small pieces of code, such as when reading out possibly multi-word 279 * small pieces of code, such as when reading out possibly multi-word
280 * cpumasks and nodemasks. 280 * cpumasks and nodemasks.
281 * 281 *
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
284 */ 284 */
285 285
286static DEFINE_MUTEX(cpuset_mutex); 286static DEFINE_MUTEX(cpuset_mutex);
287static DEFINE_MUTEX(callback_mutex); 287static DEFINE_SPINLOCK(callback_lock);
288 288
289/* 289/*
290 * CPU / memory hotplug is handled asynchronously. 290 * CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
329 * One way or another, we guarantee to return some non-empty subset 329 * One way or another, we guarantee to return some non-empty subset
330 * of cpu_online_mask. 330 * of cpu_online_mask.
331 * 331 *
332 * Call with callback_mutex held. 332 * Call with callback_lock or cpuset_mutex held.
333 */ 333 */
334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
335{ 335{
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
347 * One way or another, we guarantee to return some non-empty subset 347 * One way or another, we guarantee to return some non-empty subset
348 * of node_states[N_MEMORY]. 348 * of node_states[N_MEMORY].
349 * 349 *
350 * Call with callback_mutex held. 350 * Call with callback_lock or cpuset_mutex held.
351 */ 351 */
352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
353{ 353{
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
359/* 359/*
360 * update task's spread flag if cpuset's page/slab spread flag is set 360 * update task's spread flag if cpuset's page/slab spread flag is set
361 * 361 *
362 * Called with callback_mutex/cpuset_mutex held 362 * Call with callback_lock or cpuset_mutex held.
363 */ 363 */
364static void cpuset_update_task_spread_flag(struct cpuset *cs, 364static void cpuset_update_task_spread_flag(struct cpuset *cs,
365 struct task_struct *tsk) 365 struct task_struct *tsk)
@@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
886 continue; 886 continue;
887 rcu_read_unlock(); 887 rcu_read_unlock();
888 888
889 mutex_lock(&callback_mutex); 889 spin_lock_irq(&callback_lock);
890 cpumask_copy(cp->effective_cpus, new_cpus); 890 cpumask_copy(cp->effective_cpus, new_cpus);
891 mutex_unlock(&callback_mutex); 891 spin_unlock_irq(&callback_lock);
892 892
893 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 893 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
894 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 894 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
953 if (retval < 0) 953 if (retval < 0)
954 return retval; 954 return retval;
955 955
956 mutex_lock(&callback_mutex); 956 spin_lock_irq(&callback_lock);
957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
958 mutex_unlock(&callback_mutex); 958 spin_unlock_irq(&callback_lock);
959 959
960 /* use trialcs->cpus_allowed as a temp variable */ 960 /* use trialcs->cpus_allowed as a temp variable */
961 update_cpumasks_hier(cs, trialcs->cpus_allowed); 961 update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1142 continue; 1142 continue;
1143 rcu_read_unlock(); 1143 rcu_read_unlock();
1144 1144
1145 mutex_lock(&callback_mutex); 1145 spin_lock_irq(&callback_lock);
1146 cp->effective_mems = *new_mems; 1146 cp->effective_mems = *new_mems;
1147 mutex_unlock(&callback_mutex); 1147 spin_unlock_irq(&callback_lock);
1148 1148
1149 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 1149 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1150 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 1150 !nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1165 * mempolicies and if the cpuset is marked 'memory_migrate', 1165 * mempolicies and if the cpuset is marked 'memory_migrate',
1166 * migrate the tasks pages to the new memory. 1166 * migrate the tasks pages to the new memory.
1167 * 1167 *
1168 * Call with cpuset_mutex held. May take callback_mutex during call. 1168 * Call with cpuset_mutex held. May take callback_lock during call.
1169 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1169 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1170 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1170 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1171 * their mempolicies to the cpusets new mems_allowed. 1171 * their mempolicies to the cpusets new mems_allowed.
@@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1212 if (retval < 0) 1212 if (retval < 0)
1213 goto done; 1213 goto done;
1214 1214
1215 mutex_lock(&callback_mutex); 1215 spin_lock_irq(&callback_lock);
1216 cs->mems_allowed = trialcs->mems_allowed; 1216 cs->mems_allowed = trialcs->mems_allowed;
1217 mutex_unlock(&callback_mutex); 1217 spin_unlock_irq(&callback_lock);
1218 1218
1219 /* use trialcs->mems_allowed as a temp variable */ 1219 /* use trialcs->mems_allowed as a temp variable */
1220 update_nodemasks_hier(cs, &cs->mems_allowed); 1220 update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1305 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) 1305 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1306 || (is_spread_page(cs) != is_spread_page(trialcs))); 1306 || (is_spread_page(cs) != is_spread_page(trialcs)));
1307 1307
1308 mutex_lock(&callback_mutex); 1308 spin_lock_irq(&callback_lock);
1309 cs->flags = trialcs->flags; 1309 cs->flags = trialcs->flags;
1310 mutex_unlock(&callback_mutex); 1310 spin_unlock_irq(&callback_lock);
1311 1311
1312 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1312 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1313 rebuild_sched_domains_locked(); 1313 rebuild_sched_domains_locked();
@@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1714 count = seq_get_buf(sf, &buf); 1714 count = seq_get_buf(sf, &buf);
1715 s = buf; 1715 s = buf;
1716 1716
1717 mutex_lock(&callback_mutex); 1717 spin_lock_irq(&callback_lock);
1718 1718
1719 switch (type) { 1719 switch (type) {
1720 case FILE_CPULIST: 1720 case FILE_CPULIST:
@@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1741 seq_commit(sf, -1); 1741 seq_commit(sf, -1);
1742 } 1742 }
1743out_unlock: 1743out_unlock:
1744 mutex_unlock(&callback_mutex); 1744 spin_unlock_irq(&callback_lock);
1745 return ret; 1745 return ret;
1746} 1746}
1747 1747
@@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1958 1958
1959 cpuset_inc(); 1959 cpuset_inc();
1960 1960
1961 mutex_lock(&callback_mutex); 1961 spin_lock_irq(&callback_lock);
1962 if (cgroup_on_dfl(cs->css.cgroup)) { 1962 if (cgroup_on_dfl(cs->css.cgroup)) {
1963 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1963 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1964 cs->effective_mems = parent->effective_mems; 1964 cs->effective_mems = parent->effective_mems;
1965 } 1965 }
1966 mutex_unlock(&callback_mutex); 1966 spin_unlock_irq(&callback_lock);
1967 1967
1968 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1968 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1969 goto out_unlock; 1969 goto out_unlock;
@@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1990 } 1990 }
1991 rcu_read_unlock(); 1991 rcu_read_unlock();
1992 1992
1993 mutex_lock(&callback_mutex); 1993 spin_lock_irq(&callback_lock);
1994 cs->mems_allowed = parent->mems_allowed; 1994 cs->mems_allowed = parent->mems_allowed;
1995 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 1995 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1996 mutex_unlock(&callback_mutex); 1996 spin_unlock_irq(&callback_lock);
1997out_unlock: 1997out_unlock:
1998 mutex_unlock(&cpuset_mutex); 1998 mutex_unlock(&cpuset_mutex);
1999 return 0; 1999 return 0;
@@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2032static void cpuset_bind(struct cgroup_subsys_state *root_css) 2032static void cpuset_bind(struct cgroup_subsys_state *root_css)
2033{ 2033{
2034 mutex_lock(&cpuset_mutex); 2034 mutex_lock(&cpuset_mutex);
2035 mutex_lock(&callback_mutex); 2035 spin_lock_irq(&callback_lock);
2036 2036
2037 if (cgroup_on_dfl(root_css->cgroup)) { 2037 if (cgroup_on_dfl(root_css->cgroup)) {
2038 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 2038 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2043 top_cpuset.mems_allowed = top_cpuset.effective_mems; 2043 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2044 } 2044 }
2045 2045
2046 mutex_unlock(&callback_mutex); 2046 spin_unlock_irq(&callback_lock);
2047 mutex_unlock(&cpuset_mutex); 2047 mutex_unlock(&cpuset_mutex);
2048} 2048}
2049 2049
@@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
2128{ 2128{
2129 bool is_empty; 2129 bool is_empty;
2130 2130
2131 mutex_lock(&callback_mutex); 2131 spin_lock_irq(&callback_lock);
2132 cpumask_copy(cs->cpus_allowed, new_cpus); 2132 cpumask_copy(cs->cpus_allowed, new_cpus);
2133 cpumask_copy(cs->effective_cpus, new_cpus); 2133 cpumask_copy(cs->effective_cpus, new_cpus);
2134 cs->mems_allowed = *new_mems; 2134 cs->mems_allowed = *new_mems;
2135 cs->effective_mems = *new_mems; 2135 cs->effective_mems = *new_mems;
2136 mutex_unlock(&callback_mutex); 2136 spin_unlock_irq(&callback_lock);
2137 2137
2138 /* 2138 /*
2139 * Don't call update_tasks_cpumask() if the cpuset becomes empty, 2139 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
2170 if (nodes_empty(*new_mems)) 2170 if (nodes_empty(*new_mems))
2171 *new_mems = parent_cs(cs)->effective_mems; 2171 *new_mems = parent_cs(cs)->effective_mems;
2172 2172
2173 mutex_lock(&callback_mutex); 2173 spin_lock_irq(&callback_lock);
2174 cpumask_copy(cs->effective_cpus, new_cpus); 2174 cpumask_copy(cs->effective_cpus, new_cpus);
2175 cs->effective_mems = *new_mems; 2175 cs->effective_mems = *new_mems;
2176 mutex_unlock(&callback_mutex); 2176 spin_unlock_irq(&callback_lock);
2177 2177
2178 if (cpus_updated) 2178 if (cpus_updated)
2179 update_tasks_cpumask(cs); 2179 update_tasks_cpumask(cs);
@@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2259 2259
2260 /* synchronize cpus_allowed to cpu_active_mask */ 2260 /* synchronize cpus_allowed to cpu_active_mask */
2261 if (cpus_updated) { 2261 if (cpus_updated) {
2262 mutex_lock(&callback_mutex); 2262 spin_lock_irq(&callback_lock);
2263 if (!on_dfl) 2263 if (!on_dfl)
2264 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 2264 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2265 cpumask_copy(top_cpuset.effective_cpus, &new_cpus); 2265 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2266 mutex_unlock(&callback_mutex); 2266 spin_unlock_irq(&callback_lock);
2267 /* we don't mess with cpumasks of tasks in top_cpuset */ 2267 /* we don't mess with cpumasks of tasks in top_cpuset */
2268 } 2268 }
2269 2269
2270 /* synchronize mems_allowed to N_MEMORY */ 2270 /* synchronize mems_allowed to N_MEMORY */
2271 if (mems_updated) { 2271 if (mems_updated) {
2272 mutex_lock(&callback_mutex); 2272 spin_lock_irq(&callback_lock);
2273 if (!on_dfl) 2273 if (!on_dfl)
2274 top_cpuset.mems_allowed = new_mems; 2274 top_cpuset.mems_allowed = new_mems;
2275 top_cpuset.effective_mems = new_mems; 2275 top_cpuset.effective_mems = new_mems;
2276 mutex_unlock(&callback_mutex); 2276 spin_unlock_irq(&callback_lock);
2277 update_tasks_nodemask(&top_cpuset); 2277 update_tasks_nodemask(&top_cpuset);
2278 } 2278 }
2279 2279
@@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void)
2366 2366
2367void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2367void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2368{ 2368{
2369 mutex_lock(&callback_mutex); 2369 unsigned long flags;
2370
2371 spin_lock_irqsave(&callback_lock, flags);
2370 rcu_read_lock(); 2372 rcu_read_lock();
2371 guarantee_online_cpus(task_cs(tsk), pmask); 2373 guarantee_online_cpus(task_cs(tsk), pmask);
2372 rcu_read_unlock(); 2374 rcu_read_unlock();
2373 mutex_unlock(&callback_mutex); 2375 spin_unlock_irqrestore(&callback_lock, flags);
2374} 2376}
2375 2377
2376void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2378void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
2416nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2418nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2417{ 2419{
2418 nodemask_t mask; 2420 nodemask_t mask;
2421 unsigned long flags;
2419 2422
2420 mutex_lock(&callback_mutex); 2423 spin_lock_irqsave(&callback_lock, flags);
2421 rcu_read_lock(); 2424 rcu_read_lock();
2422 guarantee_online_mems(task_cs(tsk), &mask); 2425 guarantee_online_mems(task_cs(tsk), &mask);
2423 rcu_read_unlock(); 2426 rcu_read_unlock();
2424 mutex_unlock(&callback_mutex); 2427 spin_unlock_irqrestore(&callback_lock, flags);
2425 2428
2426 return mask; 2429 return mask;
2427} 2430}
@@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2440/* 2443/*
2441 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or 2444 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
2442 * mem_hardwall ancestor to the specified cpuset. Call holding 2445 * mem_hardwall ancestor to the specified cpuset. Call holding
2443 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2446 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
2444 * (an unusual configuration), then returns the root cpuset. 2447 * (an unusual configuration), then returns the root cpuset.
2445 */ 2448 */
2446static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) 2449static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2451} 2454}
2452 2455
2453/** 2456/**
2454 * cpuset_node_allowed_softwall - Can we allocate on a memory node? 2457 * cpuset_node_allowed - Can we allocate on a memory node?
2455 * @node: is this an allowed node? 2458 * @node: is this an allowed node?
2456 * @gfp_mask: memory allocation flags 2459 * @gfp_mask: memory allocation flags
2457 * 2460 *
@@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2463 * flag, yes. 2466 * flag, yes.
2464 * Otherwise, no. 2467 * Otherwise, no.
2465 * 2468 *
2466 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
2467 * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
2468 * might sleep, and might allow a node from an enclosing cpuset.
2469 *
2470 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
2471 * cpusets, and never sleeps.
2472 *
2473 * The __GFP_THISNODE placement logic is really handled elsewhere, 2469 * The __GFP_THISNODE placement logic is really handled elsewhere,
2474 * by forcibly using a zonelist starting at a specified node, and by 2470 * by forcibly using a zonelist starting at a specified node, and by
2475 * (in get_page_from_freelist()) refusing to consider the zones for 2471 * (in get_page_from_freelist()) refusing to consider the zones for
@@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2482 * GFP_KERNEL allocations are not so marked, so can escape to the 2478 * GFP_KERNEL allocations are not so marked, so can escape to the
2483 * nearest enclosing hardwalled ancestor cpuset. 2479 * nearest enclosing hardwalled ancestor cpuset.
2484 * 2480 *
2485 * Scanning up parent cpusets requires callback_mutex. The 2481 * Scanning up parent cpusets requires callback_lock. The
2486 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2482 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2487 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the 2483 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2488 * current tasks mems_allowed came up empty on the first pass over 2484 * current tasks mems_allowed came up empty on the first pass over
2489 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the 2485 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
2490 * cpuset are short of memory, might require taking the callback_mutex 2486 * cpuset are short of memory, might require taking the callback_lock.
2491 * mutex.
2492 * 2487 *
2493 * The first call here from mm/page_alloc:get_page_from_freelist() 2488 * The first call here from mm/page_alloc:get_page_from_freelist()
2494 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, 2489 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2505 * TIF_MEMDIE - any node ok 2500 * TIF_MEMDIE - any node ok
2506 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 2501 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2507 * GFP_USER - only nodes in current tasks mems allowed ok. 2502 * GFP_USER - only nodes in current tasks mems allowed ok.
2508 *
2509 * Rule:
2510 * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
2511 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2512 * the code that might scan up ancestor cpusets and sleep.
2513 */ 2503 */
2514int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2504int __cpuset_node_allowed(int node, gfp_t gfp_mask)
2515{ 2505{
2516 struct cpuset *cs; /* current cpuset ancestors */ 2506 struct cpuset *cs; /* current cpuset ancestors */
2517 int allowed; /* is allocation in zone z allowed? */ 2507 int allowed; /* is allocation in zone z allowed? */
2508 unsigned long flags;
2518 2509
2519 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2510 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2520 return 1; 2511 return 1;
2521 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2522 if (node_isset(node, current->mems_allowed)) 2512 if (node_isset(node, current->mems_allowed))
2523 return 1; 2513 return 1;
2524 /* 2514 /*
@@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2534 return 1; 2524 return 1;
2535 2525
2536 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2526 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2537 mutex_lock(&callback_mutex); 2527 spin_lock_irqsave(&callback_lock, flags);
2538 2528
2539 rcu_read_lock(); 2529 rcu_read_lock();
2540 cs = nearest_hardwall_ancestor(task_cs(current)); 2530 cs = nearest_hardwall_ancestor(task_cs(current));
2541 allowed = node_isset(node, cs->mems_allowed); 2531 allowed = node_isset(node, cs->mems_allowed);
2542 rcu_read_unlock(); 2532 rcu_read_unlock();
2543 2533
2544 mutex_unlock(&callback_mutex); 2534 spin_unlock_irqrestore(&callback_lock, flags);
2545 return allowed; 2535 return allowed;
2546} 2536}
2547 2537
2548/*
2549 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
2550 * @node: is this an allowed node?
2551 * @gfp_mask: memory allocation flags
2552 *
2553 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2554 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2555 * yes. If the task has been OOM killed and has access to memory reserves as
2556 * specified by the TIF_MEMDIE flag, yes.
2557 * Otherwise, no.
2558 *
2559 * The __GFP_THISNODE placement logic is really handled elsewhere,
2560 * by forcibly using a zonelist starting at a specified node, and by
2561 * (in get_page_from_freelist()) refusing to consider the zones for
2562 * any node on the zonelist except the first. By the time any such
2563 * calls get to this routine, we should just shut up and say 'yes'.
2564 *
2565 * Unlike the cpuset_node_allowed_softwall() variant, above,
2566 * this variant requires that the node be in the current task's
2567 * mems_allowed or that we're in interrupt. It does not scan up the
2568 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2569 * It never sleeps.
2570 */
2571int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2572{
2573 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2574 return 1;
2575 if (node_isset(node, current->mems_allowed))
2576 return 1;
2577 /*
2578 * Allow tasks that have access to memory reserves because they have
2579 * been OOM killed to get memory anywhere.
2580 */
2581 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2582 return 1;
2583 return 0;
2584}
2585
2586/** 2538/**
2587 * cpuset_mem_spread_node() - On which node to begin search for a file page 2539 * cpuset_mem_spread_node() - On which node to begin search for a file page
2588 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2540 * cpuset_slab_spread_node() - On which node to begin search for a slab page
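
The kernel/cpuset.c hunks convert callback_mutex into callback_lock, a spinlock taken with interrupts disabled, so the short read-side queries (cpuset_cpus_allowed(), cpuset_mems_allowed(), __cpuset_node_allowed()) can run from contexts that must not sleep. The division of labour is unchanged: writers serialize on cpuset_mutex, where they may sleep and allocate, and take callback_lock only around the brief copy that publishes the result; readers take callback_lock alone. The following is a userspace analogue of that two-lock discipline using pthreads purely for illustration; the irqsave aspect has no userspace equivalent and the names merely mirror the kernel ones.

#include <pthread.h>
#include <stdio.h>

/* Userspace stand-ins: "cpuset_mutex" is the heavy lock held across a whole
 * update (sleeping and allocation allowed there); "callback_lock" is the
 * cheap lock held only around the copies readers must see atomically. */
static pthread_mutex_t cpuset_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_spinlock_t callback_lock;

static unsigned long effective_cpus;    /* models cs->effective_cpus */

/* Writer: validate under cpuset_mutex, publish under callback_lock. */
static void update_cpumask(unsigned long new_cpus)
{
    pthread_mutex_lock(&cpuset_mutex);
    /* ... validation and allocation would happen here and may block ... */
    pthread_spin_lock(&callback_lock);
    effective_cpus = new_cpus;          /* short, non-blocking update */
    pthread_spin_unlock(&callback_lock);
    pthread_mutex_unlock(&cpuset_mutex);
}

/* Reader: only needs the spinlock, mirroring cpuset_cpus_allowed(). */
static unsigned long read_effective_cpus(void)
{
    unsigned long val;

    pthread_spin_lock(&callback_lock);
    val = effective_cpus;
    pthread_spin_unlock(&callback_lock);
    return val;
}

int main(void)
{
    pthread_spin_init(&callback_lock, PTHREAD_PROCESS_PRIVATE);
    update_cpumask(0xf);
    printf("effective_cpus=%#lx\n", read_effective_cpus());
    pthread_spin_destroy(&callback_lock);
    return 0;
}

Build with cc -pthread. Keeping the spinlock's critical sections down to plain copies is what makes the mutex-to-spinlock conversion safe in the first place.
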
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b39b96..07ce18ca71e0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,9 @@
27 * version 2. This program is licensed "as is" without any warranty of any 27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied. 28 * kind, whether express or implied.
29 */ 29 */
30
31#define pr_fmt(fmt) "KGDB: " fmt
32
30#include <linux/pid_namespace.h> 33#include <linux/pid_namespace.h>
31#include <linux/clocksource.h> 34#include <linux/clocksource.h>
32#include <linux/serial_core.h> 35#include <linux/serial_core.h>
@@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr)
196 return err; 199 return err;
197 err = kgdb_arch_remove_breakpoint(&tmp); 200 err = kgdb_arch_remove_breakpoint(&tmp);
198 if (err) 201 if (err)
199 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " 202 pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
200 "memory destroyed at: %lx", addr); 203 addr);
201 return err; 204 return err;
202} 205}
203 206
@@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void)
256 error = kgdb_arch_set_breakpoint(&kgdb_break[i]); 259 error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
257 if (error) { 260 if (error) {
258 ret = error; 261 ret = error;
259 printk(KERN_INFO "KGDB: BP install failed: %lx", 262 pr_info("BP install failed: %lx\n",
260 kgdb_break[i].bpt_addr); 263 kgdb_break[i].bpt_addr);
261 continue; 264 continue;
262 } 265 }
263 266
@@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void)
319 continue; 322 continue;
320 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 323 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
321 if (error) { 324 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", 325 pr_info("BP remove failed: %lx\n",
323 kgdb_break[i].bpt_addr); 326 kgdb_break[i].bpt_addr);
324 ret = error; 327 ret = error;
325 } 328 }
326 329
@@ -367,7 +370,7 @@ int dbg_remove_all_break(void)
367 goto setundefined; 370 goto setundefined;
368 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 371 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
369 if (error) 372 if (error)
370 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", 373 pr_err("breakpoint remove failed: %lx\n",
371 kgdb_break[i].bpt_addr); 374 kgdb_break[i].bpt_addr);
372setundefined: 375setundefined:
373 kgdb_break[i].state = BP_UNDEFINED; 376 kgdb_break[i].state = BP_UNDEFINED;
@@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait)
400 if (print_wait) { 403 if (print_wait) {
401#ifdef CONFIG_KGDB_KDB 404#ifdef CONFIG_KGDB_KDB
402 if (!dbg_kdb_mode) 405 if (!dbg_kdb_mode)
403 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); 406 pr_crit("waiting... or $3#33 for KDB\n");
404#else 407#else
405 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); 408 pr_crit("Waiting for remote debugger\n");
406#endif 409#endif
407 } 410 }
408 return 1; 411 return 1;
@@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
430 exception_level = 0; 433 exception_level = 0;
431 kgdb_skipexception(ks->ex_vector, ks->linux_regs); 434 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
432 dbg_activate_sw_breakpoints(); 435 dbg_activate_sw_breakpoints();
433 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", 436 pr_crit("re-enter error: breakpoint removed %lx\n", addr);
434 addr);
435 WARN_ON_ONCE(1); 437 WARN_ON_ONCE(1);
436 438
437 return 1; 439 return 1;
@@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
444 panic("Recursive entry to debugger"); 446 panic("Recursive entry to debugger");
445 } 447 }
446 448
447 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); 449 pr_crit("re-enter exception: ALL breakpoints killed\n");
448#ifdef CONFIG_KGDB_KDB 450#ifdef CONFIG_KGDB_KDB
449 /* Allow kdb to debug itself one level */ 451 /* Allow kdb to debug itself one level */
450 return 0; 452 return 0;
@@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
471 int cpu; 473 int cpu;
472 int trace_on = 0; 474 int trace_on = 0;
473 int online_cpus = num_online_cpus(); 475 int online_cpus = num_online_cpus();
476 u64 time_left;
474 477
475 kgdb_info[ks->cpu].enter_kgdb++; 478 kgdb_info[ks->cpu].enter_kgdb++;
476 kgdb_info[ks->cpu].exception_state |= exception_state; 479 kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +598,13 @@ return_normal:
595 /* 598 /*
596 * Wait for the other CPUs to be notified and be waiting for us: 599 * Wait for the other CPUs to be notified and be waiting for us:
597 */ 600 */
598 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + 601 time_left = loops_per_jiffy * HZ;
599 atomic_read(&slaves_in_kgdb)) != online_cpus) 602 while (kgdb_do_roundup && --time_left &&
603 (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
604 online_cpus)
600 cpu_relax(); 605 cpu_relax();
606 if (!time_left)
607 pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
601 608
602 /* 609 /*
603 * At this point the primary processor is completely 610 * At this point the primary processor is completely
@@ -795,15 +802,15 @@ static struct console kgdbcons = {
795static void sysrq_handle_dbg(int key) 802static void sysrq_handle_dbg(int key)
796{ 803{
797 if (!dbg_io_ops) { 804 if (!dbg_io_ops) {
798 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); 805 pr_crit("ERROR: No KGDB I/O module available\n");
799 return; 806 return;
800 } 807 }
801 if (!kgdb_connected) { 808 if (!kgdb_connected) {
802#ifdef CONFIG_KGDB_KDB 809#ifdef CONFIG_KGDB_KDB
803 if (!dbg_kdb_mode) 810 if (!dbg_kdb_mode)
804 printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); 811 pr_crit("KGDB or $3#33 for KDB\n");
805#else 812#else
806 printk(KERN_CRIT "Entering KGDB\n"); 813 pr_crit("Entering KGDB\n");
807#endif 814#endif
808 } 815 }
809 816
@@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void)
945{ 952{
946 kgdb_break_asap = 0; 953 kgdb_break_asap = 0;
947 954
948 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); 955 pr_crit("Waiting for connection from remote gdb...\n");
949 kgdb_breakpoint(); 956 kgdb_breakpoint();
950} 957}
951 958
@@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
964 if (dbg_io_ops) { 971 if (dbg_io_ops) {
965 spin_unlock(&kgdb_registration_lock); 972 spin_unlock(&kgdb_registration_lock);
966 973
967 printk(KERN_ERR "kgdb: Another I/O driver is already " 974 pr_err("Another I/O driver is already registered with KGDB\n");
968 "registered with KGDB.\n");
969 return -EBUSY; 975 return -EBUSY;
970 } 976 }
971 977
@@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
981 987
982 spin_unlock(&kgdb_registration_lock); 988 spin_unlock(&kgdb_registration_lock);
983 989
984 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", 990 pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
985 new_dbg_io_ops->name);
986 991
987 /* Arm KGDB now. */ 992 /* Arm KGDB now. */
988 kgdb_register_callbacks(); 993 kgdb_register_callbacks();
@@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
1017 1022
1018 spin_unlock(&kgdb_registration_lock); 1023 spin_unlock(&kgdb_registration_lock);
1019 1024
1020 printk(KERN_INFO 1025 pr_info("Unregistered I/O driver %s, debugger disabled\n",
1021 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1022 old_dbg_io_ops->name); 1026 old_dbg_io_ops->name);
1023} 1027}
1024EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); 1028EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
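
Most of the kernel/debug/debug_core.c hunks replace open-coded printk(KERN_ERR "KGDB: ...") calls with pr_err()/pr_info()/pr_crit() after defining pr_fmt, so the "KGDB: " prefix is attached once by the preprocessor instead of being repeated in every format string. A stripped-down userspace model of how that macro composition works is below; the pr_err()/pr_info() stand-ins are simplified, whereas in the kernel they come from <linux/printk.h>, which supplies a no-op default pr_fmt (expanding to fmt) when a file does not define its own.

#include <stdio.h>

/* One prefix definition per file, following the kernel convention. */
#define pr_fmt(fmt) "KGDB: " fmt

/* Simplified stand-ins for the kernel's pr_err()/pr_info(); the real ones
 * forward to printk() with a KERN_* level string prepended. */
#define pr_err(fmt, ...)  fprintf(stderr, "<3> " pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info(fmt, ...) fprintf(stdout, "<6> " pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
    unsigned long addr = 0xc0ffee;

    /* Prints "<3> KGDB: BP install failed: c0ffee" even though the prefix
     * never appears in the format string at the call site. */
    pr_err("BP install failed: %lx\n", addr);
    pr_info("Registered I/O driver %s\n", "kgdboc");
    return 0;
}

The ##__VA_ARGS__ form is the same GNU extension the kernel relies on, so build this with gcc or clang.
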
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index b20d544f20c2..e1dbf4a2c69e 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -531,22 +531,29 @@ void __init kdb_initbptab(void)
531 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) 531 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
532 bp->bp_free = 1; 532 bp->bp_free = 1;
533 533
534 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", 534 kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
535 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); 535 "Set/Display breakpoints", 0,
536 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", 536 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
537 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); 537 kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
538 "Display breakpoints", 0,
539 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
538 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) 540 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
539 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", 541 kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
540 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); 542 "[datar [length]|dataw [length]] Set hw brk", 0,
541 kdb_register_repeat("bc", kdb_bc, "<bpnum>", 543 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
542 "Clear Breakpoint", 0, KDB_REPEAT_NONE); 544 kdb_register_flags("bc", kdb_bc, "<bpnum>",
543 kdb_register_repeat("be", kdb_bc, "<bpnum>", 545 "Clear Breakpoint", 0,
544 "Enable Breakpoint", 0, KDB_REPEAT_NONE); 546 KDB_ENABLE_FLOW_CTRL);
545 kdb_register_repeat("bd", kdb_bc, "<bpnum>", 547 kdb_register_flags("be", kdb_bc, "<bpnum>",
546 "Disable Breakpoint", 0, KDB_REPEAT_NONE); 548 "Enable Breakpoint", 0,
547 549 KDB_ENABLE_FLOW_CTRL);
548 kdb_register_repeat("ss", kdb_ss, "", 550 kdb_register_flags("bd", kdb_bc, "<bpnum>",
549 "Single Step", 1, KDB_REPEAT_NO_ARGS); 551 "Disable Breakpoint", 0,
552 KDB_ENABLE_FLOW_CTRL);
553
554 kdb_register_flags("ss", kdb_ss, "",
555 "Single Step", 1,
556 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
550 /* 557 /*
551 * Architecture dependent initialization. 558 * Architecture dependent initialization.
552 */ 559 */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca34dcfe..15e1a7af5dd0 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
129 ks->pass_exception = 1; 129 ks->pass_exception = 1;
130 KDB_FLAG_SET(CATASTROPHIC); 130 KDB_FLAG_SET(CATASTROPHIC);
131 } 131 }
132 /* set CATASTROPHIC if the system contains unresponsive processors */
133 for_each_online_cpu(i)
134 if (!kgdb_info[i].enter_kgdb)
135 KDB_FLAG_SET(CATASTROPHIC);
132 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 136 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
133 KDB_STATE_CLEAR(SSBPT); 137 KDB_STATE_CLEAR(SSBPT);
134 KDB_STATE_CLEAR(DOING_SS); 138 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 379650b984f8..7b40c5f07dce 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -12,6 +12,7 @@
12 */ 12 */
13 13
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/types.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/kernel.h> 17#include <linux/kernel.h>
17#include <linux/kmsg_dump.h> 18#include <linux/kmsg_dump.h>
@@ -23,6 +24,7 @@
23#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
24#include <linux/atomic.h> 25#include <linux/atomic.h>
25#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
26#include <linux/mm.h> 28#include <linux/mm.h>
27#include <linux/init.h> 29#include <linux/init.h>
28#include <linux/kallsyms.h> 30#include <linux/kallsyms.h>
@@ -42,6 +44,12 @@
42#include <linux/slab.h> 44#include <linux/slab.h>
43#include "kdb_private.h" 45#include "kdb_private.h"
44 46
47#undef MODULE_PARAM_PREFIX
48#define MODULE_PARAM_PREFIX "kdb."
49
50static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
51module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
52
45#define GREP_LEN 256 53#define GREP_LEN 256
46char kdb_grep_string[GREP_LEN]; 54char kdb_grep_string[GREP_LEN];
47int kdb_grepping_flag; 55int kdb_grepping_flag;
@@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = {
121 KDBMSG(BADLENGTH, "Invalid length field"), 129 KDBMSG(BADLENGTH, "Invalid length field"),
122 KDBMSG(NOBP, "No Breakpoint exists"), 130 KDBMSG(NOBP, "No Breakpoint exists"),
123 KDBMSG(BADADDR, "Invalid address"), 131 KDBMSG(BADADDR, "Invalid address"),
132 KDBMSG(NOPERM, "Permission denied"),
124}; 133};
125#undef KDBMSG 134#undef KDBMSG
126 135
@@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu)
188} 197}
189 198
190/* 199/*
200 * Check whether the flags of the current command and the permissions
 201 * of the kdb console allow a command to be run.
202 */
203static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
204 bool no_args)
205{
206 /* permissions comes from userspace so needs massaging slightly */
207 permissions &= KDB_ENABLE_MASK;
208 permissions |= KDB_ENABLE_ALWAYS_SAFE;
209
210 /* some commands change group when launched with no arguments */
211 if (no_args)
212 permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT;
213
214 flags |= KDB_ENABLE_ALL;
215
216 return permissions & flags;
217}
218
219/*
191 * kdbgetenv - This function will return the character string value of 220 * kdbgetenv - This function will return the character string value of
192 * an environment variable. 221 * an environment variable.
193 * Parameters: 222 * Parameters:
@@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
476 kdb_symtab_t symtab; 505 kdb_symtab_t symtab;
477 506
478 /* 507 /*
508 * If the enable flags prohibit both arbitrary memory access
509 * and flow control then there are no reasonable grounds to
510 * provide symbol lookup.
511 */
512 if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL,
513 kdb_cmd_enabled, false))
514 return KDB_NOPERM;
515
516 /*
479 * Process arguments which follow the following syntax: 517 * Process arguments which follow the following syntax:
480 * 518 *
481 * symbol | numeric-address [+/- numeric-offset] 519 * symbol | numeric-address [+/- numeric-offset]
@@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
641 if (!s->count) 679 if (!s->count)
642 s->usable = 0; 680 s->usable = 0;
643 if (s->usable) 681 if (s->usable)
644 kdb_register(s->name, kdb_exec_defcmd, 682 /* macros are always safe because when executed each
645 s->usage, s->help, 0); 683 * internal command re-enters kdb_parse() and is
684 * safety checked individually.
685 */
686 kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
687 s->help, 0,
688 KDB_ENABLE_ALWAYS_SAFE);
646 return 0; 689 return 0;
647 } 690 }
648 if (!s->usable) 691 if (!s->usable)
@@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr)
1003 1046
1004 if (i < kdb_max_commands) { 1047 if (i < kdb_max_commands) {
1005 int result; 1048 int result;
1049
1050 if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
1051 return KDB_NOPERM;
1052
1006 KDB_STATE_SET(CMD); 1053 KDB_STATE_SET(CMD);
1007 result = (*tp->cmd_func)(argc-1, (const char **)argv); 1054 result = (*tp->cmd_func)(argc-1, (const char **)argv);
1008 if (result && ignore_errors && result > KDB_CMD_GO) 1055 if (result && ignore_errors && result > KDB_CMD_GO)
1009 result = 0; 1056 result = 0;
1010 KDB_STATE_CLEAR(CMD); 1057 KDB_STATE_CLEAR(CMD);
1011 switch (tp->cmd_repeat) { 1058
1012 case KDB_REPEAT_NONE: 1059 if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
1013 argc = 0; 1060 return result;
1014 if (argv[0]) 1061
1015 *(argv[0]) = '\0'; 1062 argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
1016 break; 1063 if (argv[argc])
1017 case KDB_REPEAT_NO_ARGS: 1064 *(argv[argc]) = '\0';
1018 argc = 1;
1019 if (argv[1])
1020 *(argv[1]) = '\0';
1021 break;
1022 case KDB_REPEAT_WITH_ARGS:
1023 break;
1024 }
1025 return result; 1065 return result;
1026 } 1066 }
1027 1067
@@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv)
1921 */ 1961 */
1922static int kdb_sr(int argc, const char **argv) 1962static int kdb_sr(int argc, const char **argv)
1923{ 1963{
1964 bool check_mask =
1965 !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false);
1966
1924 if (argc != 1) 1967 if (argc != 1)
1925 return KDB_ARGCOUNT; 1968 return KDB_ARGCOUNT;
1969
1926 kdb_trap_printk++; 1970 kdb_trap_printk++;
1927 __handle_sysrq(*argv[1], false); 1971 __handle_sysrq(*argv[1], check_mask);
1928 kdb_trap_printk--; 1972 kdb_trap_printk--;
1929 1973
1930 return 0; 1974 return 0;
@@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv)
1979 kdb_printf("%-20s%8u 0x%p ", mod->name, 2023 kdb_printf("%-20s%8u 0x%p ", mod->name,
1980 mod->core_size, (void *)mod); 2024 mod->core_size, (void *)mod);
1981#ifdef CONFIG_MODULE_UNLOAD 2025#ifdef CONFIG_MODULE_UNLOAD
1982 kdb_printf("%4ld ", module_refcount(mod)); 2026 kdb_printf("%4d ", module_refcount(mod));
1983#endif 2027#endif
1984 if (mod->state == MODULE_STATE_GOING) 2028 if (mod->state == MODULE_STATE_GOING)
1985 kdb_printf(" (Unloading)"); 2029 kdb_printf(" (Unloading)");
@@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void)
2157 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { 2201 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2158 if (!cpu_online(i)) { 2202 if (!cpu_online(i)) {
2159 state = 'F'; /* cpu is offline */ 2203 state = 'F'; /* cpu is offline */
2204 } else if (!kgdb_info[i].enter_kgdb) {
2205 state = 'D'; /* cpu is online but unresponsive */
2160 } else { 2206 } else {
2161 state = ' '; /* cpu is responding to kdb */ 2207 state = ' '; /* cpu is responding to kdb */
2162 if (kdb_task_state_char(KDB_TSK(i)) == 'I') 2208 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
2210 /* 2256 /*
2211 * Validate cpunum 2257 * Validate cpunum
2212 */ 2258 */
2213 if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) 2259 if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
2214 return KDB_BADCPUNUM; 2260 return KDB_BADCPUNUM;
2215 2261
2216 dbg_switch_cpu = cpunum; 2262 dbg_switch_cpu = cpunum;
@@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv)
2375 return 0; 2421 return 0;
2376 if (!kt->cmd_name) 2422 if (!kt->cmd_name)
2377 continue; 2423 continue;
2424 if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
2425 continue;
2378 if (strlen(kt->cmd_usage) > 20) 2426 if (strlen(kt->cmd_usage) > 20)
2379 space = "\n "; 2427 space = "\n ";
2380 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, 2428 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
@@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv)
2629} 2677}
2630 2678
2631/* 2679/*
2632 * kdb_register_repeat - This function is used to register a kernel 2680 * kdb_register_flags - This function is used to register a kernel
2633 * debugger command. 2681 * debugger command.
2634 * Inputs: 2682 * Inputs:
2635 * cmd Command name 2683 * cmd Command name
@@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv)
2641 * zero for success, one if a duplicate command. 2689 * zero for success, one if a duplicate command.
2642 */ 2690 */
2643#define kdb_command_extend 50 /* arbitrary */ 2691#define kdb_command_extend 50 /* arbitrary */
2644int kdb_register_repeat(char *cmd, 2692int kdb_register_flags(char *cmd,
2645 kdb_func_t func, 2693 kdb_func_t func,
2646 char *usage, 2694 char *usage,
2647 char *help, 2695 char *help,
2648 short minlen, 2696 short minlen,
2649 kdb_repeat_t repeat) 2697 kdb_cmdflags_t flags)
2650{ 2698{
2651 int i; 2699 int i;
2652 kdbtab_t *kp; 2700 kdbtab_t *kp;
@@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd,
2694 kp->cmd_func = func; 2742 kp->cmd_func = func;
2695 kp->cmd_usage = usage; 2743 kp->cmd_usage = usage;
2696 kp->cmd_help = help; 2744 kp->cmd_help = help;
2697 kp->cmd_flags = 0;
2698 kp->cmd_minlen = minlen; 2745 kp->cmd_minlen = minlen;
2699 kp->cmd_repeat = repeat; 2746 kp->cmd_flags = flags;
2700 2747
2701 return 0; 2748 return 0;
2702} 2749}
2703EXPORT_SYMBOL_GPL(kdb_register_repeat); 2750EXPORT_SYMBOL_GPL(kdb_register_flags);
2704 2751
2705 2752
2706/* 2753/*
2707 * kdb_register - Compatibility register function for commands that do 2754 * kdb_register - Compatibility register function for commands that do
2708 * not need to specify a repeat state. Equivalent to 2755 * not need to specify a repeat state. Equivalent to
2709 * kdb_register_repeat with KDB_REPEAT_NONE. 2756 * kdb_register_flags with flags set to 0.
2710 * Inputs: 2757 * Inputs:
2711 * cmd Command name 2758 * cmd Command name
2712 * func Function to execute the command 2759 * func Function to execute the command
@@ -2721,8 +2768,7 @@ int kdb_register(char *cmd,
2721 char *help, 2768 char *help,
2722 short minlen) 2769 short minlen)
2723{ 2770{
2724 return kdb_register_repeat(cmd, func, usage, help, minlen, 2771 return kdb_register_flags(cmd, func, usage, help, minlen, 0);
2725 KDB_REPEAT_NONE);
2726} 2772}
2727EXPORT_SYMBOL_GPL(kdb_register); 2773EXPORT_SYMBOL_GPL(kdb_register);
2728 2774
@@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void)
2764 for_each_kdbcmd(kp, i) 2810 for_each_kdbcmd(kp, i)
2765 kp->cmd_name = NULL; 2811 kp->cmd_name = NULL;
2766 2812
2767 kdb_register_repeat("md", kdb_md, "<vaddr>", 2813 kdb_register_flags("md", kdb_md, "<vaddr>",
2768 "Display Memory Contents, also mdWcN, e.g. md8c1", 1, 2814 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2769 KDB_REPEAT_NO_ARGS); 2815 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2770 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", 2816 kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
2771 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); 2817 "Display Raw Memory", 0,
2772 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", 2818 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2773 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); 2819 kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
2774 kdb_register_repeat("mds", kdb_md, "<vaddr>", 2820 "Display Physical Memory", 0,
2775 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); 2821 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2776 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", 2822 kdb_register_flags("mds", kdb_md, "<vaddr>",
2777 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); 2823 "Display Memory Symbolically", 0,
2778 kdb_register_repeat("go", kdb_go, "[<vaddr>]", 2824 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2779 "Continue Execution", 1, KDB_REPEAT_NONE); 2825 kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
2780 kdb_register_repeat("rd", kdb_rd, "", 2826 "Modify Memory Contents", 0,
2781 "Display Registers", 0, KDB_REPEAT_NONE); 2827 KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
2782 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", 2828 kdb_register_flags("go", kdb_go, "[<vaddr>]",
2783 "Modify Registers", 0, KDB_REPEAT_NONE); 2829 "Continue Execution", 1,
2784 kdb_register_repeat("ef", kdb_ef, "<vaddr>", 2830 KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2785 "Display exception frame", 0, KDB_REPEAT_NONE); 2831 kdb_register_flags("rd", kdb_rd, "",
2786 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", 2832 "Display Registers", 0,
2787 "Stack traceback", 1, KDB_REPEAT_NONE); 2833 KDB_ENABLE_REG_READ);
2788 kdb_register_repeat("btp", kdb_bt, "<pid>", 2834 kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
2789 "Display stack for process <pid>", 0, KDB_REPEAT_NONE); 2835 "Modify Registers", 0,
2790 kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", 2836 KDB_ENABLE_REG_WRITE);
2791 "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); 2837 kdb_register_flags("ef", kdb_ef, "<vaddr>",
2792 kdb_register_repeat("btc", kdb_bt, "", 2838 "Display exception frame", 0,
2793 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); 2839 KDB_ENABLE_MEM_READ);
2794 kdb_register_repeat("btt", kdb_bt, "<vaddr>", 2840 kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
2841 "Stack traceback", 1,
2842 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2843 kdb_register_flags("btp", kdb_bt, "<pid>",
2844 "Display stack for process <pid>", 0,
2845 KDB_ENABLE_INSPECT);
2846 kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
2847 "Backtrace all processes matching state flag", 0,
2848 KDB_ENABLE_INSPECT);
2849 kdb_register_flags("btc", kdb_bt, "",
2850 "Backtrace current process on each cpu", 0,
2851 KDB_ENABLE_INSPECT);
2852 kdb_register_flags("btt", kdb_bt, "<vaddr>",
2795 "Backtrace process given its struct task address", 0, 2853 "Backtrace process given its struct task address", 0,
2796 KDB_REPEAT_NONE); 2854 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2797 kdb_register_repeat("env", kdb_env, "", 2855 kdb_register_flags("env", kdb_env, "",
2798 "Show environment variables", 0, KDB_REPEAT_NONE); 2856 "Show environment variables", 0,
2799 kdb_register_repeat("set", kdb_set, "", 2857 KDB_ENABLE_ALWAYS_SAFE);
2800 "Set environment variables", 0, KDB_REPEAT_NONE); 2858 kdb_register_flags("set", kdb_set, "",
2801 kdb_register_repeat("help", kdb_help, "", 2859 "Set environment variables", 0,
2802 "Display Help Message", 1, KDB_REPEAT_NONE); 2860 KDB_ENABLE_ALWAYS_SAFE);
2803 kdb_register_repeat("?", kdb_help, "", 2861 kdb_register_flags("help", kdb_help, "",
2804 "Display Help Message", 0, KDB_REPEAT_NONE); 2862 "Display Help Message", 1,
2805 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", 2863 KDB_ENABLE_ALWAYS_SAFE);
2806 "Switch to new cpu", 0, KDB_REPEAT_NONE); 2864 kdb_register_flags("?", kdb_help, "",
2807 kdb_register_repeat("kgdb", kdb_kgdb, "", 2865 "Display Help Message", 0,
2808 "Enter kgdb mode", 0, KDB_REPEAT_NONE); 2866 KDB_ENABLE_ALWAYS_SAFE);
2809 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", 2867 kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
2810 "Display active task list", 0, KDB_REPEAT_NONE); 2868 "Switch to new cpu", 0,
2811 kdb_register_repeat("pid", kdb_pid, "<pidnum>", 2869 KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2812 "Switch to another task", 0, KDB_REPEAT_NONE); 2870 kdb_register_flags("kgdb", kdb_kgdb, "",
2813 kdb_register_repeat("reboot", kdb_reboot, "", 2871 "Enter kgdb mode", 0, 0);
2814 "Reboot the machine immediately", 0, KDB_REPEAT_NONE); 2872 kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
2873 "Display active task list", 0,
2874 KDB_ENABLE_INSPECT);
2875 kdb_register_flags("pid", kdb_pid, "<pidnum>",
2876 "Switch to another task", 0,
2877 KDB_ENABLE_INSPECT);
2878 kdb_register_flags("reboot", kdb_reboot, "",
2879 "Reboot the machine immediately", 0,
2880 KDB_ENABLE_REBOOT);
2815#if defined(CONFIG_MODULES) 2881#if defined(CONFIG_MODULES)
2816 kdb_register_repeat("lsmod", kdb_lsmod, "", 2882 kdb_register_flags("lsmod", kdb_lsmod, "",
2817 "List loaded kernel modules", 0, KDB_REPEAT_NONE); 2883 "List loaded kernel modules", 0,
2884 KDB_ENABLE_INSPECT);
2818#endif 2885#endif
2819#if defined(CONFIG_MAGIC_SYSRQ) 2886#if defined(CONFIG_MAGIC_SYSRQ)
2820 kdb_register_repeat("sr", kdb_sr, "<key>", 2887 kdb_register_flags("sr", kdb_sr, "<key>",
2821 "Magic SysRq key", 0, KDB_REPEAT_NONE); 2888 "Magic SysRq key", 0,
2889 KDB_ENABLE_ALWAYS_SAFE);
2822#endif 2890#endif
2823#if defined(CONFIG_PRINTK) 2891#if defined(CONFIG_PRINTK)
2824 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2892 kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
2825 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2893 "Display syslog buffer", 0,
2894 KDB_ENABLE_ALWAYS_SAFE);
2826#endif 2895#endif
2827 if (arch_kgdb_ops.enable_nmi) { 2896 if (arch_kgdb_ops.enable_nmi) {
2828 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", 2897 kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
2829 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); 2898 "Disable NMI entry to KDB", 0,
2830 } 2899 KDB_ENABLE_ALWAYS_SAFE);
2831 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2900 }
2832 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2901 kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2833 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2902 "Define a set of commands, down to endefcmd", 0,
2834 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2903 KDB_ENABLE_ALWAYS_SAFE);
2835 kdb_register_repeat("summary", kdb_summary, "", 2904 kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
2836 "Summarize the system", 4, KDB_REPEAT_NONE); 2905 "Send a signal to a process", 0,
2837 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", 2906 KDB_ENABLE_SIGNAL);
2838 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2907 kdb_register_flags("summary", kdb_summary, "",
2839 kdb_register_repeat("grephelp", kdb_grep_help, "", 2908 "Summarize the system", 4,
2840 "Display help on | grep", 0, KDB_REPEAT_NONE); 2909 KDB_ENABLE_ALWAYS_SAFE);
2910 kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2911 "Display per_cpu variables", 3,
2912 KDB_ENABLE_MEM_READ);
2913 kdb_register_flags("grephelp", kdb_grep_help, "",
2914 "Display help on | grep", 0,
2915 KDB_ENABLE_ALWAYS_SAFE);
2841} 2916}
2842 2917
2843/* Execute any commands defined in kdb_cmds. */ 2918/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 7afd3c8c41d5..eaacd1693954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -172,10 +172,9 @@ typedef struct _kdbtab {
172 kdb_func_t cmd_func; /* Function to execute command */ 172 kdb_func_t cmd_func; /* Function to execute command */
173 char *cmd_usage; /* Usage String for this command */ 173 char *cmd_usage; /* Usage String for this command */
174 char *cmd_help; /* Help message for this command */ 174 char *cmd_help; /* Help message for this command */
175 short cmd_flags; /* Parsing flags */
176 short cmd_minlen; /* Minimum legal # command 175 short cmd_minlen; /* Minimum legal # command
177 * chars required */ 176 * chars required */
178 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ 177 kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
179} kdbtab_t; 178} kdbtab_t;
180 179
181extern int kdb_bt(int, const char **); /* KDB display back trace */ 180extern int kdb_bt(int, const char **); /* KDB display back trace */
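Note: the kdb hunks above convert every kdb_register_repeat(..., KDB_REPEAT_NONE) call to kdb_register_flags() with a KDB_ENABLE_* permission mask, and kdbtab_t drops cmd_repeat in favour of a kdb_cmdflags_t cmd_flags field. A minimal sketch of registering a command against the new interface follows; kdb_register_flags(), kdb_printf() and KDB_ENABLE_MEM_READ are taken from the hunks above, while the "mycmd" command itself is invented for illustration.

        #include <linux/kdb.h>

        static int kdb_mycmd(int argc, const char **argv)
        {
                kdb_printf("mycmd: %d argument(s)\n", argc);
                return 0;
        }

        /* minlen 0; permission mask picked to match the read-only commands above */
        static int register_mycmd(void)
        {
                return kdb_register_flags("mycmd", kdb_mycmd, "[<arg>]",
                                          "Example command", 0,
                                          KDB_ENABLE_MEM_READ);
        }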
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e19d3ebc29c..882f835a0d85 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -614,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
614 if (!f.file) 614 if (!f.file)
615 return -EBADF; 615 return -EBADF;
616 616
617 css = css_tryget_online_from_dir(f.file->f_dentry, 617 css = css_tryget_online_from_dir(f.file->f_path.dentry,
618 &perf_event_cgrp_subsys); 618 &perf_event_cgrp_subsys);
619 if (IS_ERR(css)) { 619 if (IS_ERR(css)) {
620 ret = PTR_ERR(css); 620 ret = PTR_ERR(css);
@@ -4461,18 +4461,14 @@ perf_output_sample_regs(struct perf_output_handle *handle,
4461} 4461}
4462 4462
4463static void perf_sample_regs_user(struct perf_regs *regs_user, 4463static void perf_sample_regs_user(struct perf_regs *regs_user,
4464 struct pt_regs *regs) 4464 struct pt_regs *regs,
4465 struct pt_regs *regs_user_copy)
4465{ 4466{
4466 if (!user_mode(regs)) { 4467 if (user_mode(regs)) {
4467 if (current->mm) 4468 regs_user->abi = perf_reg_abi(current);
4468 regs = task_pt_regs(current);
4469 else
4470 regs = NULL;
4471 }
4472
4473 if (regs) {
4474 regs_user->abi = perf_reg_abi(current);
4475 regs_user->regs = regs; 4469 regs_user->regs = regs;
4470 } else if (current->mm) {
4471 perf_get_regs_user(regs_user, regs, regs_user_copy);
4476 } else { 4472 } else {
4477 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 4473 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4478 regs_user->regs = NULL; 4474 regs_user->regs = NULL;
@@ -4951,7 +4947,8 @@ void perf_prepare_sample(struct perf_event_header *header,
4951 } 4947 }
4952 4948
4953 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) 4949 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
4954 perf_sample_regs_user(&data->regs_user, regs); 4950 perf_sample_regs_user(&data->regs_user, regs,
4951 &data->regs_user_copy);
4955 4952
4956 if (sample_type & PERF_SAMPLE_REGS_USER) { 4953 if (sample_type & PERF_SAMPLE_REGS_USER) {
4957 /* regs dump ABI info */ 4954 /* regs dump ABI info */
@@ -7477,11 +7474,11 @@ SYSCALL_DEFINE5(perf_event_open,
7477 7474
7478 if (move_group) { 7475 if (move_group) {
7479 synchronize_rcu(); 7476 synchronize_rcu();
7480 perf_install_in_context(ctx, group_leader, event->cpu); 7477 perf_install_in_context(ctx, group_leader, group_leader->cpu);
7481 get_ctx(ctx); 7478 get_ctx(ctx);
7482 list_for_each_entry(sibling, &group_leader->sibling_list, 7479 list_for_each_entry(sibling, &group_leader->sibling_list,
7483 group_entry) { 7480 group_entry) {
7484 perf_install_in_context(ctx, sibling, event->cpu); 7481 perf_install_in_context(ctx, sibling, sibling->cpu);
7485 get_ctx(ctx); 7482 get_ctx(ctx);
7486 } 7483 }
7487 } 7484 }
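Note: perf_sample_regs_user() above now defers kernel-mode samples to an architecture hook, perf_get_regs_user(), and hands it a scratch pt_regs (regs_user_copy) to fill in. The sketch below is only a guess at what a minimal implementation of that hook might look like (it ignores the scratch copy and reuses task_pt_regs(), as the old generic code did); it is not taken from this diff.

        #include <linux/perf_event.h>
        #include <linux/perf_regs.h>
        #include <linux/sched.h>

        void perf_get_regs_user(struct perf_regs *regs_user,
                                struct pt_regs *regs,
                                struct pt_regs *regs_user_copy)
        {
                /* assumption: the saved user registers are good enough as-is */
                regs_user->abi  = perf_reg_abi(current);
                regs_user->regs = task_pt_regs(current);
        }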
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ed8f2cde34c5..cb346f26a22d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
193 } 193 }
194 194
195 flush_cache_page(vma, addr, pte_pfn(*ptep)); 195 flush_cache_page(vma, addr, pte_pfn(*ptep));
196 ptep_clear_flush(vma, addr, ptep); 196 ptep_clear_flush_notify(vma, addr, ptep);
197 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 197 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
198 198
199 page_remove_rmap(page); 199 page_remove_rmap(page);
@@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
724 int more = 0; 724 int more = 0;
725 725
726 again: 726 again:
727 mutex_lock(&mapping->i_mmap_mutex); 727 i_mmap_lock_read(mapping);
728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
729 if (!valid_vma(vma, is_register)) 729 if (!valid_vma(vma, is_register))
730 continue; 730 continue;
731 731
732 if (!prev && !more) { 732 if (!prev && !more) {
733 /* 733 /*
734 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through 734 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
735 * reclaim. This is optimistic, no harm done if it fails. 735 * reclaim. This is optimistic, no harm done if it fails.
736 */ 736 */
737 prev = kmalloc(sizeof(struct map_info), 737 prev = kmalloc(sizeof(struct map_info),
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
755 info->mm = vma->vm_mm; 755 info->mm = vma->vm_mm;
756 info->vaddr = offset_to_vaddr(vma, offset); 756 info->vaddr = offset_to_vaddr(vma, offset);
757 } 757 }
758 mutex_unlock(&mapping->i_mmap_mutex); 758 i_mmap_unlock_read(mapping);
759 759
760 if (!more) 760 if (!more)
761 goto out; 761 goto out;
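Note: the uprobes hunk above (and the fork.c hunk further down) switch from mutex_lock(&mapping->i_mmap_mutex) to the new i_mmap_lock_read()/i_mmap_lock_write() helpers, so interval-tree walkers take the mapping lock shared. A minimal sketch of the read-side pattern, assuming the helpers are plain rwsem wrappers (the walk body is illustrative only):

        #include <linux/fs.h>
        #include <linux/mm.h>

        static void walk_mappings(struct address_space *mapping, pgoff_t pgoff)
        {
                struct vm_area_struct *vma;

                i_mmap_lock_read(mapping);      /* shared: walkers may run in parallel */
                vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                        /* inspect vma; writers such as dup_mmap() take the write side */
                }
                i_mmap_unlock_read(mapping);
        }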
diff --git a/kernel/exit.c b/kernel/exit.c
index 232c4bc8bcc9..6806c55475ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk)
118 } 118 }
119 119
120 /* 120 /*
121 * Accumulate here the counters for all threads but the group leader 121 * Accumulate here the counters for all threads as they die. We could
122 * as they die, so they can be added into the process-wide totals 122 * skip the group leader because it is the last user of signal_struct,
123 * when those are taken. The group leader stays around as a zombie as 123 * but we want to avoid the race with thread_group_cputime() which can
124 * long as there are other threads. When it gets reaped, the exit.c 124 * see the empty ->thread_head list.
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */ 125 */
129 task_cputime(tsk, &utime, &stime); 126 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock); 127 write_seqlock(&sig->stats_lock);
@@ -215,27 +212,6 @@ repeat:
215} 212}
216 213
217/* 214/*
218 * This checks not only the pgrp, but falls back on the pid if no
219 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
220 * without this...
221 *
222 * The caller must hold rcu lock or the tasklist lock.
223 */
224struct pid *session_of_pgrp(struct pid *pgrp)
225{
226 struct task_struct *p;
227 struct pid *sid = NULL;
228
229 p = pid_task(pgrp, PIDTYPE_PGID);
230 if (p == NULL)
231 p = pid_task(pgrp, PIDTYPE_PID);
232 if (p != NULL)
233 sid = task_session(p);
234
235 return sid;
236}
237
238/*
239 * Determine if a process group is "orphaned", according to the POSIX 215 * Determine if a process group is "orphaned", according to the POSIX
240 * definition in 2.2.2.52. Orphaned process groups are not to be affected 216 * definition in 2.2.2.52. Orphaned process groups are not to be affected
241 * by terminal-generated stop signals. Newly orphaned process groups are 217 * by terminal-generated stop signals. Newly orphaned process groups are
@@ -462,6 +438,44 @@ static void exit_mm(struct task_struct *tsk)
462 clear_thread_flag(TIF_MEMDIE); 438 clear_thread_flag(TIF_MEMDIE);
463} 439}
464 440
441static struct task_struct *find_alive_thread(struct task_struct *p)
442{
443 struct task_struct *t;
444
445 for_each_thread(p, t) {
446 if (!(t->flags & PF_EXITING))
447 return t;
448 }
449 return NULL;
450}
451
452static struct task_struct *find_child_reaper(struct task_struct *father)
453 __releases(&tasklist_lock)
454 __acquires(&tasklist_lock)
455{
456 struct pid_namespace *pid_ns = task_active_pid_ns(father);
457 struct task_struct *reaper = pid_ns->child_reaper;
458
459 if (likely(reaper != father))
460 return reaper;
461
462 reaper = find_alive_thread(father);
463 if (reaper) {
464 pid_ns->child_reaper = reaper;
465 return reaper;
466 }
467
468 write_unlock_irq(&tasklist_lock);
469 if (unlikely(pid_ns == &init_pid_ns)) {
470 panic("Attempted to kill init! exitcode=0x%08x\n",
471 father->signal->group_exit_code ?: father->exit_code);
472 }
473 zap_pid_ns_processes(pid_ns);
474 write_lock_irq(&tasklist_lock);
475
476 return father;
477}
478
465/* 479/*
466 * When we die, we re-parent all our children, and try to: 480 * When we die, we re-parent all our children, and try to:
467 * 1. give them to another thread in our thread group, if such a member exists 481 * 1. give them to another thread in our thread group, if such a member exists
@@ -469,58 +483,36 @@ static void exit_mm(struct task_struct *tsk)
469 * child_subreaper for its children (like a service manager) 483 * child_subreaper for its children (like a service manager)
470 * 3. give it to the init process (PID 1) in our pid namespace 484 * 3. give it to the init process (PID 1) in our pid namespace
471 */ 485 */
472static struct task_struct *find_new_reaper(struct task_struct *father) 486static struct task_struct *find_new_reaper(struct task_struct *father,
473 __releases(&tasklist_lock) 487 struct task_struct *child_reaper)
474 __acquires(&tasklist_lock)
475{ 488{
476 struct pid_namespace *pid_ns = task_active_pid_ns(father); 489 struct task_struct *thread, *reaper;
477 struct task_struct *thread;
478 490
479 thread = father; 491 thread = find_alive_thread(father);
480 while_each_thread(father, thread) { 492 if (thread)
481 if (thread->flags & PF_EXITING)
482 continue;
483 if (unlikely(pid_ns->child_reaper == father))
484 pid_ns->child_reaper = thread;
485 return thread; 493 return thread;
486 }
487
488 if (unlikely(pid_ns->child_reaper == father)) {
489 write_unlock_irq(&tasklist_lock);
490 if (unlikely(pid_ns == &init_pid_ns)) {
491 panic("Attempted to kill init! exitcode=0x%08x\n",
492 father->signal->group_exit_code ?:
493 father->exit_code);
494 }
495
496 zap_pid_ns_processes(pid_ns);
497 write_lock_irq(&tasklist_lock);
498 } else if (father->signal->has_child_subreaper) {
499 struct task_struct *reaper;
500 494
495 if (father->signal->has_child_subreaper) {
501 /* 496 /*
502 * Find the first ancestor marked as child_subreaper. 497 * Find the first ->is_child_subreaper ancestor in our pid_ns.
503 * Note that the code below checks same_thread_group(reaper, 498 * We start from father to ensure we can not look into another
504 * pid_ns->child_reaper). This is what we need to DTRT in a 499 * namespace, this is safe because all its threads are dead.
505 * PID namespace. However we still need the check above, see
506 * http://marc.info/?l=linux-kernel&m=131385460420380
507 */ 500 */
508 for (reaper = father->real_parent; 501 for (reaper = father;
509 reaper != &init_task; 502 !same_thread_group(reaper, child_reaper);
510 reaper = reaper->real_parent) { 503 reaper = reaper->real_parent) {
511 if (same_thread_group(reaper, pid_ns->child_reaper)) 504 /* call_usermodehelper() descendants need this check */
505 if (reaper == &init_task)
512 break; 506 break;
513 if (!reaper->signal->is_child_subreaper) 507 if (!reaper->signal->is_child_subreaper)
514 continue; 508 continue;
515 thread = reaper; 509 thread = find_alive_thread(reaper);
516 do { 510 if (thread)
517 if (!(thread->flags & PF_EXITING)) 511 return thread;
518 return reaper;
519 } while_each_thread(reaper, thread);
520 } 512 }
521 } 513 }
522 514
523 return pid_ns->child_reaper; 515 return child_reaper;
524} 516}
525 517
526/* 518/*
@@ -529,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
529static void reparent_leader(struct task_struct *father, struct task_struct *p, 521static void reparent_leader(struct task_struct *father, struct task_struct *p,
530 struct list_head *dead) 522 struct list_head *dead)
531{ 523{
532 list_move_tail(&p->sibling, &p->real_parent->children); 524 if (unlikely(p->exit_state == EXIT_DEAD))
533
534 if (p->exit_state == EXIT_DEAD)
535 return;
536 /*
537 * If this is a threaded reparent there is no need to
538 * notify anyone anything has happened.
539 */
540 if (same_thread_group(p->real_parent, father))
541 return; 525 return;
542 526
543 /* We don't want people slaying init. */ 527 /* We don't want people slaying init. */
@@ -548,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
548 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 532 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
549 if (do_notify_parent(p, p->exit_signal)) { 533 if (do_notify_parent(p, p->exit_signal)) {
550 p->exit_state = EXIT_DEAD; 534 p->exit_state = EXIT_DEAD;
551 list_move_tail(&p->sibling, dead); 535 list_add(&p->ptrace_entry, dead);
552 } 536 }
553 } 537 }
554 538
555 kill_orphaned_pgrp(p, father); 539 kill_orphaned_pgrp(p, father);
556} 540}
557 541
558static void forget_original_parent(struct task_struct *father) 542/*
543 * This does two things:
544 *
545 * A. Make init inherit all the child processes
546 * B. Check to see if any process groups have become orphaned
547 * as a result of our exiting, and if they have any stopped
548 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
549 */
550static void forget_original_parent(struct task_struct *father,
551 struct list_head *dead)
559{ 552{
560 struct task_struct *p, *n, *reaper; 553 struct task_struct *p, *t, *reaper;
561 LIST_HEAD(dead_children);
562 554
563 write_lock_irq(&tasklist_lock); 555 if (unlikely(!list_empty(&father->ptraced)))
564 /* 556 exit_ptrace(father, dead);
565 * Note that exit_ptrace() and find_new_reaper() might
566 * drop tasklist_lock and reacquire it.
567 */
568 exit_ptrace(father);
569 reaper = find_new_reaper(father);
570 557
571 list_for_each_entry_safe(p, n, &father->children, sibling) { 558 /* Can drop and reacquire tasklist_lock */
572 struct task_struct *t = p; 559 reaper = find_child_reaper(father);
560 if (list_empty(&father->children))
561 return;
573 562
574 do { 563 reaper = find_new_reaper(father, reaper);
564 list_for_each_entry(p, &father->children, sibling) {
565 for_each_thread(p, t) {
575 t->real_parent = reaper; 566 t->real_parent = reaper;
576 if (t->parent == father) { 567 BUG_ON((!t->ptrace) != (t->parent == father));
577 BUG_ON(t->ptrace); 568 if (likely(!t->ptrace))
578 t->parent = t->real_parent; 569 t->parent = t->real_parent;
579 }
580 if (t->pdeath_signal) 570 if (t->pdeath_signal)
581 group_send_sig_info(t->pdeath_signal, 571 group_send_sig_info(t->pdeath_signal,
582 SEND_SIG_NOINFO, t); 572 SEND_SIG_NOINFO, t);
583 } while_each_thread(p, t); 573 }
584 reparent_leader(father, p, &dead_children); 574 /*
585 } 575 * If this is a threaded reparent there is no need to
586 write_unlock_irq(&tasklist_lock); 576 * notify anyone anything has happened.
587 577 */
588 BUG_ON(!list_empty(&father->children)); 578 if (!same_thread_group(reaper, father))
589 579 reparent_leader(father, p, dead);
590 list_for_each_entry_safe(p, n, &dead_children, sibling) {
591 list_del_init(&p->sibling);
592 release_task(p);
593 } 580 }
581 list_splice_tail_init(&father->children, &reaper->children);
594} 582}
595 583
596/* 584/*
@@ -600,18 +588,12 @@ static void forget_original_parent(struct task_struct *father)
600static void exit_notify(struct task_struct *tsk, int group_dead) 588static void exit_notify(struct task_struct *tsk, int group_dead)
601{ 589{
602 bool autoreap; 590 bool autoreap;
603 591 struct task_struct *p, *n;
604 /* 592 LIST_HEAD(dead);
605 * This does two things:
606 *
607 * A. Make init inherit all the child processes
608 * B. Check to see if any process groups have become orphaned
609 * as a result of our exiting, and if they have any stopped
610 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
611 */
612 forget_original_parent(tsk);
613 593
614 write_lock_irq(&tasklist_lock); 594 write_lock_irq(&tasklist_lock);
595 forget_original_parent(tsk, &dead);
596
615 if (group_dead) 597 if (group_dead)
616 kill_orphaned_pgrp(tsk->group_leader, NULL); 598 kill_orphaned_pgrp(tsk->group_leader, NULL);
617 599
@@ -629,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
629 } 611 }
630 612
631 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; 613 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
614 if (tsk->exit_state == EXIT_DEAD)
615 list_add(&tsk->ptrace_entry, &dead);
632 616
633 /* mt-exec, de_thread() is waiting for group leader */ 617 /* mt-exec, de_thread() is waiting for group leader */
634 if (unlikely(tsk->signal->notify_count < 0)) 618 if (unlikely(tsk->signal->notify_count < 0))
635 wake_up_process(tsk->signal->group_exit_task); 619 wake_up_process(tsk->signal->group_exit_task);
636 write_unlock_irq(&tasklist_lock); 620 write_unlock_irq(&tasklist_lock);
637 621
638 /* If the process is dead, release it - nobody will wait for it */ 622 list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
639 if (autoreap) 623 list_del_init(&p->ptrace_entry);
640 release_task(tsk); 624 release_task(p);
625 }
641} 626}
642 627
643#ifdef CONFIG_DEBUG_STACK_USAGE 628#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -982,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
982 */ 967 */
983static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 968static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
984{ 969{
985 unsigned long state; 970 int state, retval, status;
986 int retval, status, traced;
987 pid_t pid = task_pid_vnr(p); 971 pid_t pid = task_pid_vnr(p);
988 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 972 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
989 struct siginfo __user *infop; 973 struct siginfo __user *infop;
@@ -1008,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1008 } 992 }
1009 return wait_noreap_copyout(wo, p, pid, uid, why, status); 993 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1010 } 994 }
1011
1012 traced = ptrace_reparented(p);
1013 /* 995 /*
1014 * Move the task's state to DEAD/TRACE, only one thread can do this. 996 * Move the task's state to DEAD/TRACE, only one thread can do this.
1015 */ 997 */
1016 state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; 998 state = (ptrace_reparented(p) && thread_group_leader(p)) ?
999 EXIT_TRACE : EXIT_DEAD;
1017 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) 1000 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1018 return 0; 1001 return 0;
1019 /* 1002 /*
1020 * It can be ptraced but not reparented, check 1003 * We own this thread, nobody else can reap it.
1021 * thread_group_leader() to filter out sub-threads. 1004 */
1005 read_unlock(&tasklist_lock);
1006 sched_annotate_sleep();
1007
1008 /*
1009 * Check thread_group_leader() to exclude the traced sub-threads.
1022 */ 1010 */
1023 if (likely(!traced) && thread_group_leader(p)) { 1011 if (state == EXIT_DEAD && thread_group_leader(p)) {
1024 struct signal_struct *psig; 1012 struct signal_struct *sig = p->signal;
1025 struct signal_struct *sig; 1013 struct signal_struct *psig = current->signal;
1026 unsigned long maxrss; 1014 unsigned long maxrss;
1027 cputime_t tgutime, tgstime; 1015 cputime_t tgutime, tgstime;
1028 1016
@@ -1034,21 +1022,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1034 * accumulate in the parent's signal_struct c* fields. 1022 * accumulate in the parent's signal_struct c* fields.
1035 * 1023 *
1036 * We don't bother to take a lock here to protect these 1024 * We don't bother to take a lock here to protect these
1037 * p->signal fields, because they are only touched by 1025 * p->signal fields because the whole thread group is dead
1038 * __exit_signal, which runs with tasklist_lock 1026 * and nobody can change them.
1039 * write-locked anyway, and so is excluded here. We do 1027 *
1040 * need to protect the access to parent->signal fields, 1028 * psig->stats_lock also protects us from our sub-theads
1041 * as other threads in the parent group can be right 1029 * which can reap other children at the same time. Until
1042 * here reaping other children at the same time. 1030 * we change k_getrusage()-like users to rely on this lock
1031 * we have to take ->siglock as well.
1043 * 1032 *
1044 * We use thread_group_cputime_adjusted() to get times for 1033 * We use thread_group_cputime_adjusted() to get times for
1045 * the thread group, which consolidates times for all threads 1034 * the thread group, which consolidates times for all threads
1046 * in the group including the group leader. 1035 * in the group including the group leader.
1047 */ 1036 */
1048 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1037 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1049 spin_lock_irq(&p->real_parent->sighand->siglock); 1038 spin_lock_irq(&current->sighand->siglock);
1050 psig = p->real_parent->signal;
1051 sig = p->signal;
1052 write_seqlock(&psig->stats_lock); 1039 write_seqlock(&psig->stats_lock);
1053 psig->cutime += tgutime + sig->cutime; 1040 psig->cutime += tgutime + sig->cutime;
1054 psig->cstime += tgstime + sig->cstime; 1041 psig->cstime += tgstime + sig->cstime;
@@ -1073,16 +1060,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1073 task_io_accounting_add(&psig->ioac, &p->ioac); 1060 task_io_accounting_add(&psig->ioac, &p->ioac);
1074 task_io_accounting_add(&psig->ioac, &sig->ioac); 1061 task_io_accounting_add(&psig->ioac, &sig->ioac);
1075 write_sequnlock(&psig->stats_lock); 1062 write_sequnlock(&psig->stats_lock);
1076 spin_unlock_irq(&p->real_parent->sighand->siglock); 1063 spin_unlock_irq(&current->sighand->siglock);
1077 } 1064 }
1078 1065
1079 /*
1080 * Now we are sure this task is interesting, and no other
1081 * thread can reap it because we its state == DEAD/TRACE.
1082 */
1083 read_unlock(&tasklist_lock);
1084 sched_annotate_sleep();
1085
1086 retval = wo->wo_rusage 1066 retval = wo->wo_rusage
1087 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1067 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1088 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1068 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
@@ -1307,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1307static int wait_consider_task(struct wait_opts *wo, int ptrace, 1287static int wait_consider_task(struct wait_opts *wo, int ptrace,
1308 struct task_struct *p) 1288 struct task_struct *p)
1309{ 1289{
1290 /*
1291 * We can race with wait_task_zombie() from another thread.
1292 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1293 * can't confuse the checks below.
1294 */
1295 int exit_state = ACCESS_ONCE(p->exit_state);
1310 int ret; 1296 int ret;
1311 1297
1312 if (unlikely(p->exit_state == EXIT_DEAD)) 1298 if (unlikely(exit_state == EXIT_DEAD))
1313 return 0; 1299 return 0;
1314 1300
1315 ret = eligible_child(wo, p); 1301 ret = eligible_child(wo, p);
@@ -1330,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1330 return 0; 1316 return 0;
1331 } 1317 }
1332 1318
1333 if (unlikely(p->exit_state == EXIT_TRACE)) { 1319 if (unlikely(exit_state == EXIT_TRACE)) {
1334 /* 1320 /*
1335 * ptrace == 0 means we are the natural parent. In this case 1321 * ptrace == 0 means we are the natural parent. In this case
1336 * we should clear notask_error, debugger will notify us. 1322 * we should clear notask_error, debugger will notify us.
@@ -1357,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1357 } 1343 }
1358 1344
1359 /* slay zombie? */ 1345 /* slay zombie? */
1360 if (p->exit_state == EXIT_ZOMBIE) { 1346 if (exit_state == EXIT_ZOMBIE) {
1361 /* we don't reap group leaders with subthreads */ 1347 /* we don't reap group leaders with subthreads */
1362 if (!delay_group_leader(p)) { 1348 if (!delay_group_leader(p)) {
1363 /* 1349 /*
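Note: the exit.c hunks above make wait_task_zombie() claim a zombie with a single cmpxchg on ->exit_state, and wait_consider_task() samples the state once through ACCESS_ONCE() so the EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition cannot confuse its checks. A minimal sketch of that single-winner claim idiom (the helper name is invented):

        #include <linux/sched.h>

        static bool claim_zombie(struct task_struct *p, int new_state)
        {
                /* exactly one waiter sees the old value EXIT_ZOMBIE and wins */
                return cmpxchg(&p->exit_state, EXIT_ZOMBIE, new_state) == EXIT_ZOMBIE;
        }

Whichever thread wins the cmpxchg owns the reap, which is why the hunk can drop tasklist_lock immediately after the transition and do the accounting without further locking of p->signal.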
diff --git a/kernel/extable.c b/kernel/extable.c
index d8a6446adbcb..c98f926277a8 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,6 +18,7 @@
18#include <linux/ftrace.h> 18#include <linux/ftrace.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/ftrace.h>
21#include <linux/mutex.h> 22#include <linux/mutex.h>
22#include <linux/init.h> 23#include <linux/init.h>
23 24
@@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr)
102 return 1; 103 return 1;
103 if (is_module_text_address(addr)) 104 if (is_module_text_address(addr))
104 return 1; 105 return 1;
106 if (is_ftrace_trampoline(addr))
107 return 1;
105 /* 108 /*
106 * There might be init symbols in saved stacktraces. 109 * There might be init symbols in saved stacktraces.
107 * Give those symbols a chance to be printed in 110 * Give those symbols a chance to be printed in
@@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr)
119{ 122{
120 if (core_kernel_text(addr)) 123 if (core_kernel_text(addr))
121 return 1; 124 return 1;
122 return is_module_text_address(addr); 125 if (is_module_text_address(addr))
126 return 1;
127 return is_ftrace_trampoline(addr);
123} 128}
124 129
125/* 130/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ca84189cfc2..4dc2ddade9f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
433 get_file(file); 433 get_file(file);
434 if (tmp->vm_flags & VM_DENYWRITE) 434 if (tmp->vm_flags & VM_DENYWRITE)
435 atomic_dec(&inode->i_writecount); 435 atomic_dec(&inode->i_writecount);
436 mutex_lock(&mapping->i_mmap_mutex); 436 i_mmap_lock_write(mapping);
437 if (tmp->vm_flags & VM_SHARED) 437 if (tmp->vm_flags & VM_SHARED)
438 atomic_inc(&mapping->i_mmap_writable); 438 atomic_inc(&mapping->i_mmap_writable);
439 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
@@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
445 vma_interval_tree_insert_after(tmp, mpnt, 445 vma_interval_tree_insert_after(tmp, mpnt,
446 &mapping->i_mmap); 446 &mapping->i_mmap);
447 flush_dcache_mmap_unlock(mapping); 447 flush_dcache_mmap_unlock(mapping);
448 mutex_unlock(&mapping->i_mmap_mutex); 448 i_mmap_unlock_write(mapping);
449 } 449 }
450 450
451 /* 451 /*
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3b7408759bdf..c92e44855ddd 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -32,10 +32,13 @@ config GCOV_KERNEL
32 Note that the debugfs filesystem has to be mounted to access 32 Note that the debugfs filesystem has to be mounted to access
33 profiling data. 33 profiling data.
34 34
35config ARCH_HAS_GCOV_PROFILE_ALL
36 def_bool n
37
35config GCOV_PROFILE_ALL 38config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 39 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 40 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 41 depends on ARCH_HAS_GCOV_PROFILE_ALL
39 default n 42 default n
40 ---help--- 43 ---help---
41 This options activates profiling for the entire kernel. 44 This options activates profiling for the entire kernel.
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f86cfa..664411f171b5 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
9#include <linux/user_namespace.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
10 11
11/* init to 2 - one for init_task, one to ensure it is never freed */ 12/* init to 2 - one for init_task, one to ensure it is never freed */
@@ -213,6 +214,14 @@ out:
213 return i; 214 return i;
214} 215}
215 216
217bool may_setgroups(void)
218{
219 struct user_namespace *user_ns = current_user_ns();
220
221 return ns_capable(user_ns, CAP_SETGID) &&
222 userns_may_setgroups(user_ns);
223}
224
216/* 225/*
217 * SMP: Our groups are copy-on-write. We can set them safely 226 * SMP: Our groups are copy-on-write. We can set them safely
218 * without another task interfering. 227 * without another task interfering.
@@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
223 struct group_info *group_info; 232 struct group_info *group_info;
224 int retval; 233 int retval;
225 234
226 if (!ns_capable(current_user_ns(), CAP_SETGID)) 235 if (!may_setgroups())
227 return -EPERM; 236 return -EPERM;
228 if ((unsigned)gidsetsize > NGROUPS_MAX) 237 if ((unsigned)gidsetsize > NGROUPS_MAX)
229 return -EINVAL; 238 return -EINVAL;
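Note: setgroups(2) above is now gated on may_setgroups(), i.e. CAP_SETGID in the caller's user namespace plus userns_may_setgroups(). A sketch of how an in-kernel caller that changes the supplementary group list might apply the same gate; the function name is hypothetical and the header providing the may_setgroups() declaration is assumed.

        #include <linux/cred.h>

        static int my_set_supplementary_groups(struct group_info *gi)
        {
                if (!may_setgroups())
                        return -EPERM;  /* CAP_SETGID alone is no longer enough */
                return set_current_groups(gi);
        }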
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4332d766619d..df553b0af936 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
78 78
79#ifdef CONFIG_SPARSE_IRQ 79#ifdef CONFIG_SPARSE_IRQ
80static inline void irq_mark_irq(unsigned int irq) { } 80static inline void irq_mark_irq(unsigned int irq) { }
81extern void irq_lock_sparse(void);
82extern void irq_unlock_sparse(void);
81#else 83#else
82extern void irq_mark_irq(unsigned int irq); 84extern void irq_mark_irq(unsigned int irq);
85static inline void irq_lock_sparse(void) { }
86static inline void irq_unlock_sparse(void) { }
83#endif 87#endif
84 88
85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 89extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a1782f88f0af..99793b9b6d23 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -132,6 +132,16 @@ static void free_masks(struct irq_desc *desc)
132static inline void free_masks(struct irq_desc *desc) { } 132static inline void free_masks(struct irq_desc *desc) { }
133#endif 133#endif
134 134
135void irq_lock_sparse(void)
136{
137 mutex_lock(&sparse_irq_lock);
138}
139
140void irq_unlock_sparse(void)
141{
142 mutex_unlock(&sparse_irq_lock);
143}
144
135static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) 145static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
136{ 146{
137 struct irq_desc *desc; 147 struct irq_desc *desc;
@@ -168,6 +178,12 @@ static void free_desc(unsigned int irq)
168 178
169 unregister_irq_proc(irq, desc); 179 unregister_irq_proc(irq, desc);
170 180
181 /*
182 * sparse_irq_lock protects also show_interrupts() and
183 * kstat_irq_usr(). Once we deleted the descriptor from the
184 * sparse tree we can free it. Access in proc will fail to
185 * lookup the descriptor.
186 */
171 mutex_lock(&sparse_irq_lock); 187 mutex_lock(&sparse_irq_lock);
172 delete_irq_desc(irq); 188 delete_irq_desc(irq);
173 mutex_unlock(&sparse_irq_lock); 189 mutex_unlock(&sparse_irq_lock);
@@ -574,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq)
574 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 590 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
575} 591}
576 592
593/**
594 * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
595 * @irq: The interrupt number
596 * @cpu: The cpu number
597 *
598 * Returns the sum of interrupt counts on @cpu since boot for
599 * @irq. The caller must ensure that the interrupt is not removed
600 * concurrently.
601 */
577unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 602unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
578{ 603{
579 struct irq_desc *desc = irq_to_desc(irq); 604 struct irq_desc *desc = irq_to_desc(irq);
@@ -582,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
582 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 607 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
583} 608}
584 609
610/**
611 * kstat_irqs - Get the statistics for an interrupt
612 * @irq: The interrupt number
613 *
614 * Returns the sum of interrupt counts on all cpus since boot for
615 * @irq. The caller must ensure that the interrupt is not removed
616 * concurrently.
617 */
585unsigned int kstat_irqs(unsigned int irq) 618unsigned int kstat_irqs(unsigned int irq)
586{ 619{
587 struct irq_desc *desc = irq_to_desc(irq); 620 struct irq_desc *desc = irq_to_desc(irq);
@@ -594,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq)
594 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 627 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
595 return sum; 628 return sum;
596} 629}
630
631/**
632 * kstat_irqs_usr - Get the statistics for an interrupt
633 * @irq: The interrupt number
634 *
635 * Returns the sum of interrupt counts on all cpus since boot for
636 * @irq. Contrary to kstat_irqs() this can be called from any
637 * preemptible context. It's protected against concurrent removal of
638 * an interrupt descriptor when sparse irqs are enabled.
639 */
640unsigned int kstat_irqs_usr(unsigned int irq)
641{
642 int sum;
643
644 irq_lock_sparse();
645 sum = kstat_irqs(irq);
646 irq_unlock_sparse();
647 return sum;
648}
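Note: kstat_irqs_usr() above wraps kstat_irqs() in irq_lock_sparse()/irq_unlock_sparse() so that preemptible contexts can sum an interrupt's counts without racing a concurrent free_desc(). A sketch of a reader using it; show_interrupts() in kernel/irq/proc.c is the in-tree user, the seq_file callback below is only illustrative.

        #include <linux/kernel_stat.h>
        #include <linux/seq_file.h>

        static int my_irq_count_show(struct seq_file *m, void *v)
        {
                unsigned int irq = *(unsigned int *)m->private;

                /* the sum is taken under sparse_irq_lock inside kstat_irqs_usr() */
                seq_printf(m, "irq %u: %u\n", irq, kstat_irqs_usr(irq));
                return 0;
        }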
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ac1ba2f11032..9dc9bfd8a678 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -15,6 +15,23 @@
15 15
16#include "internals.h" 16#include "internals.h"
17 17
18/*
19 * Access rules:
20 *
21 * procfs protects read/write of /proc/irq/N/ files against a
22 * concurrent free of the interrupt descriptor. remove_proc_entry()
23 * immediately prevents new read/writes to happen and waits for
24 * already running read/write functions to complete.
25 *
26 * We remove the proc entries first and then delete the interrupt
27 * descriptor from the radix tree and free it. So it is guaranteed
28 * that irq_to_desc(N) is valid as long as the read/writes are
29 * permitted by procfs.
30 *
31 * The read from /proc/interrupts is a different problem because there
32 * is no protection. So the lookup and the access to irqdesc
33 * information must be protected by sparse_irq_lock.
34 */
18static struct proc_dir_entry *root_irq_dir; 35static struct proc_dir_entry *root_irq_dir;
19 36
20#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
@@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v)
437 seq_putc(p, '\n'); 454 seq_putc(p, '\n');
438 } 455 }
439 456
457 irq_lock_sparse();
440 desc = irq_to_desc(i); 458 desc = irq_to_desc(i);
441 if (!desc) 459 if (!desc)
442 return 0; 460 goto outsparse;
443 461
444 raw_spin_lock_irqsave(&desc->lock, flags); 462 raw_spin_lock_irqsave(&desc->lock, flags);
445 for_each_online_cpu(j) 463 for_each_online_cpu(j)
@@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v)
479 seq_putc(p, '\n'); 497 seq_putc(p, '\n');
480out: 498out:
481 raw_spin_unlock_irqrestore(&desc->lock, flags); 499 raw_spin_unlock_irqrestore(&desc->lock, flags);
500outsparse:
501 irq_unlock_sparse();
482 return 0; 502 return 0;
483} 503}
484#endif 504#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 3ab9048483fa..cbf9fb899d92 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
175 175
176void irq_work_tick(void) 176void irq_work_tick(void)
177{ 177{
178 struct llist_head *raised = &__get_cpu_var(raised_list); 178 struct llist_head *raised = this_cpu_ptr(&raised_list);
179 179
180 if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) 180 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
181 irq_work_run_list(raised); 181 irq_work_run_list(raised);
182 irq_work_run_list(&__get_cpu_var(lazy_list)); 182 irq_work_run_list(this_cpu_ptr(&lazy_list));
183} 183}
184 184
185/* 185/*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2abf9f6e9a61..9a8a01abbaed 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
600 if (!kexec_on_panic) { 600 if (!kexec_on_panic) {
601 image->swap_page = kimage_alloc_control_pages(image, 0); 601 image->swap_page = kimage_alloc_control_pages(image, 0);
602 if (!image->swap_page) { 602 if (!image->swap_page) {
603 pr_err(KERN_ERR "Could not allocate swap buffer\n"); 603 pr_err("Could not allocate swap buffer\n");
604 goto out_free_control_pages; 604 goto out_free_control_pages;
605 } 605 }
606 } 606 }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 80f7a6d00519..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
47 47
48static struct workqueue_struct *khelper_wq; 48static struct workqueue_struct *khelper_wq;
49 49
50/*
51 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
52 * locking to protect this global - it is private to the singleton khelper
53 * thread and should only ever be modified by that thread.
54 */
55static const struct task_struct *kmod_thread_locker;
56
57#define CAP_BSET (void *)1 50#define CAP_BSET (void *)1
58#define CAP_PI (void *)2 51#define CAP_PI (void *)2
59 52
@@ -223,7 +216,6 @@ static void umh_complete(struct subprocess_info *sub_info)
223static int ____call_usermodehelper(void *data) 216static int ____call_usermodehelper(void *data)
224{ 217{
225 struct subprocess_info *sub_info = data; 218 struct subprocess_info *sub_info = data;
226 int wait = sub_info->wait & ~UMH_KILLABLE;
227 struct cred *new; 219 struct cred *new;
228 int retval; 220 int retval;
229 221
@@ -267,20 +259,13 @@ static int ____call_usermodehelper(void *data)
267out: 259out:
268 sub_info->retval = retval; 260 sub_info->retval = retval;
269 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ 261 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
270 if (wait != UMH_WAIT_PROC) 262 if (!(sub_info->wait & UMH_WAIT_PROC))
271 umh_complete(sub_info); 263 umh_complete(sub_info);
272 if (!retval) 264 if (!retval)
273 return 0; 265 return 0;
274 do_exit(0); 266 do_exit(0);
275} 267}
276 268
277static int call_helper(void *data)
278{
279 /* Worker thread started blocking khelper thread. */
280 kmod_thread_locker = current;
281 return ____call_usermodehelper(data);
282}
283
284/* Keventd can't block, but this (a child) can. */ 269/* Keventd can't block, but this (a child) can. */
285static int wait_for_helper(void *data) 270static int wait_for_helper(void *data)
286{ 271{
@@ -323,21 +308,14 @@ static void __call_usermodehelper(struct work_struct *work)
323{ 308{
324 struct subprocess_info *sub_info = 309 struct subprocess_info *sub_info =
325 container_of(work, struct subprocess_info, work); 310 container_of(work, struct subprocess_info, work);
326 int wait = sub_info->wait & ~UMH_KILLABLE;
327 pid_t pid; 311 pid_t pid;
328 312
329 /* CLONE_VFORK: wait until the usermode helper has execve'd 313 if (sub_info->wait & UMH_WAIT_PROC)
330 * successfully We need the data structures to stay around
331 * until that is done. */
332 if (wait == UMH_WAIT_PROC)
333 pid = kernel_thread(wait_for_helper, sub_info, 314 pid = kernel_thread(wait_for_helper, sub_info,
334 CLONE_FS | CLONE_FILES | SIGCHLD); 315 CLONE_FS | CLONE_FILES | SIGCHLD);
335 else { 316 else
336 pid = kernel_thread(call_helper, sub_info, 317 pid = kernel_thread(____call_usermodehelper, sub_info,
337 CLONE_VFORK | SIGCHLD); 318 SIGCHLD);
338 /* Worker thread stopped blocking khelper thread. */
339 kmod_thread_locker = NULL;
340 }
341 319
342 if (pid < 0) { 320 if (pid < 0) {
343 sub_info->retval = pid; 321 sub_info->retval = pid;
@@ -571,17 +549,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
571 goto out; 549 goto out;
572 } 550 }
573 /* 551 /*
574 * Worker thread must not wait for khelper thread at below
575 * wait_for_completion() if the thread was created with CLONE_VFORK
576 * flag, for khelper thread is already waiting for the thread at
577 * wait_for_completion() in do_fork().
578 */
579 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
580 retval = -EBUSY;
581 goto out;
582 }
583
584 /*
585 * Set the completion pointer only if there is a waiter. 552 * Set the completion pointer only if there is a waiter.
586 * This makes it possible to use umh_complete to free 553 * This makes it possible to use umh_complete to free
587 * the data structure in case of UMH_NO_WAIT. 554 * the data structure in case of UMH_NO_WAIT.
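Note: with the kmod.c hunks above, UMH_KILLABLE is treated as an OR-ed modifier, so __call_usermodehelper() tests the UMH_WAIT_PROC bit instead of comparing the masked value. A tiny sketch of that flag test (the helper name is invented):

        #include <linux/kmod.h>

        static bool needs_child_waiter(int wait)
        {
                /* true for UMH_WAIT_PROC and UMH_WAIT_PROC | UMH_KILLABLE */
                return wait & UMH_WAIT_PROC;
        }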
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3995f546d0f3..ee619929cf90 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -127,7 +127,7 @@ static void *alloc_insn_page(void)
127 127
128static void free_insn_page(void *page) 128static void free_insn_page(void *page)
129{ 129{
130 module_free(NULL, page); 130 module_memfree(page);
131} 131}
132 132
133struct kprobe_insn_cache kprobe_insn_slots = { 133struct kprobe_insn_cache kprobe_insn_slots = {
@@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
915#ifdef CONFIG_KPROBES_ON_FTRACE 915#ifdef CONFIG_KPROBES_ON_FTRACE
916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
917 .func = kprobe_ftrace_handler, 917 .func = kprobe_ftrace_handler,
918 .flags = FTRACE_OPS_FL_SAVE_REGS, 918 .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
919}; 919};
920static int kprobe_ftrace_enabled; 920static int kprobe_ftrace_enabled;
921 921
@@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1410 return ret; 1410 return ret;
1411} 1411}
1412 1412
1413static int check_kprobe_address_safe(struct kprobe *p, 1413int __weak arch_check_ftrace_location(struct kprobe *p)
1414 struct module **probed_mod)
1415{ 1414{
1416 int ret = 0;
1417 unsigned long ftrace_addr; 1415 unsigned long ftrace_addr;
1418 1416
1419 /*
1420 * If the address is located on a ftrace nop, set the
1421 * breakpoint to the following instruction.
1422 */
1423 ftrace_addr = ftrace_location((unsigned long)p->addr); 1417 ftrace_addr = ftrace_location((unsigned long)p->addr);
1424 if (ftrace_addr) { 1418 if (ftrace_addr) {
1425#ifdef CONFIG_KPROBES_ON_FTRACE 1419#ifdef CONFIG_KPROBES_ON_FTRACE
@@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
1431 return -EINVAL; 1425 return -EINVAL;
1432#endif 1426#endif
1433 } 1427 }
1428 return 0;
1429}
1434 1430
1431static int check_kprobe_address_safe(struct kprobe *p,
1432 struct module **probed_mod)
1433{
1434 int ret;
1435
1436 ret = arch_check_ftrace_location(p);
1437 if (ret)
1438 return ret;
1435 jump_label_lock(); 1439 jump_label_lock();
1436 preempt_disable(); 1440 preempt_disable();
1437 1441
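Note: the kprobes hunk above turns the ftrace-location check into a __weak hook, arch_check_ftrace_location(), so an architecture can veto or adjust probes that land on ftrace-patched sites. A sketch of an architecture override; the policy shown (rejecting such probes outright) is invented for illustration.

        #include <linux/ftrace.h>
        #include <linux/kprobes.h>

        int arch_check_ftrace_location(struct kprobe *p)
        {
                if (ftrace_location((unsigned long)p->addr))
                        return -EINVAL; /* this arch cannot probe ftrace sites */
                return 0;
        }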
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 5cf6731b98e9..3ef3736002d8 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock)
80 DEBUG_LOCKS_WARN_ON(lock->owner != current); 80 DEBUG_LOCKS_WARN_ON(lock->owner != current);
81 81
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
83 mutex_clear_owner(lock);
84 } 83 }
85 84
86 /* 85 /*
87 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug 86 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
88 * mutexes so that we can do it here after we've verified state. 87 * mutexes so that we can do it here after we've verified state.
89 */ 88 */
89 mutex_clear_owner(lock);
90 atomic_set(&lock->count, 1); 90 atomic_set(&lock->count, 1);
91} 91}
92 92
diff --git a/kernel/module.c b/kernel/module.c
index e52a8739361a..d856e96a3cce 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
42#include <linux/vermagic.h> 42#include <linux/vermagic.h>
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/sched.h> 44#include <linux/sched.h>
45#include <linux/stop_machine.h>
46#include <linux/device.h> 45#include <linux/device.h>
47#include <linux/string.h> 46#include <linux/string.h>
48#include <linux/mutex.h> 47#include <linux/mutex.h>
@@ -98,7 +97,7 @@
98 * 1) List of modules (also safely readable with preempt_disable), 97 * 1) List of modules (also safely readable with preempt_disable),
99 * 2) module_use links, 98 * 2) module_use links,
100 * 3) module_addr_min/module_addr_max. 99 * 3) module_addr_min/module_addr_max.
101 * (delete uses stop_machine/add uses RCU list operations). */ 100 * (delete and add uses RCU list operations). */
102DEFINE_MUTEX(module_mutex); 101DEFINE_MUTEX(module_mutex);
103EXPORT_SYMBOL_GPL(module_mutex); 102EXPORT_SYMBOL_GPL(module_mutex);
104static LIST_HEAD(modules); 103static LIST_HEAD(modules);
@@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
158 * Protected by module_mutex. */ 157 * Protected by module_mutex. */
159static unsigned long module_addr_min = -1UL, module_addr_max = 0; 158static unsigned long module_addr_min = -1UL, module_addr_max = 0;
160 159
161int register_module_notifier(struct notifier_block * nb) 160int register_module_notifier(struct notifier_block *nb)
162{ 161{
163 return blocking_notifier_chain_register(&module_notify_list, nb); 162 return blocking_notifier_chain_register(&module_notify_list, nb);
164} 163}
165EXPORT_SYMBOL(register_module_notifier); 164EXPORT_SYMBOL(register_module_notifier);
166 165
167int unregister_module_notifier(struct notifier_block * nb) 166int unregister_module_notifier(struct notifier_block *nb)
168{ 167{
169 return blocking_notifier_chain_unregister(&module_notify_list, nb); 168 return blocking_notifier_chain_unregister(&module_notify_list, nb);
170} 169}
@@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
628 627
629EXPORT_TRACEPOINT_SYMBOL(module_get); 628EXPORT_TRACEPOINT_SYMBOL(module_get);
630 629
630/* MODULE_REF_BASE is the base reference count by kmodule loader. */
631#define MODULE_REF_BASE 1
632
631/* Init the unload section of the module. */ 633/* Init the unload section of the module. */
632static int module_unload_init(struct module *mod) 634static int module_unload_init(struct module *mod)
633{ 635{
634 mod->refptr = alloc_percpu(struct module_ref); 636 /*
635 if (!mod->refptr) 637 * Initialize reference counter to MODULE_REF_BASE.
636 return -ENOMEM; 638 * refcnt == 0 means module is going.
639 */
640 atomic_set(&mod->refcnt, MODULE_REF_BASE);
637 641
638 INIT_LIST_HEAD(&mod->source_list); 642 INIT_LIST_HEAD(&mod->source_list);
639 INIT_LIST_HEAD(&mod->target_list); 643 INIT_LIST_HEAD(&mod->target_list);
640 644
641 /* Hold reference count during initialization. */ 645 /* Hold reference count during initialization. */
642 raw_cpu_write(mod->refptr->incs, 1); 646 atomic_inc(&mod->refcnt);
643 647
644 return 0; 648 return 0;
645} 649}
@@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod)
721 kfree(use); 725 kfree(use);
722 } 726 }
723 mutex_unlock(&module_mutex); 727 mutex_unlock(&module_mutex);
724
725 free_percpu(mod->refptr);
726} 728}
727 729
728#ifdef CONFIG_MODULE_FORCE_UNLOAD 730#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -740,60 +742,48 @@ static inline int try_force_unload(unsigned int flags)
740} 742}
741#endif /* CONFIG_MODULE_FORCE_UNLOAD */ 743#endif /* CONFIG_MODULE_FORCE_UNLOAD */
742 744
743struct stopref 745/* Try to release refcount of module, 0 means success. */
746static int try_release_module_ref(struct module *mod)
744{ 747{
745 struct module *mod; 748 int ret;
746 int flags;
747 int *forced;
748};
749 749
750/* Whole machine is stopped with interrupts off when this runs. */ 750 /* Try to decrement refcnt which we set at loading */
751static int __try_stop_module(void *_sref) 751 ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
752{ 752 BUG_ON(ret < 0);
753 struct stopref *sref = _sref; 753 if (ret)
754 /* Someone can put this right now, recover with checking */
755 ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
754 756
757 return ret;
758}
759
760static int try_stop_module(struct module *mod, int flags, int *forced)
761{
755 /* If it's not unused, quit unless we're forcing. */ 762 /* If it's not unused, quit unless we're forcing. */
756 if (module_refcount(sref->mod) != 0) { 763 if (try_release_module_ref(mod) != 0) {
757 if (!(*sref->forced = try_force_unload(sref->flags))) 764 *forced = try_force_unload(flags);
765 if (!(*forced))
758 return -EWOULDBLOCK; 766 return -EWOULDBLOCK;
759 } 767 }
760 768
761 /* Mark it as dying. */ 769 /* Mark it as dying. */
762 sref->mod->state = MODULE_STATE_GOING; 770 mod->state = MODULE_STATE_GOING;
763 return 0;
764}
765
766static int try_stop_module(struct module *mod, int flags, int *forced)
767{
768 struct stopref sref = { mod, flags, forced };
769 771
770 return stop_machine(__try_stop_module, &sref, NULL); 772 return 0;
771} 773}
772 774
773unsigned long module_refcount(struct module *mod) 775/**
776 * module_refcount - return the refcount or -1 if unloading
777 *
778 * @mod: the module we're checking
779 *
780 * Returns:
781 * -1 if the module is in the process of unloading
782 * otherwise the number of references in the kernel to the module
783 */
784int module_refcount(struct module *mod)
774{ 785{
775 unsigned long incs = 0, decs = 0; 786 return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
776 int cpu;
777
778 for_each_possible_cpu(cpu)
779 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
780 /*
781 * ensure the incs are added up after the decs.
782 * module_put ensures incs are visible before decs with smp_wmb.
783 *
784 * This 2-count scheme avoids the situation where the refcount
785 * for CPU0 is read, then CPU0 increments the module refcount,
786 * then CPU1 drops that refcount, then the refcount for CPU1 is
787 * read. We would record a decrement but not its corresponding
788 * increment so we would see a low count (disaster).
789 *
790 * Rare situation? But module_refcount can be preempted, and we
791 * might be tallying up 4096+ CPUs. So it is not impossible.
792 */
793 smp_rmb();
794 for_each_possible_cpu(cpu)
795 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
796 return incs - decs;
797} 787}
798EXPORT_SYMBOL(module_refcount); 788EXPORT_SYMBOL(module_refcount);
799 789
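Note: the module.c hunks above replace the per-cpu incs/decs counters with a single atomic_t biased by MODULE_REF_BASE: try_module_get() uses atomic_inc_not_zero(), module_put() uses atomic_dec_if_positive(), and unload drops the base reference and restores it if users remain. A toy model of that biased-refcount lifecycle, not tied to the module loader (all names below are invented):

        #include <linux/atomic.h>
        #include <linux/bug.h>

        #define REF_BASE 1

        static atomic_t refcnt = ATOMIC_INIT(REF_BASE);

        static bool obj_get(void)
        {
                /* fails once the owner has dropped the base reference */
                return atomic_inc_not_zero(&refcnt) != 0;
        }

        static void obj_put(void)
        {
                WARN_ON(atomic_dec_if_positive(&refcnt) < 0);
        }

        static bool obj_try_release(void)
        {
                /* drop the base ref; if users remain, put it back and fail */
                if (atomic_sub_return(REF_BASE, &refcnt) == 0)
                        return true;
                atomic_add_unless(&refcnt, REF_BASE, 0);
                return false;
        }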
@@ -875,10 +865,12 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
875 struct module_use *use; 865 struct module_use *use;
876 int printed_something = 0; 866 int printed_something = 0;
877 867
878 seq_printf(m, " %lu ", module_refcount(mod)); 868 seq_printf(m, " %i ", module_refcount(mod));
879 869
880 /* Always include a trailing , so userspace can differentiate 870 /*
881 between this and the old multi-field proc format. */ 871 * Always include a trailing , so userspace can differentiate
872 * between this and the old multi-field proc format.
873 */
882 list_for_each_entry(use, &mod->source_list, source_list) { 874 list_for_each_entry(use, &mod->source_list, source_list) {
883 printed_something = 1; 875 printed_something = 1;
884 seq_printf(m, "%s,", use->source->name); 876 seq_printf(m, "%s,", use->source->name);
@@ -886,11 +878,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
886 878
887 if (mod->init != NULL && mod->exit == NULL) { 879 if (mod->init != NULL && mod->exit == NULL) {
888 printed_something = 1; 880 printed_something = 1;
889 seq_printf(m, "[permanent],"); 881 seq_puts(m, "[permanent],");
890 } 882 }
891 883
892 if (!printed_something) 884 if (!printed_something)
893 seq_printf(m, "-"); 885 seq_puts(m, "-");
894} 886}
895 887
896void __symbol_put(const char *symbol) 888void __symbol_put(const char *symbol)
@@ -925,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
925static ssize_t show_refcnt(struct module_attribute *mattr, 917static ssize_t show_refcnt(struct module_attribute *mattr,
926 struct module_kobject *mk, char *buffer) 918 struct module_kobject *mk, char *buffer)
927{ 919{
928 return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); 920 return sprintf(buffer, "%i\n", module_refcount(mk->mod));
929} 921}
930 922
931static struct module_attribute modinfo_refcnt = 923static struct module_attribute modinfo_refcnt =
@@ -935,7 +927,7 @@ void __module_get(struct module *module)
935{ 927{
936 if (module) { 928 if (module) {
937 preempt_disable(); 929 preempt_disable();
938 __this_cpu_inc(module->refptr->incs); 930 atomic_inc(&module->refcnt);
939 trace_module_get(module, _RET_IP_); 931 trace_module_get(module, _RET_IP_);
940 preempt_enable(); 932 preempt_enable();
941 } 933 }
@@ -948,11 +940,11 @@ bool try_module_get(struct module *module)
948 940
949 if (module) { 941 if (module) {
950 preempt_disable(); 942 preempt_disable();
951 943 /* Note: here, we can fail to get a reference */
952 if (likely(module_is_live(module))) { 944 if (likely(module_is_live(module) &&
953 __this_cpu_inc(module->refptr->incs); 945 atomic_inc_not_zero(&module->refcnt) != 0))
954 trace_module_get(module, _RET_IP_); 946 trace_module_get(module, _RET_IP_);
955 } else 947 else
956 ret = false; 948 ret = false;
957 949
958 preempt_enable(); 950 preempt_enable();
@@ -963,11 +955,12 @@ EXPORT_SYMBOL(try_module_get);
963 955
964void module_put(struct module *module) 956void module_put(struct module *module)
965{ 957{
958 int ret;
959
966 if (module) { 960 if (module) {
967 preempt_disable(); 961 preempt_disable();
968 smp_wmb(); /* see comment in module_refcount */ 962 ret = atomic_dec_if_positive(&module->refcnt);
969 __this_cpu_inc(module->refptr->decs); 963 WARN_ON(ret < 0); /* Failed to put refcount */
970
971 trace_module_put(module, _RET_IP_); 964 trace_module_put(module, _RET_IP_);
972 preempt_enable(); 965 preempt_enable();
973 } 966 }
@@ -978,7 +971,7 @@ EXPORT_SYMBOL(module_put);
978static inline void print_unload_info(struct seq_file *m, struct module *mod) 971static inline void print_unload_info(struct seq_file *m, struct module *mod)
979{ 972{
980 /* We don't know the usage count, or what modules are using. */ 973 /* We don't know the usage count, or what modules are using. */
981 seq_printf(m, " - -"); 974 seq_puts(m, " - -");
982} 975}
983 976
984static inline void module_unload_free(struct module *mod) 977static inline void module_unload_free(struct module *mod)
@@ -1131,7 +1124,7 @@ static unsigned long maybe_relocated(unsigned long crc,
1131static int check_version(Elf_Shdr *sechdrs, 1124static int check_version(Elf_Shdr *sechdrs,
1132 unsigned int versindex, 1125 unsigned int versindex,
1133 const char *symname, 1126 const char *symname,
1134 struct module *mod, 1127 struct module *mod,
1135 const unsigned long *crc, 1128 const unsigned long *crc,
1136 const struct module *crc_owner) 1129 const struct module *crc_owner)
1137{ 1130{
@@ -1165,7 +1158,7 @@ static int check_version(Elf_Shdr *sechdrs,
1165 return 0; 1158 return 0;
1166 1159
1167bad_version: 1160bad_version:
1168 printk("%s: disagrees about version of symbol %s\n", 1161 pr_warn("%s: disagrees about version of symbol %s\n",
1169 mod->name, symname); 1162 mod->name, symname);
1170 return 0; 1163 return 0;
1171} 1164}
@@ -1200,7 +1193,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1200static inline int check_version(Elf_Shdr *sechdrs, 1193static inline int check_version(Elf_Shdr *sechdrs,
1201 unsigned int versindex, 1194 unsigned int versindex,
1202 const char *symname, 1195 const char *symname,
1203 struct module *mod, 1196 struct module *mod,
1204 const unsigned long *crc, 1197 const unsigned long *crc,
1205 const struct module *crc_owner) 1198 const struct module *crc_owner)
1206{ 1199{
@@ -1288,15 +1281,13 @@ static inline bool sect_empty(const Elf_Shdr *sect)
1288 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1281 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1289} 1282}
1290 1283
1291struct module_sect_attr 1284struct module_sect_attr {
1292{
1293 struct module_attribute mattr; 1285 struct module_attribute mattr;
1294 char *name; 1286 char *name;
1295 unsigned long address; 1287 unsigned long address;
1296}; 1288};
1297 1289
1298struct module_sect_attrs 1290struct module_sect_attrs {
1299{
1300 struct attribute_group grp; 1291 struct attribute_group grp;
1301 unsigned int nsections; 1292 unsigned int nsections;
1302 struct module_sect_attr attrs[0]; 1293 struct module_sect_attr attrs[0];
@@ -1550,7 +1541,8 @@ static int module_add_modinfo_attrs(struct module *mod)
1550 (attr->test && attr->test(mod))) { 1541 (attr->test && attr->test(mod))) {
1551 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1542 memcpy(temp_attr, attr, sizeof(*temp_attr));
1552 sysfs_attr_init(&temp_attr->attr); 1543 sysfs_attr_init(&temp_attr->attr);
1553 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1544 error = sysfs_create_file(&mod->mkobj.kobj,
1545 &temp_attr->attr);
1554 ++temp_attr; 1546 ++temp_attr;
1555 } 1547 }
1556 } 1548 }
@@ -1566,7 +1558,7 @@ static void module_remove_modinfo_attrs(struct module *mod)
1566 /* pick a field to test for end of list */ 1558 /* pick a field to test for end of list */
1567 if (!attr->attr.name) 1559 if (!attr->attr.name)
1568 break; 1560 break;
1569 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); 1561 sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
1570 if (attr->free) 1562 if (attr->free)
1571 attr->free(mod); 1563 attr->free(mod);
1572 } 1564 }
@@ -1697,18 +1689,6 @@ static void mod_sysfs_teardown(struct module *mod)
1697 mod_sysfs_fini(mod); 1689 mod_sysfs_fini(mod);
1698} 1690}
1699 1691
1700/*
1701 * unlink the module with the whole machine is stopped with interrupts off
1702 * - this defends against kallsyms not taking locks
1703 */
1704static int __unlink_module(void *_mod)
1705{
1706 struct module *mod = _mod;
1707 list_del(&mod->list);
1708 module_bug_cleanup(mod);
1709 return 0;
1710}
1711
1712#ifdef CONFIG_DEBUG_SET_MODULE_RONX 1692#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1713/* 1693/*
1714 * LKM RO/NX protection: protect module's text/ro-data 1694 * LKM RO/NX protection: protect module's text/ro-data
@@ -1824,7 +1804,7 @@ static void unset_module_core_ro_nx(struct module *mod) { }
1824static void unset_module_init_ro_nx(struct module *mod) { } 1804static void unset_module_init_ro_nx(struct module *mod) { }
1825#endif 1805#endif
1826 1806
1827void __weak module_free(struct module *mod, void *module_region) 1807void __weak module_memfree(void *module_region)
1828{ 1808{
1829 vfree(module_region); 1809 vfree(module_region);
1830} 1810}
@@ -1833,6 +1813,10 @@ void __weak module_arch_cleanup(struct module *mod)
1833{ 1813{
1834} 1814}
1835 1815
1816void __weak module_arch_freeing_init(struct module *mod)
1817{
1818}
1819
1836/* Free a module, remove from lists, etc. */ 1820/* Free a module, remove from lists, etc. */
1837static void free_module(struct module *mod) 1821static void free_module(struct module *mod)
1838{ 1822{
@@ -1860,12 +1844,18 @@ static void free_module(struct module *mod)
1860 1844
1861 /* Now we can delete it from the lists */ 1845 /* Now we can delete it from the lists */
1862 mutex_lock(&module_mutex); 1846 mutex_lock(&module_mutex);
1863 stop_machine(__unlink_module, mod, NULL); 1847 /* Unlink carefully: kallsyms could be walking list. */
1848 list_del_rcu(&mod->list);
1849 /* Remove this module from bug list, this uses list_del_rcu */
1850 module_bug_cleanup(mod);
1851 /* Wait for RCU synchronizing before releasing mod->list and buglist. */
1852 synchronize_rcu();
1864 mutex_unlock(&module_mutex); 1853 mutex_unlock(&module_mutex);
1865 1854
1866 /* This may be NULL, but that's OK */ 1855 /* This may be NULL, but that's OK */
1867 unset_module_init_ro_nx(mod); 1856 unset_module_init_ro_nx(mod);
1868 module_free(mod, mod->module_init); 1857 module_arch_freeing_init(mod);
1858 module_memfree(mod->module_init);
1869 kfree(mod->args); 1859 kfree(mod->args);
1870 percpu_modfree(mod); 1860 percpu_modfree(mod);
1871 1861
@@ -1874,7 +1864,7 @@ static void free_module(struct module *mod)
1874 1864
1875 /* Finally, free the core (containing the module structure) */ 1865 /* Finally, free the core (containing the module structure) */
1876 unset_module_core_ro_nx(mod); 1866 unset_module_core_ro_nx(mod);
1877 module_free(mod, mod->module_core); 1867 module_memfree(mod->module_core);
1878 1868
1879#ifdef CONFIG_MPU 1869#ifdef CONFIG_MPU
1880 update_protections(current->mm); 1870 update_protections(current->mm);
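
The hunk above drops stop_machine() in favour of list_del_rcu() plus synchronize_rcu(): unpublish the module from the list first, wait out a grace period so kallsyms walkers that already saw it can finish, only then free. A crude, single-threaded userspace sketch of that unlink-then-wait-then-free ordering; the reader counter is a stand-in for a real grace-period mechanism, not how RCU is implemented:

/* Sketch only: illustrates the ordering, not real RCU. */
#include <stdatomic.h>
#include <stdlib.h>

struct node {
        struct node *next;
        int payload;
};

static struct node *_Atomic head;
static atomic_int readers;

static int reader_sum(void)
{
        int sum = 0;

        atomic_fetch_add(&readers, 1);          /* "rcu_read_lock()" */
        for (struct node *n = atomic_load(&head); n; n = n->next)
                sum += n->payload;
        atomic_fetch_sub(&readers, 1);          /* "rcu_read_unlock()" */
        return sum;
}

static void remove_and_free_head(void)
{
        struct node *victim = atomic_load(&head);

        if (!victim)
                return;
        /* 1. Unpublish: new readers can no longer reach the node. */
        atomic_store(&head, victim->next);
        /* 2. Wait for pre-existing readers ("synchronize_rcu"). */
        while (atomic_load(&readers) != 0)
                ;
        /* 3. Only now is the memory safe to free. */
        free(victim);
}

int main(void)
{
        struct node *n = malloc(sizeof(*n));

        n->payload = 42;
        n->next = NULL;
        atomic_store(&head, n);
        (void)reader_sum();
        remove_and_free_head();
        return 0;
}
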
@@ -1955,7 +1945,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1955 /* We compiled with -fno-common. These are not 1945 /* We compiled with -fno-common. These are not
1956 supposed to happen. */ 1946 supposed to happen. */
1957 pr_debug("Common symbol: %s\n", name); 1947 pr_debug("Common symbol: %s\n", name);
1958 printk("%s: please compile with -fno-common\n", 1948 pr_warn("%s: please compile with -fno-common\n",
1959 mod->name); 1949 mod->name);
1960 ret = -ENOEXEC; 1950 ret = -ENOEXEC;
1961 break; 1951 break;
@@ -2259,7 +2249,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
2259} 2249}
2260 2250
2261static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, 2251static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2262 unsigned int shnum) 2252 unsigned int shnum)
2263{ 2253{
2264 const Elf_Shdr *sec; 2254 const Elf_Shdr *sec;
2265 2255
@@ -2735,7 +2725,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
2735 * This shouldn't happen with same compiler and binutils 2725 * This shouldn't happen with same compiler and binutils
2736 * building all parts of the module. 2726 * building all parts of the module.
2737 */ 2727 */
2738 printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", 2728 pr_warn("%s: has both .ctors and .init_array.\n",
2739 mod->name); 2729 mod->name);
2740 return -EINVAL; 2730 return -EINVAL;
2741 } 2731 }
@@ -2809,7 +2799,7 @@ static int move_module(struct module *mod, struct load_info *info)
2809 */ 2799 */
2810 kmemleak_ignore(ptr); 2800 kmemleak_ignore(ptr);
2811 if (!ptr) { 2801 if (!ptr) {
2812 module_free(mod, mod->module_core); 2802 module_memfree(mod->module_core);
2813 return -ENOMEM; 2803 return -ENOMEM;
2814 } 2804 }
2815 memset(ptr, 0, mod->init_size); 2805 memset(ptr, 0, mod->init_size);
@@ -2954,8 +2944,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2954static void module_deallocate(struct module *mod, struct load_info *info) 2944static void module_deallocate(struct module *mod, struct load_info *info)
2955{ 2945{
2956 percpu_modfree(mod); 2946 percpu_modfree(mod);
2957 module_free(mod, mod->module_init); 2947 module_arch_freeing_init(mod);
2958 module_free(mod, mod->module_core); 2948 module_memfree(mod->module_init);
2949 module_memfree(mod->module_core);
2959} 2950}
2960 2951
2961int __weak module_finalize(const Elf_Ehdr *hdr, 2952int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3007,10 +2998,31 @@ static void do_mod_ctors(struct module *mod)
3007#endif 2998#endif
3008} 2999}
3009 3000
3001/* For freeing module_init on success, in case kallsyms traversing */
3002struct mod_initfree {
3003 struct rcu_head rcu;
3004 void *module_init;
3005};
3006
3007static void do_free_init(struct rcu_head *head)
3008{
3009 struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
3010 module_memfree(m->module_init);
3011 kfree(m);
3012}
3013
3010/* This is where the real work happens */ 3014/* This is where the real work happens */
3011static int do_init_module(struct module *mod) 3015static int do_init_module(struct module *mod)
3012{ 3016{
3013 int ret = 0; 3017 int ret = 0;
3018 struct mod_initfree *freeinit;
3019
3020 freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
3021 if (!freeinit) {
3022 ret = -ENOMEM;
3023 goto fail;
3024 }
3025 freeinit->module_init = mod->module_init;
3014 3026
3015 /* 3027 /*
3016 * We want to find out whether @mod uses async during init. Clear 3028 * We want to find out whether @mod uses async during init. Clear
@@ -3023,16 +3035,7 @@ static int do_init_module(struct module *mod)
3023 if (mod->init != NULL) 3035 if (mod->init != NULL)
3024 ret = do_one_initcall(mod->init); 3036 ret = do_one_initcall(mod->init);
3025 if (ret < 0) { 3037 if (ret < 0) {
3026 /* Init routine failed: abort. Try to protect us from 3038 goto fail_free_freeinit;
3027 buggy refcounters. */
3028 mod->state = MODULE_STATE_GOING;
3029 synchronize_sched();
3030 module_put(mod);
3031 blocking_notifier_call_chain(&module_notify_list,
3032 MODULE_STATE_GOING, mod);
3033 free_module(mod);
3034 wake_up_all(&module_wq);
3035 return ret;
3036 } 3039 }
3037 if (ret > 0) { 3040 if (ret > 0) {
3038 pr_warn("%s: '%s'->init suspiciously returned %d, it should " 3041 pr_warn("%s: '%s'->init suspiciously returned %d, it should "
@@ -3077,15 +3080,35 @@ static int do_init_module(struct module *mod)
3077 mod->strtab = mod->core_strtab; 3080 mod->strtab = mod->core_strtab;
3078#endif 3081#endif
3079 unset_module_init_ro_nx(mod); 3082 unset_module_init_ro_nx(mod);
3080 module_free(mod, mod->module_init); 3083 module_arch_freeing_init(mod);
3081 mod->module_init = NULL; 3084 mod->module_init = NULL;
3082 mod->init_size = 0; 3085 mod->init_size = 0;
3083 mod->init_ro_size = 0; 3086 mod->init_ro_size = 0;
3084 mod->init_text_size = 0; 3087 mod->init_text_size = 0;
3088 /*
3089 * We want to free module_init, but be aware that kallsyms may be
3090 * walking this with preempt disabled. In all the failure paths,
3091 * we call synchronize_rcu/synchronize_sched, but we don't want
3092 * to slow down the success path, so use actual RCU here.
3093 */
3094 call_rcu(&freeinit->rcu, do_free_init);
3085 mutex_unlock(&module_mutex); 3095 mutex_unlock(&module_mutex);
3086 wake_up_all(&module_wq); 3096 wake_up_all(&module_wq);
3087 3097
3088 return 0; 3098 return 0;
3099
3100fail_free_freeinit:
3101 kfree(freeinit);
3102fail:
3103 /* Try to protect us from buggy refcounters. */
3104 mod->state = MODULE_STATE_GOING;
3105 synchronize_sched();
3106 module_put(mod);
3107 blocking_notifier_call_chain(&module_notify_list,
3108 MODULE_STATE_GOING, mod);
3109 free_module(mod);
3110 wake_up_all(&module_wq);
3111 return ret;
3089} 3112}
3090 3113
3091static int may_init_module(void) 3114static int may_init_module(void)
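
On the success path above, the init region is handed to call_rcu() via a small wrapper, and do_free_init() recovers that wrapper from the embedded rcu_head with container_of(). A small userspace illustration of that recover-the-enclosing-struct pattern; container_of, callback_head and initfree here are simplified stand-ins, not the kernel definitions:

/* Sketch only: the container_of() idiom used by do_free_init(). */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct callback_head {
        void (*func)(struct callback_head *head);
};

struct initfree {
        struct callback_head head;      /* embedded, like struct rcu_head */
        void *init_region;
};

static void free_init_cb(struct callback_head *head)
{
        struct initfree *m = container_of(head, struct initfree, head);

        printf("freeing init region at %p\n", m->init_region);
}

int main(void)
{
        struct initfree m = {
                .head = { .func = free_init_cb },
                .init_region = (void *)0x1000,
        };

        m.head.func(&m.head);   /* stands in for the call_rcu() callback */
        return 0;
}
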
@@ -3202,7 +3225,7 @@ out:
3202 3225
3203static int unknown_module_param_cb(char *param, char *val, const char *modname) 3226static int unknown_module_param_cb(char *param, char *val, const char *modname)
3204{ 3227{
3205 /* Check for magic 'dyndbg' arg */ 3228 /* Check for magic 'dyndbg' arg */
3206 int ret = ddebug_dyndbg_module_param_cb(param, val, modname); 3229 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3207 if (ret != 0) 3230 if (ret != 0)
3208 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); 3231 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
@@ -3352,6 +3375,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
3352 /* Unlink carefully: kallsyms could be walking list. */ 3375 /* Unlink carefully: kallsyms could be walking list. */
3353 list_del_rcu(&mod->list); 3376 list_del_rcu(&mod->list);
3354 wake_up_all(&module_wq); 3377 wake_up_all(&module_wq);
3378 /* Wait for RCU synchronizing before releasing mod->list. */
3379 synchronize_rcu();
3355 mutex_unlock(&module_mutex); 3380 mutex_unlock(&module_mutex);
3356 free_module: 3381 free_module:
3357 module_deallocate(mod, info); 3382 module_deallocate(mod, info);
@@ -3685,8 +3710,8 @@ static int m_show(struct seq_file *m, void *p)
3685 3710
3686 /* Informative for users. */ 3711 /* Informative for users. */
3687 seq_printf(m, " %s", 3712 seq_printf(m, " %s",
3688 mod->state == MODULE_STATE_GOING ? "Unloading": 3713 mod->state == MODULE_STATE_GOING ? "Unloading" :
3689 mod->state == MODULE_STATE_COMING ? "Loading": 3714 mod->state == MODULE_STATE_COMING ? "Loading" :
3690 "Live"); 3715 "Live");
3691 /* Used by oprofile and other similar tools. */ 3716 /* Used by oprofile and other similar tools. */
3692 seq_printf(m, " 0x%pK", mod->module_core); 3717 seq_printf(m, " 0x%pK", mod->module_core);
@@ -3695,7 +3720,7 @@ static int m_show(struct seq_file *m, void *p)
3695 if (mod->taints) 3720 if (mod->taints)
3696 seq_printf(m, " %s", module_flags(mod, buf)); 3721 seq_printf(m, " %s", module_flags(mod, buf));
3697 3722
3698 seq_printf(m, "\n"); 3723 seq_puts(m, "\n");
3699 return 0; 3724 return 0;
3700} 3725}
3701 3726
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ef42d0ab3115..49746c81ad8d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p)
220 220
221SYSCALL_DEFINE2(setns, int, fd, int, nstype) 221SYSCALL_DEFINE2(setns, int, fd, int, nstype)
222{ 222{
223 const struct proc_ns_operations *ops;
224 struct task_struct *tsk = current; 223 struct task_struct *tsk = current;
225 struct nsproxy *new_nsproxy; 224 struct nsproxy *new_nsproxy;
226 struct proc_ns *ei;
227 struct file *file; 225 struct file *file;
226 struct ns_common *ns;
228 int err; 227 int err;
229 228
230 file = proc_ns_fget(fd); 229 file = proc_ns_fget(fd);
@@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
232 return PTR_ERR(file); 231 return PTR_ERR(file);
233 232
234 err = -EINVAL; 233 err = -EINVAL;
235 ei = get_proc_ns(file_inode(file)); 234 ns = get_proc_ns(file_inode(file));
236 ops = ei->ns_ops; 235 if (nstype && (ns->ops->type != nstype))
237 if (nstype && (ops->type != nstype))
238 goto out; 236 goto out;
239 237
240 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); 238 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
@@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
243 goto out; 241 goto out;
244 } 242 }
245 243
246 err = ops->install(new_nsproxy, ei->ns); 244 err = ns->ops->install(new_nsproxy, ns);
247 if (err) { 245 if (err) {
248 free_nsproxy(new_nsproxy); 246 free_nsproxy(new_nsproxy);
249 goto out; 247 goto out;
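
The nsproxy.c hunks rework the kernel side of setns(2) around struct ns_common; nothing changes for callers. For reference, a minimal userspace sketch of the syscall this path serves, joining another task's pid namespace; the target path /proc/1/ns/pid is an example and the call needs privilege:

/* Sketch only: userspace view of setns(2). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/1/ns/pid", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* CLONE_NEWPID makes the kernel verify the fd really refers to
         * a pid namespace; passing 0 accepts any namespace type. */
        if (setns(fd, CLONE_NEWPID) < 0) {
                perror("setns");
                close(fd);
                return 1;
        }
        close(fd);
        /* Children forked from here start in the joined namespace. */
        return 0;
}
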
diff --git a/kernel/panic.c b/kernel/panic.c
index cf80672b7924..4d8d6f906dec 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers; 35static bool crash_kexec_post_notifiers;
36int panic_on_warn __read_mostly;
36 37
37int panic_timeout = CONFIG_PANIC_TIMEOUT; 38int panic_timeout = CONFIG_PANIC_TIMEOUT;
38EXPORT_SYMBOL_GPL(panic_timeout); 39EXPORT_SYMBOL_GPL(panic_timeout);
@@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
428 if (args) 429 if (args)
429 vprintk(args->fmt, args->args); 430 vprintk(args->fmt, args->args);
430 431
432 if (panic_on_warn) {
433 /*
434 * This thread may hit another WARN() in the panic path.
435 * Resetting this prevents additional WARN() from panicking the
436 * system on this thread. Other threads are blocked by the
437 * panic_mutex in panic().
438 */
439 panic_on_warn = 0;
440 panic("panic_on_warn set ...\n");
441 }
442
431 print_modules(); 443 print_modules();
432 dump_stack(); 444 dump_stack();
433 print_oops_end_marker(); 445 print_oops_end_marker();
@@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
485 497
486core_param(panic, panic_timeout, int, 0644); 498core_param(panic, panic_timeout, int, 0644);
487core_param(pause_on_oops, pause_on_oops, int, 0644); 499core_param(pause_on_oops, pause_on_oops, int, 0644);
500core_param(panic_on_warn, panic_on_warn, int, 0644);
488 501
489static int __init setup_crash_kexec_post_notifiers(char *s) 502static int __init setup_crash_kexec_post_notifiers(char *s)
490{ 503{
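
panic_on_warn is exposed as a core_param, and warn_slowpath_common() clears it before calling panic() so a WARN hit on the panic path cannot re-enter. The same disarm-before-escalating idiom, reduced to a plain C sketch with made-up names:

/* Sketch only: one-shot flag cleared before taking the fatal action. */
#include <stdio.h>
#include <stdlib.h>

static int fatal_on_warn = 1;

static void warn(const char *msg)
{
        fprintf(stderr, "WARNING: %s\n", msg);
        if (fatal_on_warn) {
                fatal_on_warn = 0;      /* disarm before escalating */
                warn("nested warning on the fatal path");  /* harmless now */
                abort();
        }
}

int main(void)
{
        warn("first warning");          /* escalates exactly once */
        return 0;
}
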
diff --git a/kernel/params.c b/kernel/params.c
index db97b791390f..728e05b167de 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -603,74 +603,70 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
603 const struct kernel_param *kp, 603 const struct kernel_param *kp,
604 const char *name) 604 const char *name)
605{ 605{
606 struct module_param_attrs *new; 606 struct module_param_attrs *new_mp;
607 struct attribute **attrs; 607 struct attribute **new_attrs;
608 int err, num; 608 unsigned int i;
609 609
610 /* We don't bother calling this with invisible parameters. */ 610 /* We don't bother calling this with invisible parameters. */
611 BUG_ON(!kp->perm); 611 BUG_ON(!kp->perm);
612 612
613 if (!mk->mp) { 613 if (!mk->mp) {
614 num = 0; 614 /* First allocation. */
615 attrs = NULL; 615 mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL);
616 } else { 616 if (!mk->mp)
617 num = mk->mp->num; 617 return -ENOMEM;
618 attrs = mk->mp->grp.attrs; 618 mk->mp->grp.name = "parameters";
619 /* NULL-terminated attribute array. */
620 mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]),
621 GFP_KERNEL);
622 /* Caller will cleanup via free_module_param_attrs */
623 if (!mk->mp->grp.attrs)
624 return -ENOMEM;
619 } 625 }
620 626
621 /* Enlarge. */ 627 /* Enlarge allocations. */
622 new = krealloc(mk->mp, 628 new_mp = krealloc(mk->mp,
623 sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), 629 sizeof(*mk->mp) +
624 GFP_KERNEL); 630 sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
625 if (!new) { 631 GFP_KERNEL);
626 kfree(attrs); 632 if (!new_mp)
627 err = -ENOMEM; 633 return -ENOMEM;
628 goto fail; 634 mk->mp = new_mp;
629 }
630 /* Despite looking like the typical realloc() bug, this is safe.
631 * We *want* the old 'attrs' to be freed either way, and we'll store
632 * the new one in the success case. */
633 attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
634 if (!attrs) {
635 err = -ENOMEM;
636 goto fail_free_new;
637 }
638 635
639 /* Sysfs wants everything zeroed. */ 636 /* Extra pointer for NULL terminator */
640 memset(new, 0, sizeof(*new)); 637 new_attrs = krealloc(mk->mp->grp.attrs,
641 memset(&new->attrs[num], 0, sizeof(new->attrs[num])); 638 sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
642 memset(&attrs[num], 0, sizeof(attrs[num])); 639 GFP_KERNEL);
643 new->grp.name = "parameters"; 640 if (!new_attrs)
644 new->grp.attrs = attrs; 641 return -ENOMEM;
642 mk->mp->grp.attrs = new_attrs;
645 643
646 /* Tack new one on the end. */ 644 /* Tack new one on the end. */
647 sysfs_attr_init(&new->attrs[num].mattr.attr); 645 memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
648 new->attrs[num].param = kp; 646 sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
649 new->attrs[num].mattr.show = param_attr_show; 647 mk->mp->attrs[mk->mp->num].param = kp;
650 new->attrs[num].mattr.store = param_attr_store; 648 mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
651 new->attrs[num].mattr.attr.name = (char *)name; 649 /* Do not allow runtime DAC changes to make param writable. */
652 new->attrs[num].mattr.attr.mode = kp->perm; 650 if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
653 new->num = num+1; 651 mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
652 else
653 mk->mp->attrs[mk->mp->num].mattr.store = NULL;
654 mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
655 mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
656 mk->mp->num++;
654 657
655 /* Fix up all the pointers, since krealloc can move us */ 658 /* Fix up all the pointers, since krealloc can move us */
656 for (num = 0; num < new->num; num++) 659 for (i = 0; i < mk->mp->num; i++)
657 new->grp.attrs[num] = &new->attrs[num].mattr.attr; 660 mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr;
658 new->grp.attrs[num] = NULL; 661 mk->mp->grp.attrs[mk->mp->num] = NULL;
659
660 mk->mp = new;
661 return 0; 662 return 0;
662
663fail_free_new:
664 kfree(new);
665fail:
666 mk->mp = NULL;
667 return err;
668} 663}
669 664
670#ifdef CONFIG_MODULES 665#ifdef CONFIG_MODULES
671static void free_module_param_attrs(struct module_kobject *mk) 666static void free_module_param_attrs(struct module_kobject *mk)
672{ 667{
673 kfree(mk->mp->grp.attrs); 668 if (mk->mp)
669 kfree(mk->mp->grp.attrs);
674 kfree(mk->mp); 670 kfree(mk->mp);
675 mk->mp = NULL; 671 mk->mp = NULL;
676} 672}
@@ -695,8 +691,10 @@ int module_param_sysfs_setup(struct module *mod,
695 if (kparam[i].perm == 0) 691 if (kparam[i].perm == 0)
696 continue; 692 continue;
697 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); 693 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
698 if (err) 694 if (err) {
695 free_module_param_attrs(&mod->mkobj);
699 return err; 696 return err;
697 }
700 params = true; 698 params = true;
701 } 699 }
702 700
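
The rewritten add_sysfs_param() commits each successful (re)allocation straight back into mk->mp, so a later failure leaves a consistent, partially grown structure that free_module_param_attrs() can tear down, and the attribute pointer array always keeps one spare slot for the NULL terminator sysfs expects. A small userspace sketch of that grow-and-stay-consistent pattern; strvec and its helpers are invented for the example:

/* Sketch only: NULL-terminated array grown with realloc(). */
#include <stdlib.h>
#include <string.h>

struct strvec {
        char **items;           /* NULL-terminated, like grp.attrs */
        size_t num;
};

static int strvec_push(struct strvec *v, const char *s)
{
        char **items;
        char *copy;

        /* One extra slot for the NULL terminator. */
        items = realloc(v->items, (v->num + 2) * sizeof(*items));
        if (!items)
                return -1;
        v->items = items;       /* commit before the next step */

        copy = strdup(s);
        if (!copy)
                return -1;      /* teardown still sees a valid state */

        v->items[v->num++] = copy;
        v->items[v->num] = NULL;
        return 0;
}

static void strvec_free(struct strvec *v)
{
        for (size_t i = 0; i < v->num; i++)
                free(v->items[i]);
        free(v->items);
        v->items = NULL;
        v->num = 0;
}

int main(void)
{
        struct strvec v = { NULL, 0 };

        if (strvec_push(&v, "parameters") == 0)
                strvec_push(&v, "attrs");
        strvec_free(&v);
        return 0;
}
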
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..cd36a5e0d173 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = {
79 .level = 0, 79 .level = 0,
80 .child_reaper = &init_task, 80 .child_reaper = &init_task,
81 .user_ns = &init_user_ns, 81 .user_ns = &init_user_ns,
82 .proc_inum = PROC_PID_INIT_INO, 82 .ns.inum = PROC_PID_INIT_INO,
83#ifdef CONFIG_PID_NS
84 .ns.ops = &pidns_operations,
85#endif
83}; 86};
84EXPORT_SYMBOL_GPL(init_pid_ns); 87EXPORT_SYMBOL_GPL(init_pid_ns);
85 88
@@ -341,6 +344,8 @@ out:
341 344
342out_unlock: 345out_unlock:
343 spin_unlock_irq(&pidmap_lock); 346 spin_unlock_irq(&pidmap_lock);
347 put_pid_ns(ns);
348
344out_free: 349out_free:
345 while (++i <= ns->level) 350 while (++i <= ns->level)
346 free_pidmap(pid->numbers + i); 351 free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..a65ba137fd15 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
105 if (ns->pid_cachep == NULL) 105 if (ns->pid_cachep == NULL)
106 goto out_free_map; 106 goto out_free_map;
107 107
108 err = proc_alloc_inum(&ns->proc_inum); 108 err = ns_alloc_inum(&ns->ns);
109 if (err) 109 if (err)
110 goto out_free_map; 110 goto out_free_map;
111 ns->ns.ops = &pidns_operations;
111 112
112 kref_init(&ns->kref); 113 kref_init(&ns->kref);
113 ns->level = level; 114 ns->level = level;
@@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
142{ 143{
143 int i; 144 int i;
144 145
145 proc_free_inum(ns->proc_inum); 146 ns_free_inum(&ns->ns);
146 for (i = 0; i < PIDMAP_ENTRIES; i++) 147 for (i = 0; i < PIDMAP_ENTRIES; i++)
147 kfree(ns->pidmap[i].page); 148 kfree(ns->pidmap[i].page);
148 put_user_ns(ns->user_ns); 149 put_user_ns(ns->user_ns);
@@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
190 /* Don't allow any more processes into the pid namespace */ 191 /* Don't allow any more processes into the pid namespace */
191 disable_pid_allocation(pid_ns); 192 disable_pid_allocation(pid_ns);
192 193
193 /* Ignore SIGCHLD causing any terminated children to autoreap */ 194 /*
195 * Ignore SIGCHLD causing any terminated children to autoreap.
196 * This speeds up the namespace shutdown, plus see the comment
197 * below.
198 */
194 spin_lock_irq(&me->sighand->siglock); 199 spin_lock_irq(&me->sighand->siglock);
195 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; 200 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
196 spin_unlock_irq(&me->sighand->siglock); 201 spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
223 } 228 }
224 read_unlock(&tasklist_lock); 229 read_unlock(&tasklist_lock);
225 230
226 /* Firstly reap the EXIT_ZOMBIE children we may have. */ 231 /*
232 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
233 * sys_wait4() will also block until our children traced from the
234 * parent namespace are detached and become EXIT_DEAD.
235 */
227 do { 236 do {
228 clear_thread_flag(TIF_SIGPENDING); 237 clear_thread_flag(TIF_SIGPENDING);
229 rc = sys_wait4(-1, NULL, __WALL, NULL); 238 rc = sys_wait4(-1, NULL, __WALL, NULL);
230 } while (rc != -ECHILD); 239 } while (rc != -ECHILD);
231 240
232 /* 241 /*
233 * sys_wait4() above can't reap the TASK_DEAD children. 242 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
234 * Make sure they all go away, see free_pid(). 243 * really care, we could reparent them to the global init. We could
244 * exit and reap ->child_reaper even if it is not the last thread in
245 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
246 * pid_ns can not go away until proc_kill_sb() drops the reference.
247 *
248 * But this ns can also have other tasks injected by setns()+fork().
249 * Again, ignoring the user visible semantics we do not really need
250 * to wait until they are all reaped, but they can be reparented to
251 * us and thus we need to ensure that pid->child_reaper stays valid
252 * until they all go away. See free_pid()->wake_up_process().
253 *
254 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
255 * if reparented.
235 */ 256 */
236 for (;;) { 257 for (;;) {
237 set_current_state(TASK_UNINTERRUPTIBLE); 258 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
313 return 0; 334 return 0;
314} 335}
315 336
316static void *pidns_get(struct task_struct *task) 337static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
338{
339 return container_of(ns, struct pid_namespace, ns);
340}
341
342static struct ns_common *pidns_get(struct task_struct *task)
317{ 343{
318 struct pid_namespace *ns; 344 struct pid_namespace *ns;
319 345
@@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task)
323 get_pid_ns(ns); 349 get_pid_ns(ns);
324 rcu_read_unlock(); 350 rcu_read_unlock();
325 351
326 return ns; 352 return ns ? &ns->ns : NULL;
327} 353}
328 354
329static void pidns_put(void *ns) 355static void pidns_put(struct ns_common *ns)
330{ 356{
331 put_pid_ns(ns); 357 put_pid_ns(to_pid_ns(ns));
332} 358}
333 359
334static int pidns_install(struct nsproxy *nsproxy, void *ns) 360static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
335{ 361{
336 struct pid_namespace *active = task_active_pid_ns(current); 362 struct pid_namespace *active = task_active_pid_ns(current);
337 struct pid_namespace *ancestor, *new = ns; 363 struct pid_namespace *ancestor, *new = to_pid_ns(ns);
338 364
339 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || 365 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
340 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 366 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
362 return 0; 388 return 0;
363} 389}
364 390
365static unsigned int pidns_inum(void *ns)
366{
367 struct pid_namespace *pid_ns = ns;
368 return pid_ns->proc_inum;
369}
370
371const struct proc_ns_operations pidns_operations = { 391const struct proc_ns_operations pidns_operations = {
372 .name = "pid", 392 .name = "pid",
373 .type = CLONE_NEWPID, 393 .type = CLONE_NEWPID,
374 .get = pidns_get, 394 .get = pidns_get,
375 .put = pidns_put, 395 .put = pidns_put,
376 .install = pidns_install, 396 .install = pidns_install,
377 .inum = pidns_inum,
378}; 397};
379 398
380static __init int pid_namespaces_init(void) 399static __init int pid_namespaces_init(void)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index bbef57f5bdfd..48b28d387c7f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,7 @@ config PM_STD_PARTITION
94config PM_SLEEP 94config PM_SLEEP
95 def_bool y 95 def_bool y
96 depends on SUSPEND || HIBERNATE_CALLBACKS 96 depends on SUSPEND || HIBERNATE_CALLBACKS
97 select PM
97 98
98config PM_SLEEP_SMP 99config PM_SLEEP_SMP
99 def_bool y 100 def_bool y
@@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC
129 depends on PM_WAKELOCKS 130 depends on PM_WAKELOCKS
130 default y 131 default y
131 132
132config PM_RUNTIME 133config PM
133 bool "Run-time PM core functionality" 134 bool "Device power management core functionality"
134 depends on !IA64_HP_SIM
135 ---help--- 135 ---help---
136 Enable functionality allowing I/O devices to be put into energy-saving 136 Enable functionality allowing I/O devices to be put into energy-saving
137 (low power) states at run time (or autosuspended) after a specified 137 (low power) states, for example after a specified period of inactivity
138 period of inactivity and woken up in response to a hardware-generated 138 (autosuspended), and woken up in response to a hardware-generated
139 wake-up event or a driver's request. 139 wake-up event or a driver's request.
140 140
141 Hardware support is generally required for this functionality to work 141 Hardware support is generally required for this functionality to work
142 and the bus type drivers of the buses the devices are on are 142 and the bus type drivers of the buses the devices are on are
143 responsible for the actual handling of the autosuspend requests and 143 responsible for the actual handling of device suspend requests and
144 wake-up events. 144 wake-up events.
145 145
146config PM
147 def_bool y
148 depends on PM_SLEEP || PM_RUNTIME
149
150config PM_DEBUG 146config PM_DEBUG
151 bool "Power Management Debug Support" 147 bool "Power Management Debug Support"
152 depends on PM 148 depends on PM
@@ -298,14 +294,9 @@ config PM_GENERIC_DOMAINS_SLEEP
298 def_bool y 294 def_bool y
299 depends on PM_SLEEP && PM_GENERIC_DOMAINS 295 depends on PM_SLEEP && PM_GENERIC_DOMAINS
300 296
301config PM_GENERIC_DOMAINS_RUNTIME
302 def_bool y
303 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
304
305config PM_GENERIC_DOMAINS_OF 297config PM_GENERIC_DOMAINS_OF
306 def_bool y 298 def_bool y
307 depends on PM_GENERIC_DOMAINS && OF 299 depends on PM_GENERIC_DOMAINS && OF
308 300
309config CPU_PM 301config CPU_PM
310 bool 302 bool
311 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f35a3478f3c..2329daae5255 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
28#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/genhd.h> 30#include <linux/genhd.h>
31#include <linux/ktime.h>
31#include <trace/events/power.h> 32#include <trace/events/power.h>
32 33
33#include "power.h" 34#include "power.h"
@@ -232,20 +233,17 @@ static void platform_recover(int platform_mode)
232 * @nr_pages: Number of memory pages processed between @start and @stop. 233 * @nr_pages: Number of memory pages processed between @start and @stop.
233 * @msg: Additional diagnostic message to print. 234 * @msg: Additional diagnostic message to print.
234 */ 235 */
235void swsusp_show_speed(struct timeval *start, struct timeval *stop, 236void swsusp_show_speed(ktime_t start, ktime_t stop,
236 unsigned nr_pages, char *msg) 237 unsigned nr_pages, char *msg)
237{ 238{
239 ktime_t diff;
238 u64 elapsed_centisecs64; 240 u64 elapsed_centisecs64;
239 unsigned int centisecs; 241 unsigned int centisecs;
240 unsigned int k; 242 unsigned int k;
241 unsigned int kps; 243 unsigned int kps;
242 244
243 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 245 diff = ktime_sub(stop, start);
244 /* 246 elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
245 * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
246 * it is obvious enough for what went wrong.
247 */
248 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
249 centisecs = elapsed_centisecs64; 247 centisecs = elapsed_centisecs64;
250 if (centisecs == 0) 248 if (centisecs == 0)
251 centisecs = 1; /* avoid div-by-zero */ 249 centisecs = 1; /* avoid div-by-zero */
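
swsusp_show_speed() now takes ktime_t values and gets elapsed centiseconds from ktime_sub()/ktime_divns() instead of timeval arithmetic. A userspace equivalent of that arithmetic using monotonic timestamps; the page count and 4K page size are example values:

/* Sketch only: centiseconds and throughput from two timestamps. */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC    1000000000LL
#define NSEC_PER_MSEC   1000000LL

static int64_t to_ns(const struct timespec *ts)
{
        return (int64_t)ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
}

int main(void)
{
        struct timespec start, stop;
        unsigned long nr_pages = 25000; /* example page count */
        int64_t centisecs;
        unsigned long k, kps;

        clock_gettime(CLOCK_MONOTONIC, &start);
        /* ... work being timed would run here ... */
        clock_gettime(CLOCK_MONOTONIC, &stop);

        /* ktime_sub() + ktime_divns(diff, 10 * NSEC_PER_MSEC) */
        centisecs = (to_ns(&stop) - to_ns(&start)) / (10 * NSEC_PER_MSEC);
        if (centisecs == 0)
                centisecs = 1;          /* avoid div-by-zero */

        k = nr_pages * (4096 / 1024);   /* pages -> KiB, assuming 4K pages */
        kps = (k * 100) / centisecs;
        printf("%lu kbytes in %lld.%02lld seconds (%lu.%02lu MB/s)\n",
               k, (long long)(centisecs / 100), (long long)(centisecs % 100),
               kps / 1000, (kps % 1000) / 10);
        return 0;
}
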
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2df883a9d3cb..ce9b8328a689 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain);
174 174
175struct timeval; 175struct timeval;
176/* kernel/power/swsusp.c */ 176/* kernel/power/swsusp.c */
177extern void swsusp_show_speed(struct timeval *, struct timeval *, 177extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
178 unsigned int, char *);
179 178
180#ifdef CONFIG_SUSPEND 179#ifdef CONFIG_SUSPEND
181/* kernel/power/suspend.c */ 180/* kernel/power/suspend.c */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 791a61892bb5..0c40c16174b4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -28,6 +28,7 @@
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/ktime.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -1576,11 +1577,11 @@ int hibernate_preallocate_memory(void)
1576 struct zone *zone; 1577 struct zone *zone;
1577 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1578 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1578 unsigned long alloc, save_highmem, pages_highmem, avail_normal; 1579 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1579 struct timeval start, stop; 1580 ktime_t start, stop;
1580 int error; 1581 int error;
1581 1582
1582 printk(KERN_INFO "PM: Preallocating image memory... "); 1583 printk(KERN_INFO "PM: Preallocating image memory... ");
1583 do_gettimeofday(&start); 1584 start = ktime_get();
1584 1585
1585 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); 1586 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1586 if (error) 1587 if (error)
@@ -1709,9 +1710,9 @@ int hibernate_preallocate_memory(void)
1709 free_unnecessary_pages(); 1710 free_unnecessary_pages();
1710 1711
1711 out: 1712 out:
1712 do_gettimeofday(&stop); 1713 stop = ktime_get();
1713 printk(KERN_CONT "done (allocated %lu pages)\n", pages); 1714 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1714 swsusp_show_speed(&start, &stop, pages, "Allocated"); 1715 swsusp_show_speed(start, stop, pages, "Allocated");
1715 1716
1716 return 0; 1717 return 0;
1717 1718
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aaa3261dea5d..570aff817543 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,6 +30,7 @@
30#include <linux/atomic.h> 30#include <linux/atomic.h>
31#include <linux/kthread.h> 31#include <linux/kthread.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/ktime.h>
33 34
34#include "power.h" 35#include "power.h"
35 36
@@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle,
445 int nr_pages; 446 int nr_pages;
446 int err2; 447 int err2;
447 struct bio *bio; 448 struct bio *bio;
448 struct timeval start; 449 ktime_t start;
449 struct timeval stop; 450 ktime_t stop;
450 451
451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", 452 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
452 nr_to_write); 453 nr_to_write);
@@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle,
455 m = 1; 456 m = 1;
456 nr_pages = 0; 457 nr_pages = 0;
457 bio = NULL; 458 bio = NULL;
458 do_gettimeofday(&start); 459 start = ktime_get();
459 while (1) { 460 while (1) {
460 ret = snapshot_read_next(snapshot); 461 ret = snapshot_read_next(snapshot);
461 if (ret <= 0) 462 if (ret <= 0)
@@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle,
469 nr_pages++; 470 nr_pages++;
470 } 471 }
471 err2 = hib_wait_on_bio_chain(&bio); 472 err2 = hib_wait_on_bio_chain(&bio);
472 do_gettimeofday(&stop); 473 stop = ktime_get();
473 if (!ret) 474 if (!ret)
474 ret = err2; 475 ret = err2;
475 if (!ret) 476 if (!ret)
476 printk(KERN_INFO "PM: Image saving done.\n"); 477 printk(KERN_INFO "PM: Image saving done.\n");
477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 478 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
478 return ret; 479 return ret;
479} 480}
480 481
@@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
580 int nr_pages; 581 int nr_pages;
581 int err2; 582 int err2;
582 struct bio *bio; 583 struct bio *bio;
583 struct timeval start; 584 ktime_t start;
584 struct timeval stop; 585 ktime_t stop;
585 size_t off; 586 size_t off;
586 unsigned thr, run_threads, nr_threads; 587 unsigned thr, run_threads, nr_threads;
587 unsigned char *page = NULL; 588 unsigned char *page = NULL;
@@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
674 m = 1; 675 m = 1;
675 nr_pages = 0; 676 nr_pages = 0;
676 bio = NULL; 677 bio = NULL;
677 do_gettimeofday(&start); 678 start = ktime_get();
678 for (;;) { 679 for (;;) {
679 for (thr = 0; thr < nr_threads; thr++) { 680 for (thr = 0; thr < nr_threads; thr++) {
680 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 681 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
@@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
759 760
760out_finish: 761out_finish:
761 err2 = hib_wait_on_bio_chain(&bio); 762 err2 = hib_wait_on_bio_chain(&bio);
762 do_gettimeofday(&stop); 763 stop = ktime_get();
763 if (!ret) 764 if (!ret)
764 ret = err2; 765 ret = err2;
765 if (!ret) 766 if (!ret)
766 printk(KERN_INFO "PM: Image saving done.\n"); 767 printk(KERN_INFO "PM: Image saving done.\n");
767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 768 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
768out_clean: 769out_clean:
769 if (crc) { 770 if (crc) {
770 if (crc->thr) 771 if (crc->thr)
@@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle,
965{ 966{
966 unsigned int m; 967 unsigned int m;
967 int ret = 0; 968 int ret = 0;
968 struct timeval start; 969 ktime_t start;
969 struct timeval stop; 970 ktime_t stop;
970 struct bio *bio; 971 struct bio *bio;
971 int err2; 972 int err2;
972 unsigned nr_pages; 973 unsigned nr_pages;
@@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle,
978 m = 1; 979 m = 1;
979 nr_pages = 0; 980 nr_pages = 0;
980 bio = NULL; 981 bio = NULL;
981 do_gettimeofday(&start); 982 start = ktime_get();
982 for ( ; ; ) { 983 for ( ; ; ) {
983 ret = snapshot_write_next(snapshot); 984 ret = snapshot_write_next(snapshot);
984 if (ret <= 0) 985 if (ret <= 0)
@@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle,
996 nr_pages++; 997 nr_pages++;
997 } 998 }
998 err2 = hib_wait_on_bio_chain(&bio); 999 err2 = hib_wait_on_bio_chain(&bio);
999 do_gettimeofday(&stop); 1000 stop = ktime_get();
1000 if (!ret) 1001 if (!ret)
1001 ret = err2; 1002 ret = err2;
1002 if (!ret) { 1003 if (!ret) {
@@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle,
1005 if (!snapshot_image_loaded(snapshot)) 1006 if (!snapshot_image_loaded(snapshot))
1006 ret = -ENODATA; 1007 ret = -ENODATA;
1007 } 1008 }
1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1009 swsusp_show_speed(start, stop, nr_to_read, "Read");
1009 return ret; 1010 return ret;
1010} 1011}
1011 1012
@@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
1067 int ret = 0; 1068 int ret = 0;
1068 int eof = 0; 1069 int eof = 0;
1069 struct bio *bio; 1070 struct bio *bio;
1070 struct timeval start; 1071 ktime_t start;
1071 struct timeval stop; 1072 ktime_t stop;
1072 unsigned nr_pages; 1073 unsigned nr_pages;
1073 size_t off; 1074 size_t off;
1074 unsigned i, thr, run_threads, nr_threads; 1075 unsigned i, thr, run_threads, nr_threads;
@@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1190 m = 1; 1191 m = 1;
1191 nr_pages = 0; 1192 nr_pages = 0;
1192 bio = NULL; 1193 bio = NULL;
1193 do_gettimeofday(&start); 1194 start = ktime_get();
1194 1195
1195 ret = snapshot_write_next(snapshot); 1196 ret = snapshot_write_next(snapshot);
1196 if (ret <= 0) 1197 if (ret <= 0)
@@ -1343,7 +1344,7 @@ out_finish:
1343 wait_event(crc->done, atomic_read(&crc->stop)); 1344 wait_event(crc->done, atomic_read(&crc->stop));
1344 atomic_set(&crc->stop, 0); 1345 atomic_set(&crc->stop, 0);
1345 } 1346 }
1346 do_gettimeofday(&stop); 1347 stop = ktime_get();
1347 if (!ret) { 1348 if (!ret) {
1348 printk(KERN_INFO "PM: Image loading done.\n"); 1349 printk(KERN_INFO "PM: Image loading done.\n");
1349 snapshot_write_finalize(snapshot); 1350 snapshot_write_finalize(snapshot);
@@ -1359,7 +1360,7 @@ out_finish:
1359 } 1360 }
1360 } 1361 }
1361 } 1362 }
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1363 swsusp_show_speed(start, stop, nr_to_read, "Read");
1363out_clean: 1364out_clean:
1364 for (i = 0; i < ring_size; i++) 1365 for (i = 0; i < ring_size; i++)
1365 free_page((unsigned long)page[i]); 1366 free_page((unsigned long)page[i]);
@@ -1374,7 +1375,7 @@ out_clean:
1374 kthread_stop(data[thr].thr); 1375 kthread_stop(data[thr].thr);
1375 vfree(data); 1376 vfree(data);
1376 } 1377 }
1377 if (page) vfree(page); 1378 vfree(page);
1378 1379
1379 return ret; 1380 return ret;
1380} 1381}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ced2b84b1cb7..02d6b6d28796 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -62,9 +62,6 @@ int console_printk[4] = {
62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
63}; 63};
64 64
65/* Deferred messaged from sched code are marked by this special level */
66#define SCHED_MESSAGE_LOGLEVEL -2
67
68/* 65/*
69 * Low level drivers may need that to know if they can schedule in 66 * Low level drivers may need that to know if they can schedule in
70 * their unblank() callback or not. So let's export it. 67 * their unblank() callback or not. So let's export it.
@@ -480,7 +477,7 @@ static int syslog_action_restricted(int type)
480 type != SYSLOG_ACTION_SIZE_BUFFER; 477 type != SYSLOG_ACTION_SIZE_BUFFER;
481} 478}
482 479
483static int check_syslog_permissions(int type, bool from_file) 480int check_syslog_permissions(int type, bool from_file)
484{ 481{
485 /* 482 /*
486 * If this is from /proc/kmsg and we've already opened it, then we've 483 * If this is from /proc/kmsg and we've already opened it, then we've
@@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1259int do_syslog(int type, char __user *buf, int len, bool from_file) 1256int do_syslog(int type, char __user *buf, int len, bool from_file)
1260{ 1257{
1261 bool clear = false; 1258 bool clear = false;
1262 static int saved_console_loglevel = -1; 1259 static int saved_console_loglevel = LOGLEVEL_DEFAULT;
1263 int error; 1260 int error;
1264 1261
1265 error = check_syslog_permissions(type, from_file); 1262 error = check_syslog_permissions(type, from_file);
@@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1316 break; 1313 break;
1317 /* Disable logging to console */ 1314 /* Disable logging to console */
1318 case SYSLOG_ACTION_CONSOLE_OFF: 1315 case SYSLOG_ACTION_CONSOLE_OFF:
1319 if (saved_console_loglevel == -1) 1316 if (saved_console_loglevel == LOGLEVEL_DEFAULT)
1320 saved_console_loglevel = console_loglevel; 1317 saved_console_loglevel = console_loglevel;
1321 console_loglevel = minimum_console_loglevel; 1318 console_loglevel = minimum_console_loglevel;
1322 break; 1319 break;
1323 /* Enable logging to console */ 1320 /* Enable logging to console */
1324 case SYSLOG_ACTION_CONSOLE_ON: 1321 case SYSLOG_ACTION_CONSOLE_ON:
1325 if (saved_console_loglevel != -1) { 1322 if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
1326 console_loglevel = saved_console_loglevel; 1323 console_loglevel = saved_console_loglevel;
1327 saved_console_loglevel = -1; 1324 saved_console_loglevel = LOGLEVEL_DEFAULT;
1328 } 1325 }
1329 break; 1326 break;
1330 /* Set level of messages printed to console */ 1327 /* Set level of messages printed to console */
@@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1336 len = minimum_console_loglevel; 1333 len = minimum_console_loglevel;
1337 console_loglevel = len; 1334 console_loglevel = len;
1338 /* Implicitly re-enable logging to console */ 1335 /* Implicitly re-enable logging to console */
1339 saved_console_loglevel = -1; 1336 saved_console_loglevel = LOGLEVEL_DEFAULT;
1340 error = 0; 1337 error = 0;
1341 break; 1338 break;
1342 /* Number of chars in the log buffer */ 1339 /* Number of chars in the log buffer */
@@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
1627 int printed_len = 0; 1624 int printed_len = 0;
1628 bool in_sched = false; 1625 bool in_sched = false;
1629 /* cpu currently holding logbuf_lock in this function */ 1626 /* cpu currently holding logbuf_lock in this function */
1630 static volatile unsigned int logbuf_cpu = UINT_MAX; 1627 static unsigned int logbuf_cpu = UINT_MAX;
1631 1628
1632 if (level == SCHED_MESSAGE_LOGLEVEL) { 1629 if (level == LOGLEVEL_SCHED) {
1633 level = -1; 1630 level = LOGLEVEL_DEFAULT;
1634 in_sched = true; 1631 in_sched = true;
1635 } 1632 }
1636 1633
@@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
1695 const char *end_of_header = printk_skip_level(text); 1692 const char *end_of_header = printk_skip_level(text);
1696 switch (kern_level) { 1693 switch (kern_level) {
1697 case '0' ... '7': 1694 case '0' ... '7':
1698 if (level == -1) 1695 if (level == LOGLEVEL_DEFAULT)
1699 level = kern_level - '0'; 1696 level = kern_level - '0';
1697 /* fallthrough */
1700 case 'd': /* KERN_DEFAULT */ 1698 case 'd': /* KERN_DEFAULT */
1701 lflags |= LOG_PREFIX; 1699 lflags |= LOG_PREFIX;
1702 } 1700 }
@@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1710 } 1708 }
1711 } 1709 }
1712 1710
1713 if (level == -1) 1711 if (level == LOGLEVEL_DEFAULT)
1714 level = default_message_loglevel; 1712 level = default_message_loglevel;
1715 1713
1716 if (dict) 1714 if (dict)
@@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit);
1788 1786
1789asmlinkage int vprintk(const char *fmt, va_list args) 1787asmlinkage int vprintk(const char *fmt, va_list args)
1790{ 1788{
1791 return vprintk_emit(0, -1, NULL, 0, fmt, args); 1789 return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1792} 1790}
1793EXPORT_SYMBOL(vprintk); 1791EXPORT_SYMBOL(vprintk);
1794 1792
@@ -1807,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level,
1807} 1805}
1808EXPORT_SYMBOL(printk_emit); 1806EXPORT_SYMBOL(printk_emit);
1809 1807
1808int vprintk_default(const char *fmt, va_list args)
1809{
1810 int r;
1811
1812#ifdef CONFIG_KGDB_KDB
1813 if (unlikely(kdb_trap_printk)) {
1814 r = vkdb_printf(fmt, args);
1815 return r;
1816 }
1817#endif
1818 r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1819
1820 return r;
1821}
1822EXPORT_SYMBOL_GPL(vprintk_default);
1823
1824/*
1825 * This allows printk to be diverted to another function per cpu.
1826 * This is useful for calling printk functions from within NMI
1827 * without worrying about race conditions that can lock up the
1828 * box.
1829 */
1830DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
1831
1810/** 1832/**
1811 * printk - print a kernel message 1833 * printk - print a kernel message
1812 * @fmt: format string 1834 * @fmt: format string
@@ -1830,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit);
1830 */ 1852 */
1831asmlinkage __visible int printk(const char *fmt, ...) 1853asmlinkage __visible int printk(const char *fmt, ...)
1832{ 1854{
1855 printk_func_t vprintk_func;
1833 va_list args; 1856 va_list args;
1834 int r; 1857 int r;
1835 1858
1836#ifdef CONFIG_KGDB_KDB
1837 if (unlikely(kdb_trap_printk)) {
1838 va_start(args, fmt);
1839 r = vkdb_printf(fmt, args);
1840 va_end(args);
1841 return r;
1842 }
1843#endif
1844 va_start(args, fmt); 1859 va_start(args, fmt);
1845 r = vprintk_emit(0, -1, NULL, 0, fmt, args); 1860
1861 /*
1862 * If a caller overrides the per_cpu printk_func, then it needs
1863 * to disable preemption when calling printk(). Otherwise
1864 * the printk_func should be set to the default. No need to
1865 * disable preemption here.
1866 */
1867 vprintk_func = this_cpu_read(printk_func);
1868 r = vprintk_func(fmt, args);
1869
1846 va_end(args); 1870 va_end(args);
1847 1871
1848 return r; 1872 return r;
@@ -1876,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
1876 bool syslog, char *buf, size_t size) { return 0; } 1900 bool syslog, char *buf, size_t size) { return 0; }
1877static size_t cont_print_text(char *text, size_t size) { return 0; } 1901static size_t cont_print_text(char *text, size_t size) { return 0; }
1878 1902
1903/* Still needs to be defined for users */
1904DEFINE_PER_CPU(printk_func_t, printk_func);
1905
1879#endif /* CONFIG_PRINTK */ 1906#endif /* CONFIG_PRINTK */
1880 1907
1881#ifdef CONFIG_EARLY_PRINTK 1908#ifdef CONFIG_EARLY_PRINTK
1882struct console *early_console; 1909struct console *early_console;
1883 1910
1884void early_vprintk(const char *fmt, va_list ap)
1885{
1886 if (early_console) {
1887 char buf[512];
1888 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1889
1890 early_console->write(early_console, buf, n);
1891 }
1892}
1893
1894asmlinkage __visible void early_printk(const char *fmt, ...) 1911asmlinkage __visible void early_printk(const char *fmt, ...)
1895{ 1912{
1896 va_list ap; 1913 va_list ap;
1914 char buf[512];
1915 int n;
1916
1917 if (!early_console)
1918 return;
1897 1919
1898 va_start(ap, fmt); 1920 va_start(ap, fmt);
1899 early_vprintk(fmt, ap); 1921 n = vscnprintf(buf, sizeof(buf), fmt, ap);
1900 va_end(ap); 1922 va_end(ap);
1923
1924 early_console->write(early_console, buf, n);
1901} 1925}
1902#endif 1926#endif
1903 1927
@@ -2634,7 +2658,7 @@ int printk_deferred(const char *fmt, ...)
2634 2658
2635 preempt_disable(); 2659 preempt_disable();
2636 va_start(args, fmt); 2660 va_start(args, fmt);
2637 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); 2661 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
2638 va_end(args); 2662 va_end(args);
2639 2663
2640 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2664 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
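
printk() now dispatches through a per-CPU function pointer that defaults to vprintk_default(), so special contexts (kdb, NMI) can temporarily divert output. A rough userspace analogue using a thread-local function pointer that normally points at vprintf; this is the shape of the indirection only, not the kernel API:

/* Sketch only: dispatch through a swappable per-context pointer. */
#include <stdarg.h>
#include <stdio.h>

typedef int (*printk_func_t)(const char *fmt, va_list args);

static int vprint_default(const char *fmt, va_list args)
{
        return vprintf(fmt, args);
}

static _Thread_local printk_func_t printk_func = vprint_default;

static int print(const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = printk_func(fmt, args);     /* dispatch via the pointer */
        va_end(args);
        return r;
}

static int vprint_stderr(const char *fmt, va_list args)
{
        return vfprintf(stderr, fmt, args);
}

int main(void)
{
        print("goes to the default sink\n");
        printk_func = vprint_stderr;    /* divert for a special context */
        print("goes to the diverted sink\n");
        printk_func = vprint_default;
        return 0;
}
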
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54e75226c2c4..1eb9d90c3af9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
485 485
486/* 486/*
487 * Detach all tasks we were using ptrace on. Called with tasklist held 487 * Detach all tasks we were using ptrace on. Called with tasklist held
488 * for writing, and returns with it held too. But note it can release 488 * for writing.
489 * and reacquire the lock.
490 */ 489 */
491void exit_ptrace(struct task_struct *tracer) 490void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
492 __releases(&tasklist_lock)
493 __acquires(&tasklist_lock)
494{ 491{
495 struct task_struct *p, *n; 492 struct task_struct *p, *n;
496 LIST_HEAD(ptrace_dead);
497
498 if (likely(list_empty(&tracer->ptraced)))
499 return;
500 493
501 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 494 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
502 if (unlikely(p->ptrace & PT_EXITKILL)) 495 if (unlikely(p->ptrace & PT_EXITKILL))
503 send_sig_info(SIGKILL, SEND_SIG_FORCED, p); 496 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
504 497
505 if (__ptrace_detach(tracer, p)) 498 if (__ptrace_detach(tracer, p))
506 list_add(&p->ptrace_entry, &ptrace_dead); 499 list_add(&p->ptrace_entry, dead);
507 }
508
509 write_unlock_irq(&tasklist_lock);
510 BUG_ON(!list_empty(&tracer->ptraced));
511
512 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
513 list_del_init(&p->ptrace_entry);
514 release_task(p);
515 } 500 }
516
517 write_lock_irq(&tasklist_lock);
518} 501}
519 502
520int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 503int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
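
exit_ptrace() no longer drops and retakes tasklist_lock itself; it only moves detached tracees onto a caller-supplied list, and the caller does the release_task() work after unlocking. A simplified sketch of that collect-under-lock, release-after-unlock pattern with an invented tracee list:

/* Sketch only: gather under the lock, release outside it. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct tracee {
        struct tracee *next;
        int id;
};

static pthread_mutex_t tasklist_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tracee *traced;           /* protected by tasklist_lock */

static void detach_all(struct tracee **dead)
{
        /* Called with tasklist_lock held: list manipulation only. */
        while (traced) {
                struct tracee *p = traced;

                traced = p->next;
                p->next = *dead;
                *dead = p;
        }
}

int main(void)
{
        struct tracee *dead = NULL;

        for (int i = 0; i < 3; i++) {
                struct tracee *p = malloc(sizeof(*p));

                p->id = i;
                p->next = traced;
                traced = p;
        }

        pthread_mutex_lock(&tasklist_lock);
        detach_all(&dead);
        pthread_mutex_unlock(&tasklist_lock);

        /* The expensive release (release_task() in the kernel) runs
         * with the lock already dropped. */
        while (dead) {
                struct tracee *p = dead;

                dead = p->next;
                printf("releasing tracee %d\n", p->id);
                free(p);
        }
        return 0;
}
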
diff --git a/kernel/range.c b/kernel/range.c
index 322ea8e93e4b..82cfc285b046 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2)
113{ 113{
114 const struct range *r1 = x1; 114 const struct range *r1 = x1;
115 const struct range *r2 = x2; 115 const struct range *r2 = x2;
116 s64 start1, start2;
117 116
118 start1 = r1->start; 117 if (r1->start < r2->start)
119 start2 = r2->start; 118 return -1;
120 119 if (r1->start > r2->start)
121 return start1 - start2; 120 return 1;
121 return 0;
122} 122}
123 123
124int clean_sort_range(struct range *range, int az) 124int clean_sort_range(struct range *range, int az)
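
The old cmp_range() returned the s64 difference of the two starts as an int, so the result could be truncated and report the wrong order for starts more than 2^31 apart; the replacement does an explicit three-way compare. The same bug and fix in a standalone userspace comparator; the test values are chosen to trigger the truncation:

/* Sketch only: difference-as-int misorders distant 64-bit values. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int cmp_u64_broken(const void *x1, const void *x2)
{
        const uint64_t *a = x1, *b = x2;

        return (int)(*a - *b);          /* truncates: may flip the sign */
}

static int cmp_u64(const void *x1, const void *x2)
{
        const uint64_t *a = x1, *b = x2;

        if (*a < *b)
                return -1;
        if (*a > *b)
                return 1;
        return 0;
}

int main(void)
{
        uint64_t v[] = { 0x100000000ULL, 1 };   /* differ by 2^32 - 1 */

        printf("broken: %d, correct: %d\n",
               cmp_u64_broken(&v[0], &v[1]), cmp_u64(&v[0], &v[1]));
        qsort(v, 2, sizeof(v[0]), cmp_u64);
        printf("sorted: %llu %llu\n",
               (unsigned long long)v[0], (unsigned long long)v[1]);
        return 0;
}
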
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15#include <linux/mm.h>
16
17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{
19 spin_lock_init(&counter->lock);
20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent;
23}
24
25static u64 res_counter_uncharge_locked(struct res_counter *counter,
26 unsigned long val)
27{
28 if (WARN_ON(counter->usage < val))
29 val = counter->usage;
30
31 counter->usage -= val;
32 return counter->usage;
33}
34
35static int res_counter_charge_locked(struct res_counter *counter,
36 unsigned long val, bool force)
37{
38 int ret = 0;
39
40 if (counter->usage + val > counter->limit) {
41 counter->failcnt++;
42 ret = -ENOMEM;
43 if (!force)
44 return ret;
45 }
46
47 counter->usage += val;
48 if (counter->usage > counter->max_usage)
49 counter->max_usage = counter->usage;
50 return ret;
51}
52
53static int __res_counter_charge(struct res_counter *counter, unsigned long val,
54 struct res_counter **limit_fail_at, bool force)
55{
56 int ret, r;
57 unsigned long flags;
58 struct res_counter *c, *u;
59
60 r = ret = 0;
61 *limit_fail_at = NULL;
62 local_irq_save(flags);
63 for (c = counter; c != NULL; c = c->parent) {
64 spin_lock(&c->lock);
65 r = res_counter_charge_locked(c, val, force);
66 spin_unlock(&c->lock);
67 if (r < 0 && !ret) {
68 ret = r;
69 *limit_fail_at = c;
70 if (!force)
71 break;
72 }
73 }
74
75 if (ret < 0 && !force) {
76 for (u = counter; u != c; u = u->parent) {
77 spin_lock(&u->lock);
78 res_counter_uncharge_locked(u, val);
79 spin_unlock(&u->lock);
80 }
81 }
82 local_irq_restore(flags);
83
84 return ret;
85}
86
87int res_counter_charge(struct res_counter *counter, unsigned long val,
88 struct res_counter **limit_fail_at)
89{
90 return __res_counter_charge(counter, val, limit_fail_at, false);
91}
92
93int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
94 struct res_counter **limit_fail_at)
95{
96 return __res_counter_charge(counter, val, limit_fail_at, true);
97}
98
99u64 res_counter_uncharge_until(struct res_counter *counter,
100 struct res_counter *top,
101 unsigned long val)
102{
103 unsigned long flags;
104 struct res_counter *c;
105 u64 ret = 0;
106
107 local_irq_save(flags);
108 for (c = counter; c != top; c = c->parent) {
109 u64 r;
110 spin_lock(&c->lock);
111 r = res_counter_uncharge_locked(c, val);
112 if (c == counter)
113 ret = r;
114 spin_unlock(&c->lock);
115 }
116 local_irq_restore(flags);
117 return ret;
118}
119
120u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
121{
122 return res_counter_uncharge_until(counter, NULL, val);
123}
124
125static inline unsigned long long *
126res_counter_member(struct res_counter *counter, int member)
127{
128 switch (member) {
129 case RES_USAGE:
130 return &counter->usage;
131 case RES_MAX_USAGE:
132 return &counter->max_usage;
133 case RES_LIMIT:
134 return &counter->limit;
135 case RES_FAILCNT:
136 return &counter->failcnt;
137 case RES_SOFT_LIMIT:
138 return &counter->soft_limit;
139 };
140
141 BUG();
142 return NULL;
143}
144
145ssize_t res_counter_read(struct res_counter *counter, int member,
146 const char __user *userbuf, size_t nbytes, loff_t *pos,
147 int (*read_strategy)(unsigned long long val, char *st_buf))
148{
149 unsigned long long *val;
150 char buf[64], *s;
151
152 s = buf;
153 val = res_counter_member(counter, member);
154 if (read_strategy)
155 s += read_strategy(*val, s);
156 else
157 s += sprintf(s, "%llu\n", *val);
158 return simple_read_from_buffer((void __user *)userbuf, nbytes,
159 pos, buf, s - buf);
160}
161
162#if BITS_PER_LONG == 32
163u64 res_counter_read_u64(struct res_counter *counter, int member)
164{
165 unsigned long flags;
166 u64 ret;
167
168 spin_lock_irqsave(&counter->lock, flags);
169 ret = *res_counter_member(counter, member);
170 spin_unlock_irqrestore(&counter->lock, flags);
171
172 return ret;
173}
174#else
175u64 res_counter_read_u64(struct res_counter *counter, int member)
176{
177 return *res_counter_member(counter, member);
178}
179#endif
180
181int res_counter_memparse_write_strategy(const char *buf,
182 unsigned long long *resp)
183{
184 char *end;
185 unsigned long long res;
186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') {
189 int rc = kstrtoull(buf + 1, 10, &res);
190
191 if (rc)
192 return rc;
193 if (res != 1)
194 return -EINVAL;
195 *resp = RES_COUNTER_MAX;
196 return 0;
197 }
198
199 res = memparse(buf, &end);
200 if (*end != '\0')
201 return -EINVAL;
202
203 if (PAGE_ALIGN(res) >= res)
204 res = PAGE_ALIGN(res);
205 else
206 res = RES_COUNTER_MAX;
207
208 *resp = res;
209
210 return 0;
211}
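
The deleted res_counter code above implemented hierarchical accounting: a charge walks up the parent chain, and when an ancestor is over its limit the levels already charged are unwound. A stripped-down, single-threaded user-space sketch of that charge-with-rollback scheme — the spinlocks, the force/nofail variant and the kernel types are left out, and every name below is illustrative rather than a kernel API:

#include <stdint.h>
#include <stdio.h>

struct counter {
        uint64_t usage, limit, failcnt;
        struct counter *parent;
};

static int charge(struct counter *c, uint64_t val, struct counter **fail_at)
{
        struct counter *p, *u;

        *fail_at = NULL;
        for (p = c; p; p = p->parent) {
                if (p->usage + val > p->limit) {
                        p->failcnt++;
                        *fail_at = p;
                        /* roll back the levels already charged */
                        for (u = c; u != p; u = u->parent)
                                u->usage -= val;
                        return -1;
                }
                p->usage += val;
        }
        return 0;
}

int main(void)
{
        struct counter root  = { .limit = 100 };
        struct counter child = { .limit = 1000, .parent = &root };
        struct counter *fail = NULL;
        int ret;

        ret = charge(&child, 80, &fail);        /* fits at every level */
        printf("first charge:  %d\n", ret);

        ret = charge(&child, 80, &fail);        /* root would exceed 100 */
        printf("second charge: %d, failed at %s\n",
               ret, fail == &root ? "root" : "child");
        printf("child usage after rollback: %llu\n",
               (unsigned long long)child.usage);
        return 0;
}
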
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bb398c0c5f08..c0accc00566e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4527,8 +4527,10 @@ void sched_show_task(struct task_struct *p)
4527#ifdef CONFIG_DEBUG_STACK_USAGE 4527#ifdef CONFIG_DEBUG_STACK_USAGE
4528 free = stack_not_used(p); 4528 free = stack_not_used(p);
4529#endif 4529#endif
4530 ppid = 0;
4530 rcu_read_lock(); 4531 rcu_read_lock();
4531 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4532 if (pid_alive(p))
4533 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4532 rcu_read_unlock(); 4534 rcu_read_unlock();
4533 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4535 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4534 task_pid_nr(p), ppid, 4536 task_pid_nr(p), ppid,
@@ -7111,9 +7113,6 @@ void __init sched_init(void)
7111#ifdef CONFIG_RT_GROUP_SCHED 7113#ifdef CONFIG_RT_GROUP_SCHED
7112 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7114 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7113#endif 7115#endif
7114#ifdef CONFIG_CPUMASK_OFFSTACK
7115 alloc_size += num_possible_cpus() * cpumask_size();
7116#endif
7117 if (alloc_size) { 7116 if (alloc_size) {
7118 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7117 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7119 7118
@@ -7133,13 +7132,13 @@ void __init sched_init(void)
7133 ptr += nr_cpu_ids * sizeof(void **); 7132 ptr += nr_cpu_ids * sizeof(void **);
7134 7133
7135#endif /* CONFIG_RT_GROUP_SCHED */ 7134#endif /* CONFIG_RT_GROUP_SCHED */
7135 }
7136#ifdef CONFIG_CPUMASK_OFFSTACK 7136#ifdef CONFIG_CPUMASK_OFFSTACK
7137 for_each_possible_cpu(i) { 7137 for_each_possible_cpu(i) {
7138 per_cpu(load_balance_mask, i) = (void *)ptr; 7138 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7139 ptr += cpumask_size(); 7139 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7140 }
7141#endif /* CONFIG_CPUMASK_OFFSTACK */
7142 } 7140 }
7141#endif /* CONFIG_CPUMASK_OFFSTACK */
7143 7142
7144 init_rt_bandwidth(&def_rt_bandwidth, 7143 init_rt_bandwidth(&def_rt_bandwidth,
7145 global_rt_period(), global_rt_runtime()); 7144 global_rt_period(), global_rt_runtime());
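
Two independent fixes in the sched/core.c hunks above: sched_show_task() now reads p->real_parent only when pid_alive(p) confirms the task's pid structures still exist (ppid falls back to 0 otherwise), and the per-CPU load_balance_mask buffers are no longer carved out of the single boot-time block but allocated individually with kzalloc_node(), so each mask sits on its CPU's own NUMA node. A small user-space contrast of the two allocation styles, with plain calloc() standing in for the kernel allocators and made-up sizes:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS    4
#define MASK_BYTES 128                  /* stand-in for cpumask_size() */

int main(void)
{
        unsigned char *mask[NR_CPUS];

        /* old style: one big allocation, per-CPU pointers carved out of it */
        unsigned char *block = calloc(NR_CPUS, MASK_BYTES);
        if (!block)
                return 1;
        for (int i = 0; i < NR_CPUS; i++)
                mask[i] = block + i * MASK_BYTES;
        printf("carved:  cpu0=%p cpu1=%p (fixed stride)\n",
               (void *)mask[0], (void *)mask[1]);
        free(block);

        /* new style: one allocation per CPU; in the kernel each one can be
         * placed on cpu_to_node(i) */
        for (int i = 0; i < NR_CPUS; i++) {
                mask[i] = calloc(1, MASK_BYTES);
                if (!mask[i])
                        return 1;
        }
        printf("per-cpu: cpu0=%p cpu1=%p (independent)\n",
               (void *)mask[0], (void *)mask[1]);
        for (int i = 0; i < NR_CPUS; i++)
                free(mask[i]);
        return 0;
}
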
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e5db8c6feebd..b52092f2636d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -570,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
570static 570static
571int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) 571int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
572{ 572{
573 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); 573 return (dl_se->runtime <= 0);
574 int rorun = dl_se->runtime <= 0;
575
576 if (!rorun && !dmiss)
577 return 0;
578
579 /*
580 * If we are beyond our current deadline and we are still
581 * executing, then we have already used some of the runtime of
582 * the next instance. Thus, if we do not account that, we are
583 * stealing bandwidth from the system at each deadline miss!
584 */
585 if (dmiss) {
586 dl_se->runtime = rorun ? dl_se->runtime : 0;
587 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
588 }
589
590 return 1;
591} 574}
592 575
593extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 576extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -826,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
826 * parameters of the task might need updating. Otherwise, 809 * parameters of the task might need updating. Otherwise,
827 * we want a replenishment of its runtime. 810 * we want a replenishment of its runtime.
828 */ 811 */
829 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) 812 if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
830 replenish_dl_entity(dl_se, pi_se);
831 else
832 update_dl_entity(dl_se, pi_se); 813 update_dl_entity(dl_se, pi_se);
814 else if (flags & ENQUEUE_REPLENISH)
815 replenish_dl_entity(dl_se, pi_se);
833 816
834 __enqueue_dl_entity(dl_se); 817 __enqueue_dl_entity(dl_se);
835} 818}
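
With this change dl_runtime_exceeded() only reports an exhausted budget; pushing the deadline forward and refilling the runtime is left to the replenishment path, which enqueue_dl_entity() now selects through the ENQUEUE_WAKEUP/ENQUEUE_REPLENISH flags. A toy user-space sketch of that split — the numbers are invented, and the clock handling, priority-inheritance parameters and overflow checks of the real code are omitted:

#include <stdbool.h>
#include <stdio.h>

struct dl_entity {
        long long runtime, deadline;            /* current budget, absolute deadline */
        long long dl_runtime, dl_period;        /* reservation parameters */
};

static bool dl_runtime_exceeded(const struct dl_entity *dl)
{
        return dl->runtime <= 0;        /* the whole check after the patch */
}

/* replenish: grant dl_runtime per dl_period until the budget is positive */
static void replenish(struct dl_entity *dl)
{
        while (dl->runtime <= 0) {
                dl->deadline += dl->dl_period;
                dl->runtime  += dl->dl_runtime;
        }
}

int main(void)
{
        struct dl_entity dl = {
                .runtime = -5, .deadline = 100,
                .dl_runtime = 30, .dl_period = 100,
        };

        if (dl_runtime_exceeded(&dl))
                replenish(&dl);
        printf("runtime=%lld deadline=%lld\n", dl.runtime, dl.deadline);
        return 0;
}
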
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df2cdf77f899..40667cbf371b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4005,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
4005 4005
4006static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 4006static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4007{ 4007{
4008 /* init_cfs_bandwidth() was not called */
4009 if (!cfs_b->throttled_cfs_rq.next)
4010 return;
4011
4008 hrtimer_cancel(&cfs_b->period_timer); 4012 hrtimer_cancel(&cfs_b->period_timer);
4009 hrtimer_cancel(&cfs_b->slack_timer); 4013 hrtimer_cancel(&cfs_b->slack_timer);
4010} 4014}
@@ -4424,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4424 * wl = S * s'_i; see (2) 4428 * wl = S * s'_i; see (2)
4425 */ 4429 */
4426 if (W > 0 && w < W) 4430 if (W > 0 && w < W)
4427 wl = (w * tg->shares) / W; 4431 wl = (w * (long)tg->shares) / W;
4428 else 4432 else
4429 wl = tg->shares; 4433 wl = tg->shares;
4430 4434
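
Besides the guard for an uninitialised cfs_bandwidth timer, the effective_load() hunk adds a (long) cast because tg->shares is an unsigned long: without it a negative weight delta w is converted to unsigned, and the quotient becomes a huge positive number instead of a proportionally negative load. A self-contained demonstration (assuming an LP64 ABI; the values are arbitrary):

#include <stdio.h>

int main(void)
{
        long w = -512;                  /* weight being removed */
        unsigned long shares = 1024;    /* like tg->shares */
        long W = 4096;

        long bad  = (w * shares) / W;          /* arithmetic goes unsigned */
        long good = (w * (long)shares) / W;    /* signed, as in the patch */

        printf("without cast: %ld\n", bad);    /* huge positive value */
        printf("with cast:    %ld\n", good);   /* -128, as intended */
        return 0;
}
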
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a82..b6e4c16377c7 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
25} 25}
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28int snprint_stack_trace(char *buf, size_t size,
29 struct stack_trace *trace, int spaces)
30{
31 int i;
32 unsigned long ip;
33 int generated;
34 int total = 0;
35
36 if (WARN_ON(!trace->entries))
37 return 0;
38
39 for (i = 0; i < trace->nr_entries; i++) {
40 ip = trace->entries[i];
41 generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
42 1 + spaces, ' ', (void *) ip, (void *) ip);
43
44 total += generated;
45
46 /* Assume that generated isn't a negative number */
47 if (generated >= size) {
48 buf += size;
49 size = 0;
50 } else {
51 buf += generated;
52 size -= generated;
53 }
54 }
55
56 return total;
57}
58EXPORT_SYMBOL_GPL(snprint_stack_trace);
59
28/* 60/*
29 * Architectures that do not implement save_stack_trace_tsk or 61 * Architectures that do not implement save_stack_trace_tsk or
30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning 62 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
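
The new snprint_stack_trace() relies on the usual snprintf() contract: the return value is the length the output would have needed, so the loop can keep accumulating a total even after the buffer runs out, while advancing buf/size only by what actually fits. A self-contained analogue of that accumulation loop, printing made-up items instead of stack entries:

#include <stdio.h>

static int snprint_items(char *buf, size_t size,
                         const char *const *items, int n)
{
        int total = 0;

        for (int i = 0; i < n; i++) {
                int generated = snprintf(buf, size, " [%d] %s\n", i, items[i]);

                total += generated;
                if ((size_t)generated >= size) {        /* output truncated */
                        buf += size;
                        size = 0;
                } else {
                        buf += generated;
                        size -= generated;
                }
        }
        return total;   /* length the full output needs, like snprintf() */
}

int main(void)
{
        const char *items[] = { "alpha", "beta", "gamma" };
        char small[24];
        int needed = snprint_items(small, sizeof(small), items, 3);

        printf("buffer holds \"%s\"\nfull output needs %d bytes\n",
               small, needed);
        return 0;
}
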
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 02aa4185b17e..5adcb0ae3a58 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -169,6 +169,8 @@ cond_syscall(ppc_rtas);
169cond_syscall(sys_spu_run); 169cond_syscall(sys_spu_run);
170cond_syscall(sys_spu_create); 170cond_syscall(sys_spu_create);
171cond_syscall(sys_subpage_prot); 171cond_syscall(sys_subpage_prot);
172cond_syscall(sys_s390_pci_mmio_read);
173cond_syscall(sys_s390_pci_mmio_write);
172 174
173/* mmu depending weak syscall entries */ 175/* mmu depending weak syscall entries */
174cond_syscall(sys_mprotect); 176cond_syscall(sys_mprotect);
@@ -224,3 +226,6 @@ cond_syscall(sys_seccomp);
224 226
225/* access BPF programs and maps */ 227/* access BPF programs and maps */
226cond_syscall(sys_bpf); 228cond_syscall(sys_bpf);
229
230/* execveat */
231cond_syscall(sys_execveat);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 15f2511a1b7c..137c7f69b264 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -623,6 +623,13 @@ static struct ctl_table kern_table[] = {
623 .mode = 0644, 623 .mode = 0644,
624 .proc_handler = proc_dointvec, 624 .proc_handler = proc_dointvec,
625 }, 625 },
626 {
627 .procname = "tracepoint_printk",
628 .data = &tracepoint_printk,
629 .maxlen = sizeof(tracepoint_printk),
630 .mode = 0644,
631 .proc_handler = proc_dointvec,
632 },
626#endif 633#endif
627#ifdef CONFIG_KEXEC 634#ifdef CONFIG_KEXEC
628 { 635 {
@@ -1104,6 +1111,15 @@ static struct ctl_table kern_table[] = {
1104 .proc_handler = proc_dointvec, 1111 .proc_handler = proc_dointvec,
1105 }, 1112 },
1106#endif 1113#endif
1114 {
1115 .procname = "panic_on_warn",
1116 .data = &panic_on_warn,
1117 .maxlen = sizeof(int),
1118 .mode = 0644,
1119 .proc_handler = proc_dointvec_minmax,
1120 .extra1 = &zero,
1121 .extra2 = &one,
1122 },
1107 { } 1123 { }
1108}; 1124};
1109 1125
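
The two new entries appear as /proc/sys/kernel/tracepoint_printk and /proc/sys/kernel/panic_on_warn; using proc_dointvec_minmax with extra1/extra2 pointing at zero and one means panic_on_warn accepts only 0 or 1. A trivial user-space read of the new knob — it simply reports an error if the file is absent, e.g. on a kernel without this patch:

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/kernel/panic_on_warn";
        char val[16] = "";
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);           /* older kernel or no procfs */
                return 1;
        }
        if (fgets(val, sizeof(val), f))
                printf("panic_on_warn = %s", val);
        fclose(f);
        return 0;
}
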
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 9a4f750a2963..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
140 { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
140 {} 141 {}
141}; 142};
142 143
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b312fcc73024..670fff88a961 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
459 stats = nla_data(na); 459 stats = nla_data(na);
460 memset(stats, 0, sizeof(*stats)); 460 memset(stats, 0, sizeof(*stats));
461 461
462 rc = cgroupstats_build(stats, f.file->f_dentry); 462 rc = cgroupstats_build(stats, f.file->f_path.dentry);
463 if (rc < 0) { 463 if (rc < 0) {
464 nlmsg_free(rep_skb); 464 nlmsg_free(rep_skb);
465 goto err; 465 goto err;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2e949cc9c9f1..b79f39bda7e1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
792 /* Initialize mult/shift and max_idle_ns */ 792 /* Initialize mult/shift and max_idle_ns */
793 __clocksource_updatefreq_scale(cs, scale, freq); 793 __clocksource_updatefreq_scale(cs, scale, freq);
794 794
795 /* Add clocksource to the clcoksource list */ 795 /* Add clocksource to the clocksource list */
796 mutex_lock(&clocksource_mutex); 796 mutex_lock(&clocksource_mutex);
797 clocksource_enqueue(cs); 797 clocksource_enqueue(cs);
798 clocksource_enqueue_watchdog(cs); 798 clocksource_enqueue_watchdog(cs);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ff3ec34702e8..1363d58f07e9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -235,7 +235,7 @@ void tick_nohz_full_kick(void)
235 if (!tick_nohz_full_cpu(smp_processor_id())) 235 if (!tick_nohz_full_cpu(smp_processor_id()))
236 return; 236 return;
237 237
238 irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); 238 irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
239} 239}
240 240
241/* 241/*
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 887e7d505974..2c85b7724af4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -745,6 +745,7 @@ u64 nsecs_to_jiffies64(u64 n)
745 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); 745 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
746#endif 746#endif
747} 747}
748EXPORT_SYMBOL(nsecs_to_jiffies64);
748 749
749/** 750/**
750 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 751 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 67d6369ddf83..979ccde26720 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_TRACEPOINTS) += power-traces.o 57obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM_RUNTIME),y) 58ifeq ($(CONFIG_PM),y)
59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o 59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
60endif 60endif
61ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c1bd4ada2a04..483cecfa5c17 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent,
1142 r->sector_from = be64_to_cpu(sector_from); 1142 r->sector_from = be64_to_cpu(sector_from);
1143} 1143}
1144 1144
1145typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1145typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1146 1146
1147static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1147static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
1148{ 1148{
1149 char rwbs[RWBS_LEN]; 1149 char rwbs[RWBS_LEN];
1150 unsigned long long ts = iter->ts; 1150 unsigned long long ts = iter->ts;
@@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1154 1154
1155 fill_rwbs(rwbs, t); 1155 fill_rwbs(rwbs, t);
1156 1156
1157 return trace_seq_printf(&iter->seq, 1157 trace_seq_printf(&iter->seq,
1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", 1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1159 MAJOR(t->device), MINOR(t->device), iter->cpu, 1159 MAJOR(t->device), MINOR(t->device), iter->cpu,
1160 secs, nsec_rem, iter->ent->pid, act, rwbs); 1160 secs, nsec_rem, iter->ent->pid, act, rwbs);
1161} 1161}
1162 1162
1163static int blk_log_action(struct trace_iterator *iter, const char *act) 1163static void blk_log_action(struct trace_iterator *iter, const char *act)
1164{ 1164{
1165 char rwbs[RWBS_LEN]; 1165 char rwbs[RWBS_LEN];
1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1167 1167
1168 fill_rwbs(rwbs, t); 1168 fill_rwbs(rwbs, t);
1169 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", 1169 trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1170 MAJOR(t->device), MINOR(t->device), act, rwbs); 1170 MAJOR(t->device), MINOR(t->device), act, rwbs);
1171} 1171}
1172 1172
1173static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) 1173static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1174{ 1174{
1175 const unsigned char *pdu_buf; 1175 const unsigned char *pdu_buf;
1176 int pdu_len; 1176 int pdu_len;
1177 int i, end, ret; 1177 int i, end;
1178 1178
1179 pdu_buf = pdu_start(ent); 1179 pdu_buf = pdu_start(ent);
1180 pdu_len = te_blk_io_trace(ent)->pdu_len; 1180 pdu_len = te_blk_io_trace(ent)->pdu_len;
1181 1181
1182 if (!pdu_len) 1182 if (!pdu_len)
1183 return 1; 1183 return;
1184 1184
1185 /* find the last zero that needs to be printed */ 1185 /* find the last zero that needs to be printed */
1186 for (end = pdu_len - 1; end >= 0; end--) 1186 for (end = pdu_len - 1; end >= 0; end--)
@@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1188 break; 1188 break;
1189 end++; 1189 end++;
1190 1190
1191 if (!trace_seq_putc(s, '(')) 1191 trace_seq_putc(s, '(');
1192 return 0;
1193 1192
1194 for (i = 0; i < pdu_len; i++) { 1193 for (i = 0; i < pdu_len; i++) {
1195 1194
1196 ret = trace_seq_printf(s, "%s%02x", 1195 trace_seq_printf(s, "%s%02x",
1197 i == 0 ? "" : " ", pdu_buf[i]); 1196 i == 0 ? "" : " ", pdu_buf[i]);
1198 if (!ret)
1199 return ret;
1200 1197
1201 /* 1198 /*
1202 * stop when the rest is just zeroes and indicate so 1199 * stop when the rest is just zeroes and indicate so
1203 * with a ".." appended 1200 * with a ".." appended
1204 */ 1201 */
1205 if (i == end && end != pdu_len - 1) 1202 if (i == end && end != pdu_len - 1) {
1206 return trace_seq_puts(s, " ..) "); 1203 trace_seq_puts(s, " ..) ");
1204 return;
1205 }
1207 } 1206 }
1208 1207
1209 return trace_seq_puts(s, ") "); 1208 trace_seq_puts(s, ") ");
1210} 1209}
1211 1210
1212static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1211static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1213{ 1212{
1214 char cmd[TASK_COMM_LEN]; 1213 char cmd[TASK_COMM_LEN];
1215 1214
1216 trace_find_cmdline(ent->pid, cmd); 1215 trace_find_cmdline(ent->pid, cmd);
1217 1216
1218 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1217 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1219 int ret; 1218 trace_seq_printf(s, "%u ", t_bytes(ent));
1220 1219 blk_log_dump_pdu(s, ent);
1221 ret = trace_seq_printf(s, "%u ", t_bytes(ent)); 1220 trace_seq_printf(s, "[%s]\n", cmd);
1222 if (!ret)
1223 return 0;
1224 ret = blk_log_dump_pdu(s, ent);
1225 if (!ret)
1226 return 0;
1227 return trace_seq_printf(s, "[%s]\n", cmd);
1228 } else { 1221 } else {
1229 if (t_sec(ent)) 1222 if (t_sec(ent))
1230 return trace_seq_printf(s, "%llu + %u [%s]\n", 1223 trace_seq_printf(s, "%llu + %u [%s]\n",
1231 t_sector(ent), t_sec(ent), cmd); 1224 t_sector(ent), t_sec(ent), cmd);
1232 return trace_seq_printf(s, "[%s]\n", cmd); 1225 else
1226 trace_seq_printf(s, "[%s]\n", cmd);
1233 } 1227 }
1234} 1228}
1235 1229
1236static int blk_log_with_error(struct trace_seq *s, 1230static void blk_log_with_error(struct trace_seq *s,
1237 const struct trace_entry *ent) 1231 const struct trace_entry *ent)
1238{ 1232{
1239 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1233 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1240 int ret; 1234 blk_log_dump_pdu(s, ent);
1241 1235 trace_seq_printf(s, "[%d]\n", t_error(ent));
1242 ret = blk_log_dump_pdu(s, ent);
1243 if (ret)
1244 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1245 return 0;
1246 } else { 1236 } else {
1247 if (t_sec(ent)) 1237 if (t_sec(ent))
1248 return trace_seq_printf(s, "%llu + %u [%d]\n", 1238 trace_seq_printf(s, "%llu + %u [%d]\n",
1249 t_sector(ent), 1239 t_sector(ent),
1250 t_sec(ent), t_error(ent)); 1240 t_sec(ent), t_error(ent));
1251 return trace_seq_printf(s, "%llu [%d]\n", 1241 else
1252 t_sector(ent), t_error(ent)); 1242 trace_seq_printf(s, "%llu [%d]\n",
1243 t_sector(ent), t_error(ent));
1253 } 1244 }
1254} 1245}
1255 1246
1256static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1247static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1257{ 1248{
1258 struct blk_io_trace_remap r = { .device_from = 0, }; 1249 struct blk_io_trace_remap r = { .device_from = 0, };
1259 1250
1260 get_pdu_remap(ent, &r); 1251 get_pdu_remap(ent, &r);
1261 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1252 trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1262 t_sector(ent), t_sec(ent), 1253 t_sector(ent), t_sec(ent),
1263 MAJOR(r.device_from), MINOR(r.device_from), 1254 MAJOR(r.device_from), MINOR(r.device_from),
1264 (unsigned long long)r.sector_from); 1255 (unsigned long long)r.sector_from);
1265} 1256}
1266 1257
1267static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1258static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1268{ 1259{
1269 char cmd[TASK_COMM_LEN]; 1260 char cmd[TASK_COMM_LEN];
1270 1261
1271 trace_find_cmdline(ent->pid, cmd); 1262 trace_find_cmdline(ent->pid, cmd);
1272 1263
1273 return trace_seq_printf(s, "[%s]\n", cmd); 1264 trace_seq_printf(s, "[%s]\n", cmd);
1274} 1265}
1275 1266
1276static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) 1267static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1277{ 1268{
1278 char cmd[TASK_COMM_LEN]; 1269 char cmd[TASK_COMM_LEN];
1279 1270
1280 trace_find_cmdline(ent->pid, cmd); 1271 trace_find_cmdline(ent->pid, cmd);
1281 1272
1282 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); 1273 trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1283} 1274}
1284 1275
1285static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) 1276static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1286{ 1277{
1287 char cmd[TASK_COMM_LEN]; 1278 char cmd[TASK_COMM_LEN];
1288 1279
1289 trace_find_cmdline(ent->pid, cmd); 1280 trace_find_cmdline(ent->pid, cmd);
1290 1281
1291 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), 1282 trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1292 get_pdu_int(ent), cmd); 1283 get_pdu_int(ent), cmd);
1293} 1284}
1294 1285
1295static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) 1286static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1296{ 1287{
1297 int ret;
1298 const struct blk_io_trace *t = te_blk_io_trace(ent); 1288 const struct blk_io_trace *t = te_blk_io_trace(ent);
1299 1289
1300 ret = trace_seq_putmem(s, t + 1, t->pdu_len); 1290 trace_seq_putmem(s, t + 1, t->pdu_len);
1301 if (ret) 1291 trace_seq_putc(s, '\n');
1302 return trace_seq_putc(s, '\n');
1303 return ret;
1304} 1292}
1305 1293
1306/* 1294/*
@@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr)
1339 1327
1340static const struct { 1328static const struct {
1341 const char *act[2]; 1329 const char *act[2];
1342 int (*print)(struct trace_seq *s, const struct trace_entry *ent); 1330 void (*print)(struct trace_seq *s, const struct trace_entry *ent);
1343} what2act[] = { 1331} what2act[] = {
1344 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, 1332 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1345 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, 1333 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1364 struct trace_seq *s = &iter->seq; 1352 struct trace_seq *s = &iter->seq;
1365 const struct blk_io_trace *t; 1353 const struct blk_io_trace *t;
1366 u16 what; 1354 u16 what;
1367 int ret;
1368 bool long_act; 1355 bool long_act;
1369 blk_log_action_t *log_action; 1356 blk_log_action_t *log_action;
1370 1357
@@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1374 log_action = classic ? &blk_log_action_classic : &blk_log_action; 1361 log_action = classic ? &blk_log_action_classic : &blk_log_action;
1375 1362
1376 if (t->action == BLK_TN_MESSAGE) { 1363 if (t->action == BLK_TN_MESSAGE) {
1377 ret = log_action(iter, long_act ? "message" : "m"); 1364 log_action(iter, long_act ? "message" : "m");
1378 if (ret) 1365 blk_log_msg(s, iter->ent);
1379 ret = blk_log_msg(s, iter->ent);
1380 goto out;
1381 } 1366 }
1382 1367
1383 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1368 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1384 ret = trace_seq_printf(s, "Unknown action %x\n", what); 1369 trace_seq_printf(s, "Unknown action %x\n", what);
1385 else { 1370 else {
1386 ret = log_action(iter, what2act[what].act[long_act]); 1371 log_action(iter, what2act[what].act[long_act]);
1387 if (ret) 1372 what2act[what].print(s, iter->ent);
1388 ret = what2act[what].print(s, iter->ent);
1389 } 1373 }
1390out: 1374
1391 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1375 return trace_handle_return(s);
1392} 1376}
1393 1377
1394static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1378static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
@@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1397 return print_one_line(iter, false); 1381 return print_one_line(iter, false);
1398} 1382}
1399 1383
1400static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) 1384static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1401{ 1385{
1402 struct trace_seq *s = &iter->seq; 1386 struct trace_seq *s = &iter->seq;
1403 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; 1387 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1407 .time = iter->ts, 1391 .time = iter->ts,
1408 }; 1392 };
1409 1393
1410 if (!trace_seq_putmem(s, &old, offset)) 1394 trace_seq_putmem(s, &old, offset);
1411 return 0; 1395 trace_seq_putmem(s, &t->sector,
1412 return trace_seq_putmem(s, &t->sector, 1396 sizeof(old) - offset + t->pdu_len);
1413 sizeof(old) - offset + t->pdu_len);
1414} 1397}
1415 1398
1416static enum print_line_t 1399static enum print_line_t
1417blk_trace_event_print_binary(struct trace_iterator *iter, int flags, 1400blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1418 struct trace_event *event) 1401 struct trace_event *event)
1419{ 1402{
1420 return blk_trace_synthesize_old_trace(iter) ? 1403 blk_trace_synthesize_old_trace(iter);
1421 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1404
1405 return trace_handle_return(&iter->seq);
1422} 1406}
1423 1407
1424static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) 1408static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
@@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
1493 if (atomic_dec_and_test(&blk_probes_ref)) 1477 if (atomic_dec_and_test(&blk_probes_ref))
1494 blk_unregister_tracepoints(); 1478 blk_unregister_tracepoints();
1495 1479
1496 spin_lock_irq(&running_trace_lock);
1497 list_del(&bt->running_list);
1498 spin_unlock_irq(&running_trace_lock);
1499 blk_trace_free(bt); 1480 blk_trace_free(bt);
1500 return 0; 1481 return 0;
1501} 1482}
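
The blktrace output path above is converted wholesale from the old int-returning trace_seq helpers to the void API: callers just append, the sequence records overflow internally, and the status is checked once at the end via trace_seq_has_overflowed() (wrapped by trace_handle_return()). A self-contained user-space analogue of that model, with a tiny seq type standing in for struct trace_seq:

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

struct seq {
        char buf[64];
        size_t len;
        bool overflow;
};

static void seq_printf(struct seq *s, const char *fmt, ...)
{
        va_list ap;
        int n;

        if (s->overflow)
                return;
        va_start(ap, fmt);
        n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
        va_end(ap);
        if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len) {
                s->overflow = true;             /* remember, don't report per call */
                s->buf[s->len] = '\0';          /* drop the truncated tail */
        } else {
                s->len += n;
        }
}

static bool seq_has_overflowed(const struct seq *s)
{
        return s->overflow;
}

int main(void)
{
        struct seq s = { .len = 0 };

        for (int i = 0; i < 20; i++)
                seq_printf(&s, "%3d,%-3d ", i, i * 2);  /* no return checks */

        printf("%s\n%s\n", s.buf,
               seq_has_overflowed(&s) ? "PARTIAL_LINE" : "HANDLED");
        return 0;
}
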
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 31c90fec4158..224e768bdc73 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -387,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
387 return ret; 387 return ret;
388} 388}
389 389
390static void ftrace_update_trampoline(struct ftrace_ops *ops);
391
390static int __register_ftrace_function(struct ftrace_ops *ops) 392static int __register_ftrace_function(struct ftrace_ops *ops)
391{ 393{
392 if (ops->flags & FTRACE_OPS_FL_DELETED) 394 if (ops->flags & FTRACE_OPS_FL_DELETED)
@@ -416,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
416 if (control_ops_alloc(ops)) 418 if (control_ops_alloc(ops))
417 return -ENOMEM; 419 return -ENOMEM;
418 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 420 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
421 /* The control_ops needs the trampoline update */
422 ops = &control_ops;
419 } else 423 } else
420 add_ftrace_ops(&ftrace_ops_list, ops); 424 add_ftrace_ops(&ftrace_ops_list, ops);
421 425
426 ftrace_update_trampoline(ops);
427
422 if (ftrace_enabled) 428 if (ftrace_enabled)
423 update_ftrace_function(); 429 update_ftrace_function();
424 430
@@ -565,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2)
565static int function_stat_headers(struct seq_file *m) 571static int function_stat_headers(struct seq_file *m)
566{ 572{
567#ifdef CONFIG_FUNCTION_GRAPH_TRACER 573#ifdef CONFIG_FUNCTION_GRAPH_TRACER
568 seq_printf(m, " Function " 574 seq_puts(m, " Function "
569 "Hit Time Avg s^2\n" 575 "Hit Time Avg s^2\n"
570 " -------- " 576 " -------- "
571 "--- ---- --- ---\n"); 577 "--- ---- --- ---\n");
572#else 578#else
573 seq_printf(m, " Function Hit\n" 579 seq_puts(m, " Function Hit\n"
574 " -------- ---\n"); 580 " -------- ---\n");
575#endif 581#endif
576 return 0; 582 return 0;
577} 583}
@@ -598,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v)
598 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 604 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
599 605
600#ifdef CONFIG_FUNCTION_GRAPH_TRACER 606#ifdef CONFIG_FUNCTION_GRAPH_TRACER
601 seq_printf(m, " "); 607 seq_puts(m, " ");
602 avg = rec->time; 608 avg = rec->time;
603 do_div(avg, rec->counter); 609 do_div(avg, rec->counter);
604 610
@@ -1111,6 +1117,43 @@ static struct ftrace_ops global_ops = {
1111 FTRACE_OPS_FL_INITIALIZED, 1117 FTRACE_OPS_FL_INITIALIZED,
1112}; 1118};
1113 1119
1120/*
1121 * This is used by __kernel_text_address() to return true if the
1122 * address is on a dynamically allocated trampoline that would
1123 * not return true for either core_kernel_text() or
1124 * is_module_text_address().
1125 */
1126bool is_ftrace_trampoline(unsigned long addr)
1127{
1128 struct ftrace_ops *op;
1129 bool ret = false;
1130
1131 /*
1132 * Some of the ops may be dynamically allocated,
1133 * they are freed after a synchronize_sched().
1134 */
1135 preempt_disable_notrace();
1136
1137 do_for_each_ftrace_op(op, ftrace_ops_list) {
1138 /*
1139 * This is to check for dynamically allocated trampolines.
1140 * Trampolines that are in kernel text will have
1141 * core_kernel_text() return true.
1142 */
1143 if (op->trampoline && op->trampoline_size)
1144 if (addr >= op->trampoline &&
1145 addr < op->trampoline + op->trampoline_size) {
1146 ret = true;
1147 goto out;
1148 }
1149 } while_for_each_ftrace_op(op);
1150
1151 out:
1152 preempt_enable_notrace();
1153
1154 return ret;
1155}
1156
1114struct ftrace_page { 1157struct ftrace_page {
1115 struct ftrace_page *next; 1158 struct ftrace_page *next;
1116 struct dyn_ftrace *records; 1159 struct dyn_ftrace *records;
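
is_ftrace_trampoline() exists so __kernel_text_address() can recognise addresses that live on dynamically allocated trampolines, which neither core_kernel_text() nor is_module_text_address() covers; the list walk runs under preempt_disable_notrace() because dynamic ops are only freed after synchronize_sched(). Stripped of the list macros and the RCU details, the core is an address-range membership test — a self-contained sketch with invented addresses:

#include <stdbool.h>
#include <stdio.h>

struct ops { unsigned long trampoline, trampoline_size; };

static bool in_trampoline(const struct ops *ops, int nr_ops, unsigned long addr)
{
        for (int i = 0; i < nr_ops; i++) {
                if (!ops[i].trampoline || !ops[i].trampoline_size)
                        continue;
                if (addr >= ops[i].trampoline &&
                    addr <  ops[i].trampoline + ops[i].trampoline_size)
                        return true;
        }
        return false;
}

int main(void)
{
        struct ops list[] = {
                { 0xffff0000UL, 0x100 },
                { 0,            0     },        /* ops without a trampoline */
        };

        printf("%d %d\n",
               in_trampoline(list, 2, 0xffff0080UL),    /* 1: inside */
               in_trampoline(list, 2, 0xffff0100UL));   /* 0: one past the end */
        return 0;
}
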
@@ -1315,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
1315static void 1358static void
1316ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); 1359ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
1317 1360
1361static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1362 struct ftrace_hash *new_hash);
1363
1318static int 1364static int
1319ftrace_hash_move(struct ftrace_ops *ops, int enable, 1365ftrace_hash_move(struct ftrace_ops *ops, int enable,
1320 struct ftrace_hash **dst, struct ftrace_hash *src) 1366 struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1325,8 +1371,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1325 struct ftrace_hash *new_hash; 1371 struct ftrace_hash *new_hash;
1326 int size = src->count; 1372 int size = src->count;
1327 int bits = 0; 1373 int bits = 0;
1374 int ret;
1328 int i; 1375 int i;
1329 1376
1377 /* Reject setting notrace hash on IPMODIFY ftrace_ops */
1378 if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
1379 return -EINVAL;
1380
1330 /* 1381 /*
1331 * If the new source is empty, just free dst and assign it 1382 * If the new source is empty, just free dst and assign it
1332 * the empty_hash. 1383 * the empty_hash.
@@ -1360,6 +1411,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1360 } 1411 }
1361 1412
1362update: 1413update:
1414 /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
1415 if (enable) {
1416 /* IPMODIFY should be updated only when filter_hash updating */
1417 ret = ftrace_hash_ipmodify_update(ops, new_hash);
1418 if (ret < 0) {
1419 free_ftrace_hash(new_hash);
1420 return ret;
1421 }
1422 }
1423
1363 /* 1424 /*
1364 * Remove the current set, update the hash and add 1425 * Remove the current set, update the hash and add
1365 * them back. 1426 * them back.
@@ -1724,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
1724 ftrace_hash_rec_update_modify(ops, filter_hash, 1); 1785 ftrace_hash_rec_update_modify(ops, filter_hash, 1);
1725} 1786}
1726 1787
1788/*
1789 * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
 1790 * or if no update is needed, -EBUSY if it detects a conflict of the flag
1791 * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
 1792 * Note that old_hash and new_hash have the following meanings:
1793 * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
1794 * - If the hash is EMPTY_HASH, it hits nothing
1795 * - Anything else hits the recs which match the hash entries.
1796 */
1797static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
1798 struct ftrace_hash *old_hash,
1799 struct ftrace_hash *new_hash)
1800{
1801 struct ftrace_page *pg;
1802 struct dyn_ftrace *rec, *end = NULL;
1803 int in_old, in_new;
1804
1805 /* Only update if the ops has been registered */
1806 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1807 return 0;
1808
1809 if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
1810 return 0;
1811
1812 /*
1813 * Since the IPMODIFY is a very address sensitive action, we do not
1814 * allow ftrace_ops to set all functions to new hash.
1815 */
1816 if (!new_hash || !old_hash)
1817 return -EINVAL;
1818
1819 /* Update rec->flags */
1820 do_for_each_ftrace_rec(pg, rec) {
1821 /* We need to update only differences of filter_hash */
1822 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1823 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1824 if (in_old == in_new)
1825 continue;
1826
1827 if (in_new) {
1828 /* New entries must ensure no others are using it */
1829 if (rec->flags & FTRACE_FL_IPMODIFY)
1830 goto rollback;
1831 rec->flags |= FTRACE_FL_IPMODIFY;
1832 } else /* Removed entry */
1833 rec->flags &= ~FTRACE_FL_IPMODIFY;
1834 } while_for_each_ftrace_rec();
1835
1836 return 0;
1837
1838rollback:
1839 end = rec;
1840
1841 /* Roll back what we did above */
1842 do_for_each_ftrace_rec(pg, rec) {
1843 if (rec == end)
1844 goto err_out;
1845
1846 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1847 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1848 if (in_old == in_new)
1849 continue;
1850
1851 if (in_new)
1852 rec->flags &= ~FTRACE_FL_IPMODIFY;
1853 else
1854 rec->flags |= FTRACE_FL_IPMODIFY;
1855 } while_for_each_ftrace_rec();
1856
1857err_out:
1858 return -EBUSY;
1859}
1860
1861static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
1862{
1863 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1864
1865 if (ftrace_hash_empty(hash))
1866 hash = NULL;
1867
1868 return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
1869}
1870
1871/* Disabling always succeeds */
1872static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
1873{
1874 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1875
1876 if (ftrace_hash_empty(hash))
1877 hash = NULL;
1878
1879 __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
1880}
1881
1882static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1883 struct ftrace_hash *new_hash)
1884{
1885 struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
1886
1887 if (ftrace_hash_empty(old_hash))
1888 old_hash = NULL;
1889
1890 if (ftrace_hash_empty(new_hash))
1891 new_hash = NULL;
1892
1893 return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
1894}
1895
1727static void print_ip_ins(const char *fmt, unsigned char *p) 1896static void print_ip_ins(const char *fmt, unsigned char *p)
1728{ 1897{
1729 int i; 1898 int i;
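
__ftrace_hash_update_ipmodify() above uses a mark-then-rollback pattern: walk every record, set FTRACE_FL_IPMODIFY on the ones the new hash starts covering, and if a record already carries the flag (another ops modifies that ip) walk the records again in the same order, clear what was just set, and return -EBUSY. A simplified user-space sketch of just that pattern — the real code also clears the flag for entries that leave the old hash, and all names below are illustrative:

#include <stdbool.h>
#include <stdio.h>

#define FL_IPMODIFY 0x1UL

struct rec { unsigned long ip, flags; };

static int set_ipmodify(struct rec *recs, int nr, bool (*hit)(unsigned long ip))
{
        int i, end = -1;

        for (i = 0; i < nr; i++) {
                if (!hit(recs[i].ip))
                        continue;
                if (recs[i].flags & FL_IPMODIFY) {      /* someone else owns it */
                        end = i;
                        goto rollback;
                }
                recs[i].flags |= FL_IPMODIFY;
        }
        return 0;

rollback:
        /* undo, in the same order, everything set before the conflict */
        for (i = 0; i < end; i++)
                if (hit(recs[i].ip))
                        recs[i].flags &= ~FL_IPMODIFY;
        return -16;     /* -EBUSY */
}

static bool hit_all(unsigned long ip) { (void)ip; return true; }

int main(void)
{
        struct rec recs[3] = { { 0x100, 0 }, { 0x200, FL_IPMODIFY }, { 0x300, 0 } };
        int ret = set_ipmodify(recs, 3, hit_all);

        printf("ret=%d, rec0 flags=%lx (rolled back)\n", ret, recs[0].flags);
        return 0;
}
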
@@ -1734,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1734 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1903 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1735} 1904}
1736 1905
1906static struct ftrace_ops *
1907ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
1908
1737/** 1909/**
1738 * ftrace_bug - report and shutdown function tracer 1910 * ftrace_bug - report and shutdown function tracer
1739 * @failed: The failed type (EFAULT, EINVAL, EPERM) 1911 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1740 * @ip: The address that failed 1912 * @rec: The record that failed
1741 * 1913 *
1742 * The arch code that enables or disables the function tracing 1914 * The arch code that enables or disables the function tracing
1743 * can call ftrace_bug() when it has detected a problem in 1915 * can call ftrace_bug() when it has detected a problem in
@@ -1746,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1746 * EINVAL - if what is read at @ip is not what was expected 1918 * EINVAL - if what is read at @ip is not what was expected
 1747 * EPERM - if the problem happens when writing to the @ip address 1919 * EPERM - if the problem happens when writing to the @ip address
1748 */ 1920 */
1749void ftrace_bug(int failed, unsigned long ip) 1921void ftrace_bug(int failed, struct dyn_ftrace *rec)
1750{ 1922{
1923 unsigned long ip = rec ? rec->ip : 0;
1924
1751 switch (failed) { 1925 switch (failed) {
1752 case -EFAULT: 1926 case -EFAULT:
1753 FTRACE_WARN_ON_ONCE(1); 1927 FTRACE_WARN_ON_ONCE(1);
@@ -1759,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip)
1759 pr_info("ftrace failed to modify "); 1933 pr_info("ftrace failed to modify ");
1760 print_ip_sym(ip); 1934 print_ip_sym(ip);
1761 print_ip_ins(" actual: ", (unsigned char *)ip); 1935 print_ip_ins(" actual: ", (unsigned char *)ip);
1762 printk(KERN_CONT "\n"); 1936 pr_cont("\n");
1763 break; 1937 break;
1764 case -EPERM: 1938 case -EPERM:
1765 FTRACE_WARN_ON_ONCE(1); 1939 FTRACE_WARN_ON_ONCE(1);
@@ -1771,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip)
1771 pr_info("ftrace faulted on unknown error "); 1945 pr_info("ftrace faulted on unknown error ");
1772 print_ip_sym(ip); 1946 print_ip_sym(ip);
1773 } 1947 }
1948 if (rec) {
1949 struct ftrace_ops *ops = NULL;
1950
1951 pr_info("ftrace record flags: %lx\n", rec->flags);
1952 pr_cont(" (%ld)%s", ftrace_rec_count(rec),
1953 rec->flags & FTRACE_FL_REGS ? " R" : " ");
1954 if (rec->flags & FTRACE_FL_TRAMP_EN) {
1955 ops = ftrace_find_tramp_ops_any(rec);
1956 if (ops)
1957 pr_cont("\ttramp: %pS",
1958 (void *)ops->trampoline);
1959 else
1960 pr_cont("\ttramp: ERROR!");
1961
1962 }
1963 ip = ftrace_get_addr_curr(rec);
1964 pr_cont(" expected tramp: %lx\n", ip);
1965 }
1774} 1966}
1775 1967
1776static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1968static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
@@ -2093,7 +2285,7 @@ void __weak ftrace_replace_code(int enable)
2093 do_for_each_ftrace_rec(pg, rec) { 2285 do_for_each_ftrace_rec(pg, rec) {
2094 failed = __ftrace_replace_code(rec, enable); 2286 failed = __ftrace_replace_code(rec, enable);
2095 if (failed) { 2287 if (failed) {
2096 ftrace_bug(failed, rec->ip); 2288 ftrace_bug(failed, rec);
2097 /* Stop processing */ 2289 /* Stop processing */
2098 return; 2290 return;
2099 } 2291 }
@@ -2175,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
2175static int 2367static int
2176ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 2368ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
2177{ 2369{
2178 unsigned long ip;
2179 int ret; 2370 int ret;
2180 2371
2181 ip = rec->ip;
2182
2183 if (unlikely(ftrace_disabled)) 2372 if (unlikely(ftrace_disabled))
2184 return 0; 2373 return 0;
2185 2374
2186 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 2375 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
2187 if (ret) { 2376 if (ret) {
2188 ftrace_bug(ret, ip); 2377 ftrace_bug(ret, rec);
2189 return 0; 2378 return 0;
2190 } 2379 }
2191 return 1; 2380 return 1;
@@ -2308,18 +2497,24 @@ static void ftrace_run_update_code(int command)
2308} 2497}
2309 2498
2310static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, 2499static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
2311 struct ftrace_hash *old_hash) 2500 struct ftrace_ops_hash *old_hash)
2312{ 2501{
2313 ops->flags |= FTRACE_OPS_FL_MODIFYING; 2502 ops->flags |= FTRACE_OPS_FL_MODIFYING;
2314 ops->old_hash.filter_hash = old_hash; 2503 ops->old_hash.filter_hash = old_hash->filter_hash;
2504 ops->old_hash.notrace_hash = old_hash->notrace_hash;
2315 ftrace_run_update_code(command); 2505 ftrace_run_update_code(command);
2316 ops->old_hash.filter_hash = NULL; 2506 ops->old_hash.filter_hash = NULL;
2507 ops->old_hash.notrace_hash = NULL;
2317 ops->flags &= ~FTRACE_OPS_FL_MODIFYING; 2508 ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
2318} 2509}
2319 2510
2320static ftrace_func_t saved_ftrace_func; 2511static ftrace_func_t saved_ftrace_func;
2321static int ftrace_start_up; 2512static int ftrace_start_up;
2322 2513
2514void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
2515{
2516}
2517
2323static void control_ops_free(struct ftrace_ops *ops) 2518static void control_ops_free(struct ftrace_ops *ops)
2324{ 2519{
2325 free_percpu(ops->disabled); 2520 free_percpu(ops->disabled);
@@ -2369,6 +2564,15 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2369 */ 2564 */
2370 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; 2565 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
2371 2566
2567 ret = ftrace_hash_ipmodify_enable(ops);
2568 if (ret < 0) {
2569 /* Rollback registration process */
2570 __unregister_ftrace_function(ops);
2571 ftrace_start_up--;
2572 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2573 return ret;
2574 }
2575
2372 ftrace_hash_rec_enable(ops, 1); 2576 ftrace_hash_rec_enable(ops, 1);
2373 2577
2374 ftrace_startup_enable(command); 2578 ftrace_startup_enable(command);
@@ -2397,6 +2601,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2397 */ 2601 */
2398 WARN_ON_ONCE(ftrace_start_up < 0); 2602 WARN_ON_ONCE(ftrace_start_up < 0);
2399 2603
2604 /* Disabling ipmodify never fails */
2605 ftrace_hash_ipmodify_disable(ops);
2400 ftrace_hash_rec_disable(ops, 1); 2606 ftrace_hash_rec_disable(ops, 1);
2401 2607
2402 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2608 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -2471,6 +2677,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2471 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { 2677 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
2472 schedule_on_each_cpu(ftrace_sync); 2678 schedule_on_each_cpu(ftrace_sync);
2473 2679
2680 arch_ftrace_trampoline_free(ops);
2681
2474 if (ops->flags & FTRACE_OPS_FL_CONTROL) 2682 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2475 control_ops_free(ops); 2683 control_ops_free(ops);
2476 } 2684 }
@@ -2623,7 +2831,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2623 if (ftrace_start_up && cnt) { 2831 if (ftrace_start_up && cnt) {
2624 int failed = __ftrace_replace_code(p, 1); 2832 int failed = __ftrace_replace_code(p, 1);
2625 if (failed) 2833 if (failed)
2626 ftrace_bug(failed, p->ip); 2834 ftrace_bug(failed, p);
2627 } 2835 }
2628 } 2836 }
2629 } 2837 }
@@ -2948,6 +3156,22 @@ static void t_stop(struct seq_file *m, void *p)
2948 mutex_unlock(&ftrace_lock); 3156 mutex_unlock(&ftrace_lock);
2949} 3157}
2950 3158
3159void * __weak
3160arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
3161{
3162 return NULL;
3163}
3164
3165static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
3166 struct dyn_ftrace *rec)
3167{
3168 void *ptr;
3169
3170 ptr = arch_ftrace_trampoline_func(ops, rec);
3171 if (ptr)
3172 seq_printf(m, " ->%pS", ptr);
3173}
3174
2951static int t_show(struct seq_file *m, void *v) 3175static int t_show(struct seq_file *m, void *v)
2952{ 3176{
2953 struct ftrace_iterator *iter = m->private; 3177 struct ftrace_iterator *iter = m->private;
@@ -2958,9 +3182,9 @@ static int t_show(struct seq_file *m, void *v)
2958 3182
2959 if (iter->flags & FTRACE_ITER_PRINTALL) { 3183 if (iter->flags & FTRACE_ITER_PRINTALL) {
2960 if (iter->flags & FTRACE_ITER_NOTRACE) 3184 if (iter->flags & FTRACE_ITER_NOTRACE)
2961 seq_printf(m, "#### no functions disabled ####\n"); 3185 seq_puts(m, "#### no functions disabled ####\n");
2962 else 3186 else
2963 seq_printf(m, "#### all functions enabled ####\n"); 3187 seq_puts(m, "#### all functions enabled ####\n");
2964 return 0; 3188 return 0;
2965 } 3189 }
2966 3190
@@ -2971,22 +3195,25 @@ static int t_show(struct seq_file *m, void *v)
2971 3195
2972 seq_printf(m, "%ps", (void *)rec->ip); 3196 seq_printf(m, "%ps", (void *)rec->ip);
2973 if (iter->flags & FTRACE_ITER_ENABLED) { 3197 if (iter->flags & FTRACE_ITER_ENABLED) {
2974 seq_printf(m, " (%ld)%s", 3198 struct ftrace_ops *ops = NULL;
3199
3200 seq_printf(m, " (%ld)%s%s",
2975 ftrace_rec_count(rec), 3201 ftrace_rec_count(rec),
2976 rec->flags & FTRACE_FL_REGS ? " R" : " "); 3202 rec->flags & FTRACE_FL_REGS ? " R" : " ",
3203 rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
2977 if (rec->flags & FTRACE_FL_TRAMP_EN) { 3204 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2978 struct ftrace_ops *ops;
2979
2980 ops = ftrace_find_tramp_ops_any(rec); 3205 ops = ftrace_find_tramp_ops_any(rec);
2981 if (ops) 3206 if (ops)
2982 seq_printf(m, "\ttramp: %pS", 3207 seq_printf(m, "\ttramp: %pS",
2983 (void *)ops->trampoline); 3208 (void *)ops->trampoline);
2984 else 3209 else
2985 seq_printf(m, "\ttramp: ERROR!"); 3210 seq_puts(m, "\ttramp: ERROR!");
3211
2986 } 3212 }
3213 add_trampoline_func(m, ops, rec);
2987 } 3214 }
2988 3215
2989 seq_printf(m, "\n"); 3216 seq_putc(m, '\n');
2990 3217
2991 return 0; 3218 return 0;
2992} 3219}
@@ -3020,9 +3247,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
3020{ 3247{
3021 struct ftrace_iterator *iter; 3248 struct ftrace_iterator *iter;
3022 3249
3023 if (unlikely(ftrace_disabled))
3024 return -ENODEV;
3025
3026 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 3250 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
3027 if (iter) { 3251 if (iter) {
3028 iter->pg = ftrace_pages_start; 3252 iter->pg = ftrace_pages_start;
@@ -3357,7 +3581,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
3357 3581
3358static int ftrace_probe_registered; 3582static int ftrace_probe_registered;
3359 3583
3360static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) 3584static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
3361{ 3585{
3362 int ret; 3586 int ret;
3363 int i; 3587 int i;
@@ -3415,6 +3639,7 @@ int
3415register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3639register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3416 void *data) 3640 void *data)
3417{ 3641{
3642 struct ftrace_ops_hash old_hash_ops;
3418 struct ftrace_func_probe *entry; 3643 struct ftrace_func_probe *entry;
3419 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 3644 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3420 struct ftrace_hash *old_hash = *orig_hash; 3645 struct ftrace_hash *old_hash = *orig_hash;
@@ -3436,6 +3661,10 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3436 3661
3437 mutex_lock(&trace_probe_ops.func_hash->regex_lock); 3662 mutex_lock(&trace_probe_ops.func_hash->regex_lock);
3438 3663
3664 old_hash_ops.filter_hash = old_hash;
3665 /* Probes only have filters */
3666 old_hash_ops.notrace_hash = NULL;
3667
3439 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); 3668 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3440 if (!hash) { 3669 if (!hash) {
3441 count = -ENOMEM; 3670 count = -ENOMEM;
@@ -3496,7 +3725,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3496 3725
3497 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3726 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3498 3727
3499 __enable_ftrace_function_probe(old_hash); 3728 __enable_ftrace_function_probe(&old_hash_ops);
3500 3729
3501 if (!ret) 3730 if (!ret)
3502 free_ftrace_hash_rcu(old_hash); 3731 free_ftrace_hash_rcu(old_hash);
@@ -3784,10 +4013,34 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3784} 4013}
3785 4014
3786static void ftrace_ops_update_code(struct ftrace_ops *ops, 4015static void ftrace_ops_update_code(struct ftrace_ops *ops,
3787 struct ftrace_hash *old_hash) 4016 struct ftrace_ops_hash *old_hash)
3788{ 4017{
3789 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) 4018 struct ftrace_ops *op;
4019
4020 if (!ftrace_enabled)
4021 return;
4022
4023 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
3790 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); 4024 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
4025 return;
4026 }
4027
4028 /*
4029 * If this is the shared global_ops filter, then we need to
 4030 * check whether another ops that shares it is enabled.
4031 * If so, we still need to run the modify code.
4032 */
4033 if (ops->func_hash != &global_ops.local_hash)
4034 return;
4035
4036 do_for_each_ftrace_op(op, ftrace_ops_list) {
4037 if (op->func_hash == &global_ops.local_hash &&
4038 op->flags & FTRACE_OPS_FL_ENABLED) {
4039 ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
4040 /* Only need to do this once */
4041 return;
4042 }
4043 } while_for_each_ftrace_op(op);
3791} 4044}
3792 4045
3793static int 4046static int
@@ -3795,6 +4048,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3795 unsigned long ip, int remove, int reset, int enable) 4048 unsigned long ip, int remove, int reset, int enable)
3796{ 4049{
3797 struct ftrace_hash **orig_hash; 4050 struct ftrace_hash **orig_hash;
4051 struct ftrace_ops_hash old_hash_ops;
3798 struct ftrace_hash *old_hash; 4052 struct ftrace_hash *old_hash;
3799 struct ftrace_hash *hash; 4053 struct ftrace_hash *hash;
3800 int ret; 4054 int ret;
@@ -3831,9 +4085,11 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3831 4085
3832 mutex_lock(&ftrace_lock); 4086 mutex_lock(&ftrace_lock);
3833 old_hash = *orig_hash; 4087 old_hash = *orig_hash;
4088 old_hash_ops.filter_hash = ops->func_hash->filter_hash;
4089 old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
3834 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 4090 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3835 if (!ret) { 4091 if (!ret) {
3836 ftrace_ops_update_code(ops, old_hash); 4092 ftrace_ops_update_code(ops, &old_hash_ops);
3837 free_ftrace_hash_rcu(old_hash); 4093 free_ftrace_hash_rcu(old_hash);
3838 } 4094 }
3839 mutex_unlock(&ftrace_lock); 4095 mutex_unlock(&ftrace_lock);
@@ -3975,6 +4231,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3975static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 4231static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3976static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); 4232static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3977 4233
4234static unsigned long save_global_trampoline;
4235static unsigned long save_global_flags;
4236
3978static int __init set_graph_function(char *str) 4237static int __init set_graph_function(char *str)
3979{ 4238{
3980 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 4239 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -4042,6 +4301,7 @@ static void __init set_ftrace_early_filters(void)
4042int ftrace_regex_release(struct inode *inode, struct file *file) 4301int ftrace_regex_release(struct inode *inode, struct file *file)
4043{ 4302{
4044 struct seq_file *m = (struct seq_file *)file->private_data; 4303 struct seq_file *m = (struct seq_file *)file->private_data;
4304 struct ftrace_ops_hash old_hash_ops;
4045 struct ftrace_iterator *iter; 4305 struct ftrace_iterator *iter;
4046 struct ftrace_hash **orig_hash; 4306 struct ftrace_hash **orig_hash;
4047 struct ftrace_hash *old_hash; 4307 struct ftrace_hash *old_hash;
@@ -4075,10 +4335,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4075 4335
4076 mutex_lock(&ftrace_lock); 4336 mutex_lock(&ftrace_lock);
4077 old_hash = *orig_hash; 4337 old_hash = *orig_hash;
4338 old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash;
4339 old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
4078 ret = ftrace_hash_move(iter->ops, filter_hash, 4340 ret = ftrace_hash_move(iter->ops, filter_hash,
4079 orig_hash, iter->hash); 4341 orig_hash, iter->hash);
4080 if (!ret) { 4342 if (!ret) {
4081 ftrace_ops_update_code(iter->ops, old_hash); 4343 ftrace_ops_update_code(iter->ops, &old_hash_ops);
4082 free_ftrace_hash_rcu(old_hash); 4344 free_ftrace_hash_rcu(old_hash);
4083 } 4345 }
4084 mutex_unlock(&ftrace_lock); 4346 mutex_unlock(&ftrace_lock);
@@ -4183,9 +4445,9 @@ static int g_show(struct seq_file *m, void *v)
4183 struct ftrace_graph_data *fgd = m->private; 4445 struct ftrace_graph_data *fgd = m->private;
4184 4446
4185 if (fgd->table == ftrace_graph_funcs) 4447 if (fgd->table == ftrace_graph_funcs)
4186 seq_printf(m, "#### all functions enabled ####\n"); 4448 seq_puts(m, "#### all functions enabled ####\n");
4187 else 4449 else
4188 seq_printf(m, "#### no functions disabled ####\n"); 4450 seq_puts(m, "#### no functions disabled ####\n");
4189 return 0; 4451 return 0;
4190 } 4452 }
4191 4453
@@ -4696,6 +4958,32 @@ void __init ftrace_init(void)
4696 ftrace_disabled = 1; 4958 ftrace_disabled = 1;
4697} 4959}
4698 4960
4961/* Do nothing if arch does not support this */
4962void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
4963{
4964}
4965
4966static void ftrace_update_trampoline(struct ftrace_ops *ops)
4967{
4968
4969/*
4970 * Currently there's no safe way to free a trampoline when the kernel
4971 * is configured with PREEMPT. That is because a task could be preempted
4972 * when it jumped to the trampoline, it may be preempted for a long time
4973 * depending on the system load, and currently there's no way to know
4974 * when it will be off the trampoline. If the trampoline is freed
4975 * too early, when the task runs again, it will be executing on freed
4976 * memory and crash.
4977 */
4978#ifdef CONFIG_PREEMPT
4979 /* Currently, only non dynamic ops can have a trampoline */
4980 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
4981 return;
4982#endif
4983
4984 arch_ftrace_update_trampoline(ops);
4985}
4986
4699#else 4987#else
4700 4988
4701static struct ftrace_ops global_ops = { 4989static struct ftrace_ops global_ops = {
@@ -4738,6 +5026,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4738 return 1; 5026 return 1;
4739} 5027}
4740 5028
5029static void ftrace_update_trampoline(struct ftrace_ops *ops)
5030{
5031}
5032
4741#endif /* CONFIG_DYNAMIC_FTRACE */ 5033#endif /* CONFIG_DYNAMIC_FTRACE */
4742 5034
4743__init void ftrace_init_global_array_ops(struct trace_array *tr) 5035__init void ftrace_init_global_array_ops(struct trace_array *tr)
@@ -5075,12 +5367,12 @@ static int fpid_show(struct seq_file *m, void *v)
5075 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); 5367 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
5076 5368
5077 if (v == (void *)1) { 5369 if (v == (void *)1) {
5078 seq_printf(m, "no pid\n"); 5370 seq_puts(m, "no pid\n");
5079 return 0; 5371 return 0;
5080 } 5372 }
5081 5373
5082 if (fpid->pid == ftrace_swapper_pid) 5374 if (fpid->pid == ftrace_swapper_pid)
5083 seq_printf(m, "swapper tasks\n"); 5375 seq_puts(m, "swapper tasks\n");
5084 else 5376 else
5085 seq_printf(m, "%u\n", pid_vnr(fpid->pid)); 5377 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
5086 5378
@@ -5293,6 +5585,7 @@ static struct ftrace_ops graph_ops = {
5293 FTRACE_OPS_FL_STUB, 5585 FTRACE_OPS_FL_STUB,
5294#ifdef FTRACE_GRAPH_TRAMP_ADDR 5586#ifdef FTRACE_GRAPH_TRAMP_ADDR
5295 .trampoline = FTRACE_GRAPH_TRAMP_ADDR, 5587 .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
5588 /* trampoline_size is only needed for dynamically allocated tramps */
5296#endif 5589#endif
5297 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) 5590 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
5298}; 5591};
@@ -5522,7 +5815,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5522 update_function_graph_func(); 5815 update_function_graph_func();
5523 5816
5524 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); 5817 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
5525
5526out: 5818out:
5527 mutex_unlock(&ftrace_lock); 5819 mutex_unlock(&ftrace_lock);
5528 return ret; 5820 return ret;
@@ -5543,6 +5835,17 @@ void unregister_ftrace_graph(void)
5543 unregister_pm_notifier(&ftrace_suspend_notifier); 5835 unregister_pm_notifier(&ftrace_suspend_notifier);
5544 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5836 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5545 5837
5838#ifdef CONFIG_DYNAMIC_FTRACE
5839 /*
5840 * Function graph does not allocate the trampoline, but
5841 * other global_ops do. We need to reset the ALLOC_TRAMP flag
5842 * if one was used.
5843 */
5844 global_ops.trampoline = save_global_trampoline;
5845 if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
5846 global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
5847#endif
5848
5546 out: 5849 out:
5547 mutex_unlock(&ftrace_lock); 5850 mutex_unlock(&ftrace_lock);
5548} 5851}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a56e07c8d15b..7a4104cb95cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work);
34 */ 34 */
35int ring_buffer_print_entry_header(struct trace_seq *s) 35int ring_buffer_print_entry_header(struct trace_seq *s)
36{ 36{
37 int ret; 37 trace_seq_puts(s, "# compressed entry header\n");
38 38 trace_seq_puts(s, "\ttype_len : 5 bits\n");
39 ret = trace_seq_puts(s, "# compressed entry header\n"); 39 trace_seq_puts(s, "\ttime_delta : 27 bits\n");
40 ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); 40 trace_seq_puts(s, "\tarray : 32 bits\n");
41 ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); 41 trace_seq_putc(s, '\n');
42 ret = trace_seq_puts(s, "\tarray : 32 bits\n"); 42 trace_seq_printf(s, "\tpadding : type == %d\n",
43 ret = trace_seq_putc(s, '\n'); 43 RINGBUF_TYPE_PADDING);
44 ret = trace_seq_printf(s, "\tpadding : type == %d\n", 44 trace_seq_printf(s, "\ttime_extend : type == %d\n",
45 RINGBUF_TYPE_PADDING); 45 RINGBUF_TYPE_TIME_EXTEND);
46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", 46 trace_seq_printf(s, "\tdata max type_len == %d\n",
47 RINGBUF_TYPE_TIME_EXTEND); 47 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
48 ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
49 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
50 48
51 return ret; 49 return !trace_seq_has_overflowed(s);
52} 50}
53 51
54/* 52/*
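
This rewrite is the trace_seq convention the rest of the series converts to: the trace_seq_*() writers are treated as unconditional appends, and the caller asks trace_seq_has_overflowed() once at the end instead of checking every return value. A minimal sketch of a caller written in that style (the function name and header text are hypothetical):

    #include <linux/trace_seq.h>

    static int print_my_header(struct trace_seq *s)
    {
            /* append unconditionally; overflow is tracked inside the seq */
            trace_seq_puts(s, "# my header\n");
            trace_seq_printf(s, "\tversion : %d\n", 2);

            /* one check at the end replaces per-call return handling */
            return !trace_seq_has_overflowed(s);
    }
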
@@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta)
419int ring_buffer_print_page_header(struct trace_seq *s) 417int ring_buffer_print_page_header(struct trace_seq *s)
420{ 418{
421 struct buffer_data_page field; 419 struct buffer_data_page field;
422 int ret;
423
424 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
425 "offset:0;\tsize:%u;\tsigned:%u;\n",
426 (unsigned int)sizeof(field.time_stamp),
427 (unsigned int)is_signed_type(u64));
428
429 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
430 "offset:%u;\tsize:%u;\tsigned:%u;\n",
431 (unsigned int)offsetof(typeof(field), commit),
432 (unsigned int)sizeof(field.commit),
433 (unsigned int)is_signed_type(long));
434
435 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
436 "offset:%u;\tsize:%u;\tsigned:%u;\n",
437 (unsigned int)offsetof(typeof(field), commit),
438 1,
439 (unsigned int)is_signed_type(long));
440
441 ret = trace_seq_printf(s, "\tfield: char data;\t"
442 "offset:%u;\tsize:%u;\tsigned:%u;\n",
443 (unsigned int)offsetof(typeof(field), data),
444 (unsigned int)BUF_PAGE_SIZE,
445 (unsigned int)is_signed_type(char));
446 420
447 return ret; 421 trace_seq_printf(s, "\tfield: u64 timestamp;\t"
422 "offset:0;\tsize:%u;\tsigned:%u;\n",
423 (unsigned int)sizeof(field.time_stamp),
424 (unsigned int)is_signed_type(u64));
425
426 trace_seq_printf(s, "\tfield: local_t commit;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 (unsigned int)sizeof(field.commit),
430 (unsigned int)is_signed_type(long));
431
432 trace_seq_printf(s, "\tfield: int overwrite;\t"
433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
434 (unsigned int)offsetof(typeof(field), commit),
435 1,
436 (unsigned int)is_signed_type(long));
437
438 trace_seq_printf(s, "\tfield: char data;\t"
439 "offset:%u;\tsize:%u;\tsigned:%u;\n",
440 (unsigned int)offsetof(typeof(field), data),
441 (unsigned int)BUF_PAGE_SIZE,
442 (unsigned int)is_signed_type(char));
443
444 return !trace_seq_has_overflowed(s);
448} 445}
449 446
450struct rb_irq_work { 447struct rb_irq_work {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 92f4a6cee172..4a9079b9f082 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running;
63 */ 63 */
64bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
65 65
66/* Pipe tracepoints to printk */
67struct trace_iterator *tracepoint_print_iter;
68int tracepoint_printk;
69
66/* For tracers that don't implement custom flags */ 70/* For tracers that don't implement custom flags */
67static struct tracer_opt dummy_tracer_opt[] = { 71static struct tracer_opt dummy_tracer_opt[] = {
68 { } 72 { }
@@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
155 159
156static int __init stop_trace_on_warning(char *str) 160static int __init stop_trace_on_warning(char *str)
157{ 161{
158 __disable_trace_on_warning = 1; 162 if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
163 __disable_trace_on_warning = 1;
159 return 1; 164 return 1;
160} 165}
161__setup("traceoff_on_warning=", stop_trace_on_warning); 166__setup("traceoff_on_warning", stop_trace_on_warning);
162 167
163static int __init boot_alloc_snapshot(char *str) 168static int __init boot_alloc_snapshot(char *str)
164{ 169{
@@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str)
192} 197}
193__setup("trace_clock=", set_trace_boot_clock); 198__setup("trace_clock=", set_trace_boot_clock);
194 199
200static int __init set_tracepoint_printk(char *str)
201{
202 if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
203 tracepoint_printk = 1;
204 return 1;
205}
206__setup("tp_printk", set_tracepoint_printk);
195 207
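
Given the parsing in stop_trace_on_warning() and set_tracepoint_printk() above, both switches are meant to be passed bare on the kernel command line, with an explicit "=0" or "=off" accepted to leave them disabled; that is also why the __setup() key for traceoff_on_warning drops its trailing '='. For illustration, a boot line enabling both, assuming the usual trace_event= list is what picks the events that tp_printk will echo to the console:

    traceoff_on_warning tp_printk trace_event=sched:sched_switch
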
196unsigned long long ns2usecs(cycle_t nsec) 208unsigned long long ns2usecs(cycle_t nsec)
197{ 209{
@@ -938,19 +950,20 @@ out:
938 return ret; 950 return ret;
939} 951}
940 952
953/* TODO add a seq_buf_to_buffer() */
941static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 954static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
942{ 955{
943 int len; 956 int len;
944 957
945 if (s->len <= s->readpos) 958 if (trace_seq_used(s) <= s->seq.readpos)
946 return -EBUSY; 959 return -EBUSY;
947 960
948 len = s->len - s->readpos; 961 len = trace_seq_used(s) - s->seq.readpos;
949 if (cnt > len) 962 if (cnt > len)
950 cnt = len; 963 cnt = len;
951 memcpy(buf, s->buffer + s->readpos, cnt); 964 memcpy(buf, s->buffer + s->seq.readpos, cnt);
952 965
953 s->readpos += cnt; 966 s->seq.readpos += cnt;
954 return cnt; 967 return cnt;
955} 968}
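
The TODO above asks for the same copy-out helper one layer down, operating on the seq_buf itself. A hypothetical sketch of what such a seq_buf_to_buffer() could look like, assuming struct seq_buf carries the buffer, len and readpos fields that the code above reaches through s->buffer and s->seq:

    #include <linux/errno.h>
    #include <linux/seq_buf.h>
    #include <linux/string.h>

    /* Hypothetical: trace_seq_to_buffer() pushed down to the seq_buf level. */
    static ssize_t seq_buf_to_buffer(struct seq_buf *s, void *buf, size_t cnt)
    {
            size_t len;

            if (s->len <= s->readpos)
                    return -EBUSY;

            len = s->len - s->readpos;
            if (cnt > len)
                    cnt = len;
            memcpy(buf, s->buffer + s->readpos, cnt);

            s->readpos += cnt;
            return cnt;
    }
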
956 969
@@ -2029,7 +2042,7 @@ void trace_printk_init_buffers(void)
2029 pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); 2042 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
2030 pr_warning("** **\n"); 2043 pr_warning("** **\n");
2031 pr_warning("** This means that this is a DEBUG kernel and it is **\n"); 2044 pr_warning("** This means that this is a DEBUG kernel and it is **\n");
2032 pr_warning("** unsafe for produciton use. **\n"); 2045 pr_warning("** unsafe for production use. **\n");
2033 pr_warning("** **\n"); 2046 pr_warning("** **\n");
2034 pr_warning("** If you see this message and you are not debugging **\n"); 2047 pr_warning("** If you see this message and you are not debugging **\n");
2035 pr_warning("** the kernel, report this immediately to your vendor! **\n"); 2048 pr_warning("** the kernel, report this immediately to your vendor! **\n");
@@ -2158,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2158 goto out; 2171 goto out;
2159 } 2172 }
2160 2173
2161 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); 2174 len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
2162 if (len > TRACE_BUF_SIZE)
2163 goto out;
2164 2175
2165 local_save_flags(flags); 2176 local_save_flags(flags);
2166 size = sizeof(*entry) + len + 1; 2177 size = sizeof(*entry) + len + 1;
@@ -2171,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2171 entry = ring_buffer_event_data(event); 2182 entry = ring_buffer_event_data(event);
2172 entry->ip = ip; 2183 entry->ip = ip;
2173 2184
2174 memcpy(&entry->buf, tbuffer, len); 2185 memcpy(&entry->buf, tbuffer, len + 1);
2175 entry->buf[len] = '\0';
2176 if (!call_filter_check_discard(call, entry, buffer, event)) { 2186 if (!call_filter_check_discard(call, entry, buffer, event)) {
2177 __buffer_unlock_commit(buffer, event); 2187 __buffer_unlock_commit(buffer, event);
2178 ftrace_trace_stack(buffer, flags, 6, pc); 2188 ftrace_trace_stack(buffer, flags, 6, pc);
@@ -2509,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf,
2509 2519
2510static void print_lat_help_header(struct seq_file *m) 2520static void print_lat_help_header(struct seq_file *m)
2511{ 2521{
2512 seq_puts(m, "# _------=> CPU# \n"); 2522 seq_puts(m, "# _------=> CPU# \n"
2513 seq_puts(m, "# / _-----=> irqs-off \n"); 2523 "# / _-----=> irqs-off \n"
2514 seq_puts(m, "# | / _----=> need-resched \n"); 2524 "# | / _----=> need-resched \n"
2515 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 2525 "# || / _---=> hardirq/softirq \n"
2516 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 2526 "# ||| / _--=> preempt-depth \n"
2517 seq_puts(m, "# |||| / delay \n"); 2527 "# |||| / delay \n"
2518 seq_puts(m, "# cmd pid ||||| time | caller \n"); 2528 "# cmd pid ||||| time | caller \n"
2519 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2529 "# \\ / ||||| \\ | / \n");
2520} 2530}
2521 2531
2522static void print_event_info(struct trace_buffer *buf, struct seq_file *m) 2532static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2533,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2533static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) 2543static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2534{ 2544{
2535 print_event_info(buf, m); 2545 print_event_info(buf, m);
2536 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2546 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
2537 seq_puts(m, "# | | | | |\n"); 2547 "# | | | | |\n");
2538} 2548}
2539 2549
2540static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) 2550static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2541{ 2551{
2542 print_event_info(buf, m); 2552 print_event_info(buf, m);
2543 seq_puts(m, "# _-----=> irqs-off\n"); 2553 seq_puts(m, "# _-----=> irqs-off\n"
2544 seq_puts(m, "# / _----=> need-resched\n"); 2554 "# / _----=> need-resched\n"
2545 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2555 "# | / _---=> hardirq/softirq\n"
2546 seq_puts(m, "# || / _--=> preempt-depth\n"); 2556 "# || / _--=> preempt-depth\n"
2547 seq_puts(m, "# ||| / delay\n"); 2557 "# ||| / delay\n"
2548 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); 2558 "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
2549 seq_puts(m, "# | | | |||| | |\n"); 2559 "# | | | |||| | |\n");
2550} 2560}
2551 2561
2552void 2562void
@@ -2649,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
2649 event = ftrace_find_event(entry->type); 2659 event = ftrace_find_event(entry->type);
2650 2660
2651 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2661 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2652 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2662 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2653 if (!trace_print_lat_context(iter)) 2663 trace_print_lat_context(iter);
2654 goto partial; 2664 else
2655 } else { 2665 trace_print_context(iter);
2656 if (!trace_print_context(iter))
2657 goto partial;
2658 }
2659 } 2666 }
2660 2667
2668 if (trace_seq_has_overflowed(s))
2669 return TRACE_TYPE_PARTIAL_LINE;
2670
2661 if (event) 2671 if (event)
2662 return event->funcs->trace(iter, sym_flags, event); 2672 return event->funcs->trace(iter, sym_flags, event);
2663 2673
2664 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 2674 trace_seq_printf(s, "Unknown type %d\n", entry->type);
2665 goto partial;
2666 2675
2667 return TRACE_TYPE_HANDLED; 2676 return trace_handle_return(s);
2668partial:
2669 return TRACE_TYPE_PARTIAL_LINE;
2670} 2677}
2671 2678
2672static enum print_line_t print_raw_fmt(struct trace_iterator *iter) 2679static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -2677,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
2677 2684
2678 entry = iter->ent; 2685 entry = iter->ent;
2679 2686
2680 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2687 if (trace_flags & TRACE_ITER_CONTEXT_INFO)
2681 if (!trace_seq_printf(s, "%d %d %llu ", 2688 trace_seq_printf(s, "%d %d %llu ",
2682 entry->pid, iter->cpu, iter->ts)) 2689 entry->pid, iter->cpu, iter->ts);
2683 goto partial; 2690
2684 } 2691 if (trace_seq_has_overflowed(s))
2692 return TRACE_TYPE_PARTIAL_LINE;
2685 2693
2686 event = ftrace_find_event(entry->type); 2694 event = ftrace_find_event(entry->type);
2687 if (event) 2695 if (event)
2688 return event->funcs->raw(iter, 0, event); 2696 return event->funcs->raw(iter, 0, event);
2689 2697
2690 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 2698 trace_seq_printf(s, "%d ?\n", entry->type);
2691 goto partial;
2692 2699
2693 return TRACE_TYPE_HANDLED; 2700 return trace_handle_return(s);
2694partial:
2695 return TRACE_TYPE_PARTIAL_LINE;
2696} 2701}
2697 2702
2698static enum print_line_t print_hex_fmt(struct trace_iterator *iter) 2703static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -2705,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2705 entry = iter->ent; 2710 entry = iter->ent;
2706 2711
2707 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2712 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2708 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 2713 SEQ_PUT_HEX_FIELD(s, entry->pid);
2709 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); 2714 SEQ_PUT_HEX_FIELD(s, iter->cpu);
2710 SEQ_PUT_HEX_FIELD_RET(s, iter->ts); 2715 SEQ_PUT_HEX_FIELD(s, iter->ts);
2716 if (trace_seq_has_overflowed(s))
2717 return TRACE_TYPE_PARTIAL_LINE;
2711 } 2718 }
2712 2719
2713 event = ftrace_find_event(entry->type); 2720 event = ftrace_find_event(entry->type);
@@ -2717,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2717 return ret; 2724 return ret;
2718 } 2725 }
2719 2726
2720 SEQ_PUT_FIELD_RET(s, newline); 2727 SEQ_PUT_FIELD(s, newline);
2721 2728
2722 return TRACE_TYPE_HANDLED; 2729 return trace_handle_return(s);
2723} 2730}
2724 2731
2725static enum print_line_t print_bin_fmt(struct trace_iterator *iter) 2732static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
@@ -2731,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2731 entry = iter->ent; 2738 entry = iter->ent;
2732 2739
2733 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2740 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2734 SEQ_PUT_FIELD_RET(s, entry->pid); 2741 SEQ_PUT_FIELD(s, entry->pid);
2735 SEQ_PUT_FIELD_RET(s, iter->cpu); 2742 SEQ_PUT_FIELD(s, iter->cpu);
2736 SEQ_PUT_FIELD_RET(s, iter->ts); 2743 SEQ_PUT_FIELD(s, iter->ts);
2744 if (trace_seq_has_overflowed(s))
2745 return TRACE_TYPE_PARTIAL_LINE;
2737 } 2746 }
2738 2747
2739 event = ftrace_find_event(entry->type); 2748 event = ftrace_find_event(entry->type);
@@ -2779,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2779{ 2788{
2780 enum print_line_t ret; 2789 enum print_line_t ret;
2781 2790
2782 if (iter->lost_events && 2791 if (iter->lost_events) {
2783 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2792 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2784 iter->cpu, iter->lost_events)) 2793 iter->cpu, iter->lost_events);
2785 return TRACE_TYPE_PARTIAL_LINE; 2794 if (trace_seq_has_overflowed(&iter->seq))
2795 return TRACE_TYPE_PARTIAL_LINE;
2796 }
2786 2797
2787 if (iter->trace && iter->trace->print_line) { 2798 if (iter->trace && iter->trace->print_line) {
2788 ret = iter->trace->print_line(iter); 2799 ret = iter->trace->print_line(iter);
@@ -2860,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m)
2860{ 2871{
2861 if (!ftrace_is_dead()) 2872 if (!ftrace_is_dead())
2862 return; 2873 return;
2863 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); 2874 seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
2864 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2875 "# MAY BE MISSING FUNCTION EVENTS\n");
2865} 2876}
2866 2877
2867#ifdef CONFIG_TRACER_MAX_TRACE 2878#ifdef CONFIG_TRACER_MAX_TRACE
2868static void show_snapshot_main_help(struct seq_file *m) 2879static void show_snapshot_main_help(struct seq_file *m)
2869{ 2880{
2870 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2881 seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
2871 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2882 "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2872 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2883 "# Takes a snapshot of the main buffer.\n"
2873 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); 2884 "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
2874 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2885 "# (Doesn't have to be '2' works with any number that\n"
2875 seq_printf(m, "# is not a '0' or '1')\n"); 2886 "# is not a '0' or '1')\n");
2876} 2887}
2877 2888
2878static void show_snapshot_percpu_help(struct seq_file *m) 2889static void show_snapshot_percpu_help(struct seq_file *m)
2879{ 2890{
2880 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); 2891 seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2881#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP 2892#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2882 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2893 seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2883 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); 2894 "# Takes a snapshot of the main buffer for this cpu.\n");
2884#else 2895#else
2885 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); 2896 seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
2886 seq_printf(m, "# Must use main snapshot file to allocate.\n"); 2897 "# Must use main snapshot file to allocate.\n");
2887#endif 2898#endif
2888 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); 2899 seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
2889 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2900 "# (Doesn't have to be '2' works with any number that\n"
2890 seq_printf(m, "# is not a '0' or '1')\n"); 2901 "# is not a '0' or '1')\n");
2891} 2902}
2892 2903
2893static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) 2904static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2894{ 2905{
2895 if (iter->tr->allocated_snapshot) 2906 if (iter->tr->allocated_snapshot)
2896 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); 2907 seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
2897 else 2908 else
2898 seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); 2909 seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
2899 2910
2900 seq_printf(m, "# Snapshot commands:\n"); 2911 seq_puts(m, "# Snapshot commands:\n");
2901 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) 2912 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2902 show_snapshot_main_help(m); 2913 show_snapshot_main_help(m);
2903 else 2914 else
@@ -3251,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v)
3251 if (!t) 3262 if (!t)
3252 return 0; 3263 return 0;
3253 3264
3254 seq_printf(m, "%s", t->name); 3265 seq_puts(m, t->name);
3255 if (t->next) 3266 if (t->next)
3256 seq_putc(m, ' '); 3267 seq_putc(m, ' ');
3257 else 3268 else
@@ -4314,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4314 goto out; 4325 goto out;
4315 } 4326 }
4316 4327
4328 trace_seq_init(&iter->seq);
4329
4317 /* 4330 /*
4318 * We make a copy of the current tracer to avoid concurrent 4331 * We make a copy of the current tracer to avoid concurrent
4319 * changes on it while we are reading. 4332 * changes on it while we are reading.
@@ -4507,18 +4520,18 @@ waitagain:
4507 trace_access_lock(iter->cpu_file); 4520 trace_access_lock(iter->cpu_file);
4508 while (trace_find_next_entry_inc(iter) != NULL) { 4521 while (trace_find_next_entry_inc(iter) != NULL) {
4509 enum print_line_t ret; 4522 enum print_line_t ret;
4510 int len = iter->seq.len; 4523 int save_len = iter->seq.seq.len;
4511 4524
4512 ret = print_trace_line(iter); 4525 ret = print_trace_line(iter);
4513 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4526 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4514 /* don't print partial lines */ 4527 /* don't print partial lines */
4515 iter->seq.len = len; 4528 iter->seq.seq.len = save_len;
4516 break; 4529 break;
4517 } 4530 }
4518 if (ret != TRACE_TYPE_NO_CONSUME) 4531 if (ret != TRACE_TYPE_NO_CONSUME)
4519 trace_consume(iter); 4532 trace_consume(iter);
4520 4533
4521 if (iter->seq.len >= cnt) 4534 if (trace_seq_used(&iter->seq) >= cnt)
4522 break; 4535 break;
4523 4536
4524 /* 4537 /*
@@ -4534,7 +4547,7 @@ waitagain:
4534 4547
4535 /* Now copy what we have to the user */ 4548 /* Now copy what we have to the user */
4536 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 4549 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
4537 if (iter->seq.readpos >= iter->seq.len) 4550 if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
4538 trace_seq_init(&iter->seq); 4551 trace_seq_init(&iter->seq);
4539 4552
4540 /* 4553 /*
@@ -4568,20 +4581,33 @@ static size_t
4568tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) 4581tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
4569{ 4582{
4570 size_t count; 4583 size_t count;
4584 int save_len;
4571 int ret; 4585 int ret;
4572 4586
4573 /* Seq buffer is page-sized, exactly what we need. */ 4587 /* Seq buffer is page-sized, exactly what we need. */
4574 for (;;) { 4588 for (;;) {
4575 count = iter->seq.len; 4589 save_len = iter->seq.seq.len;
4576 ret = print_trace_line(iter); 4590 ret = print_trace_line(iter);
4577 count = iter->seq.len - count; 4591
4578 if (rem < count) { 4592 if (trace_seq_has_overflowed(&iter->seq)) {
4579 rem = 0; 4593 iter->seq.seq.len = save_len;
4580 iter->seq.len -= count;
4581 break; 4594 break;
4582 } 4595 }
4596
4597 /*
4598 * This should not be hit, because it should only
4599 * be set if the iter->seq overflowed. But check it
4600 * anyway to be safe.
4601 */
4583 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4602 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4584 iter->seq.len -= count; 4603 iter->seq.seq.len = save_len;
4604 break;
4605 }
4606
4607 count = trace_seq_used(&iter->seq) - save_len;
4608 if (rem < count) {
4609 rem = 0;
4610 iter->seq.seq.len = save_len;
4585 break; 4611 break;
4586 } 4612 }
4587 4613
@@ -4662,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4662 /* Copy the data into the page, so we can start over. */ 4688 /* Copy the data into the page, so we can start over. */
4663 ret = trace_seq_to_buffer(&iter->seq, 4689 ret = trace_seq_to_buffer(&iter->seq,
4664 page_address(spd.pages[i]), 4690 page_address(spd.pages[i]),
4665 iter->seq.len); 4691 trace_seq_used(&iter->seq));
4666 if (ret < 0) { 4692 if (ret < 0) {
4667 __free_page(spd.pages[i]); 4693 __free_page(spd.pages[i]);
4668 break; 4694 break;
4669 } 4695 }
4670 spd.partial[i].offset = 0; 4696 spd.partial[i].offset = 0;
4671 spd.partial[i].len = iter->seq.len; 4697 spd.partial[i].len = trace_seq_used(&iter->seq);
4672 4698
4673 trace_seq_init(&iter->seq); 4699 trace_seq_init(&iter->seq);
4674 } 4700 }
@@ -5668,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5668 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); 5694 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
5669 trace_seq_printf(s, "read events: %ld\n", cnt); 5695 trace_seq_printf(s, "read events: %ld\n", cnt);
5670 5696
5671 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5697 count = simple_read_from_buffer(ubuf, count, ppos,
5698 s->buffer, trace_seq_used(s));
5672 5699
5673 kfree(s); 5700 kfree(s);
5674 5701
@@ -5749,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5749 5776
5750 seq_printf(m, "%ps:", (void *)ip); 5777 seq_printf(m, "%ps:", (void *)ip);
5751 5778
5752 seq_printf(m, "snapshot"); 5779 seq_puts(m, "snapshot");
5753 5780
5754 if (count == -1) 5781 if (count == -1)
5755 seq_printf(m, ":unlimited\n"); 5782 seq_puts(m, ":unlimited\n");
5756 else 5783 else
5757 seq_printf(m, ":count=%ld\n", count); 5784 seq_printf(m, ":count=%ld\n", count);
5758 5785
@@ -6417,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m
6417 int ret; 6444 int ret;
6418 6445
6419 /* Paranoid: Make sure the parent is the "instances" directory */ 6446 /* Paranoid: Make sure the parent is the "instances" directory */
6420 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6447 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6421 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6448 if (WARN_ON_ONCE(parent != trace_instance_dir))
6422 return -ENOENT; 6449 return -ENOENT;
6423 6450
@@ -6444,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry)
6444 int ret; 6471 int ret;
6445 6472
6446 /* Paranoid: Make sure the parent is the "instances" directory */ 6473 /* Paranoid: Make sure the parent is the "instances" directory */
6447 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6474 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6448 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6475 if (WARN_ON_ONCE(parent != trace_instance_dir))
6449 return -ENOENT; 6476 return -ENOENT;
6450 6477
@@ -6631,11 +6658,19 @@ void
6631trace_printk_seq(struct trace_seq *s) 6658trace_printk_seq(struct trace_seq *s)
6632{ 6659{
6633 /* Probably should print a warning here. */ 6660 /* Probably should print a warning here. */
6634 if (s->len >= TRACE_MAX_PRINT) 6661 if (s->seq.len >= TRACE_MAX_PRINT)
6635 s->len = TRACE_MAX_PRINT; 6662 s->seq.len = TRACE_MAX_PRINT;
6663
6664 /*
6665 * More paranoid code. Although the buffer size is set to
6666 * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
6667 * an extra layer of protection.
6668 */
6669 if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
6670 s->seq.len = s->seq.size - 1;
6636 6671
6637 /* should be zero ended, but we are paranoid. */ 6672 /* should be zero ended, but we are paranoid. */
6638 s->buffer[s->len] = 0; 6673 s->buffer[s->seq.len] = 0;
6639 6674
6640 printk(KERN_TRACE "%s", s->buffer); 6675 printk(KERN_TRACE "%s", s->buffer);
6641 6676
@@ -6874,6 +6909,18 @@ out:
6874 return ret; 6909 return ret;
6875} 6910}
6876 6911
6912void __init trace_init(void)
6913{
6914 if (tracepoint_printk) {
6915 tracepoint_print_iter =
6916 kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
6917 if (WARN_ON(!tracepoint_print_iter))
6918 tracepoint_printk = 0;
6919 }
6920 tracer_alloc_buffers();
6921 trace_event_init();
6922}
6923
6877__init static int clear_boot_tracer(void) 6924__init static int clear_boot_tracer(void)
6878{ 6925{
6879 /* 6926 /*
@@ -6893,6 +6940,5 @@ __init static int clear_boot_tracer(void)
6893 return 0; 6940 return 0;
6894} 6941}
6895 6942
6896early_initcall(tracer_alloc_buffers);
6897fs_initcall(tracer_init_debugfs); 6943fs_initcall(tracer_init_debugfs);
6898late_initcall(clear_boot_tracer); 6944late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391fb1d3b..8de48bac1ce2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -14,6 +14,7 @@
14#include <linux/trace_seq.h> 14#include <linux/trace_seq.h>
15#include <linux/ftrace_event.h> 15#include <linux/ftrace_event.h>
16#include <linux/compiler.h> 16#include <linux/compiler.h>
17#include <linux/trace_seq.h>
17 18
18#ifdef CONFIG_FTRACE_SYSCALLS 19#ifdef CONFIG_FTRACE_SYSCALLS
19#include <asm/unistd.h> /* For NR_SYSCALLS */ 20#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
569 570
570void tracing_iter_reset(struct trace_iterator *iter, int cpu); 571void tracing_iter_reset(struct trace_iterator *iter, int cpu);
571 572
572void tracing_sched_switch_trace(struct trace_array *tr,
573 struct task_struct *prev,
574 struct task_struct *next,
575 unsigned long flags, int pc);
576
577void tracing_sched_wakeup_trace(struct trace_array *tr,
578 struct task_struct *wakee,
579 struct task_struct *cur,
580 unsigned long flags, int pc);
581void trace_function(struct trace_array *tr, 573void trace_function(struct trace_array *tr,
582 unsigned long ip, 574 unsigned long ip,
583 unsigned long parent_ip, 575 unsigned long parent_ip,
@@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr);
597 589
598void tracing_start_cmdline_record(void); 590void tracing_start_cmdline_record(void);
599void tracing_stop_cmdline_record(void); 591void tracing_stop_cmdline_record(void);
600void tracing_sched_switch_assign_trace(struct trace_array *tr);
601void tracing_stop_sched_switch_record(void);
602void tracing_start_sched_switch_record(void);
603int register_tracer(struct tracer *type); 592int register_tracer(struct tracer *type);
604int is_tracing_stopped(void); 593int is_tracing_stopped(void);
605 594
@@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
719 708
720extern unsigned long trace_flags; 709extern unsigned long trace_flags;
721 710
711extern char trace_find_mark(unsigned long long duration);
712
722/* Standard output formatting function used for function return traces */ 713/* Standard output formatting function used for function return traces */
723#ifdef CONFIG_FUNCTION_GRAPH_TRACER 714#ifdef CONFIG_FUNCTION_GRAPH_TRACER
724 715
@@ -737,7 +728,7 @@ extern unsigned long trace_flags;
737extern enum print_line_t 728extern enum print_line_t
738print_graph_function_flags(struct trace_iterator *iter, u32 flags); 729print_graph_function_flags(struct trace_iterator *iter, u32 flags);
739extern void print_graph_headers_flags(struct seq_file *s, u32 flags); 730extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
740extern enum print_line_t 731extern void
741trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 732trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
742extern void graph_trace_open(struct trace_iterator *iter); 733extern void graph_trace_open(struct trace_iterator *iter);
743extern void graph_trace_close(struct trace_iterator *iter); 734extern void graph_trace_close(struct trace_iterator *iter);
@@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call,
1310#define perf_ftrace_event_register NULL 1301#define perf_ftrace_event_register NULL
1311#endif 1302#endif
1312 1303
1304#ifdef CONFIG_FTRACE_SYSCALLS
1305void init_ftrace_syscalls(void);
1306#else
1307static inline void init_ftrace_syscalls(void) { }
1308#endif
1309
1310#ifdef CONFIG_EVENT_TRACING
1311void trace_event_init(void);
1312#else
1313static inline void __init trace_event_init(void) { }
1314#endif
1315
1316extern struct trace_iterator *tracepoint_print_iter;
1317
1313#endif /* _LINUX_KERNEL_TRACE_H */ 1318#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9bac8f0..7d6e2afde669 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
151 151
152 trace_assign_type(field, iter->ent); 152 trace_assign_type(field, iter->ent);
153 153
154 if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", 154 trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
155 field->correct ? " ok " : " MISS ", 155 field->correct ? " ok " : " MISS ",
156 field->func, 156 field->func,
157 field->file, 157 field->file,
158 field->line)) 158 field->line);
159 return TRACE_TYPE_PARTIAL_LINE; 159
160 160 return trace_handle_return(&iter->seq);
161 return TRACE_TYPE_HANDLED;
162} 161}
163 162
164static void branch_print_header(struct seq_file *s) 163static void branch_print_header(struct seq_file *s)
165{ 164{
166 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" 165 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
167 " FUNC:FILE:LINE\n"); 166 " FUNC:FILE:LINE\n"
168 seq_puts(s, "# | | | | | " 167 "# | | | | | "
169 " |\n"); 168 " |\n");
170} 169}
171 170
172static struct trace_event_functions trace_branch_funcs = { 171static struct trace_event_functions trace_branch_funcs = {
@@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[];
233 232
234static int annotated_branch_stat_headers(struct seq_file *m) 233static int annotated_branch_stat_headers(struct seq_file *m)
235{ 234{
236 seq_printf(m, " correct incorrect %% "); 235 seq_puts(m, " correct incorrect % "
237 seq_printf(m, " Function " 236 " Function "
238 " File Line\n" 237 " File Line\n"
239 " ------- --------- - " 238 " ------- --------- - "
240 " -------- " 239 " -------- "
241 " ---- ----\n"); 240 " ---- ----\n");
242 return 0; 241 return 0;
243} 242}
244 243
@@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
274 273
275 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); 274 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
276 if (percent < 0) 275 if (percent < 0)
277 seq_printf(m, " X "); 276 seq_puts(m, " X ");
278 else 277 else
279 seq_printf(m, "%3ld ", percent); 278 seq_printf(m, "%3ld ", percent);
280 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); 279 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
@@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[];
362 361
363static int all_branch_stat_headers(struct seq_file *m) 362static int all_branch_stat_headers(struct seq_file *m)
364{ 363{
365 seq_printf(m, " miss hit %% "); 364 seq_puts(m, " miss hit % "
366 seq_printf(m, " Function " 365 " Function "
367 " File Line\n" 366 " File Line\n"
368 " ------- --------- - " 367 " ------- --------- - "
369 " -------- " 368 " -------- "
370 " ---- ----\n"); 369 " ---- ----\n");
371 return 0; 370 return 0;
372} 371}
373 372
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0cc51edde3a8..b03a0ea77b99 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
212} 212}
213EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); 213EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
214 214
215static DEFINE_SPINLOCK(tracepoint_iter_lock);
216
217static void output_printk(struct ftrace_event_buffer *fbuffer)
218{
219 struct ftrace_event_call *event_call;
220 struct trace_event *event;
221 unsigned long flags;
222 struct trace_iterator *iter = tracepoint_print_iter;
223
224 if (!iter)
225 return;
226
227 event_call = fbuffer->ftrace_file->event_call;
228 if (!event_call || !event_call->event.funcs ||
229 !event_call->event.funcs->trace)
230 return;
231
232 event = &fbuffer->ftrace_file->event_call->event;
233
234 spin_lock_irqsave(&tracepoint_iter_lock, flags);
235 trace_seq_init(&iter->seq);
236 iter->ent = fbuffer->entry;
237 event_call->event.funcs->trace(iter, 0, event);
238 trace_seq_putc(&iter->seq, 0);
239 printk("%s", iter->seq.buffer);
240
241 spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
242}
243
215void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) 244void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
216{ 245{
246 if (tracepoint_printk)
247 output_printk(fbuffer);
248
217 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, 249 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
218 fbuffer->event, fbuffer->entry, 250 fbuffer->event, fbuffer->entry,
219 fbuffer->flags, fbuffer->pc); 251 fbuffer->flags, fbuffer->pc);
@@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
461 493
462 if (dir) { 494 if (dir) {
463 spin_lock(&dir->d_lock); /* probably unneeded */ 495 spin_lock(&dir->d_lock); /* probably unneeded */
464 list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { 496 list_for_each_entry(child, &dir->d_subdirs, d_child) {
465 if (child->d_inode) /* probably unneeded */ 497 if (child->d_inode) /* probably unneeded */
466 child->d_inode->i_private = NULL; 498 child->d_inode->i_private = NULL;
467 } 499 }
@@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v)
918 case FORMAT_HEADER: 950 case FORMAT_HEADER:
919 seq_printf(m, "name: %s\n", ftrace_event_name(call)); 951 seq_printf(m, "name: %s\n", ftrace_event_name(call));
920 seq_printf(m, "ID: %d\n", call->event.type); 952 seq_printf(m, "ID: %d\n", call->event.type);
921 seq_printf(m, "format:\n"); 953 seq_puts(m, "format:\n");
922 return 0; 954 return 0;
923 955
924 case FORMAT_FIELD_SEPERATOR: 956 case FORMAT_FIELD_SEPERATOR:
@@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1044 mutex_unlock(&event_mutex); 1076 mutex_unlock(&event_mutex);
1045 1077
1046 if (file) 1078 if (file)
1047 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1079 r = simple_read_from_buffer(ubuf, cnt, ppos,
1080 s->buffer, trace_seq_used(s));
1048 1081
1049 kfree(s); 1082 kfree(s);
1050 1083
@@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1210 trace_seq_init(s); 1243 trace_seq_init(s);
1211 1244
1212 print_subsystem_event_filter(system, s); 1245 print_subsystem_event_filter(system, s);
1213 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1246 r = simple_read_from_buffer(ubuf, cnt, ppos,
1247 s->buffer, trace_seq_used(s));
1214 1248
1215 kfree(s); 1249 kfree(s);
1216 1250
@@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1265 trace_seq_init(s); 1299 trace_seq_init(s);
1266 1300
1267 func(s); 1301 func(s);
1268 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1302 r = simple_read_from_buffer(ubuf, cnt, ppos,
1303 s->buffer, trace_seq_used(s));
1269 1304
1270 kfree(s); 1305 kfree(s);
1271 1306
@@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
1988 ftrace_event_name(data->file->event_call)); 2023 ftrace_event_name(data->file->event_call));
1989 2024
1990 if (data->count == -1) 2025 if (data->count == -1)
1991 seq_printf(m, ":unlimited\n"); 2026 seq_puts(m, ":unlimited\n");
1992 else 2027 else
1993 seq_printf(m, ":count=%ld\n", data->count); 2028 seq_printf(m, ":count=%ld\n", data->count);
1994 2029
@@ -2394,12 +2429,39 @@ static __init int event_trace_memsetup(void)
2394 return 0; 2429 return 0;
2395} 2430}
2396 2431
2432static __init void
2433early_enable_events(struct trace_array *tr, bool disable_first)
2434{
2435 char *buf = bootup_event_buf;
2436 char *token;
2437 int ret;
2438
2439 while (true) {
2440 token = strsep(&buf, ",");
2441
2442 if (!token)
2443 break;
2444 if (!*token)
2445 continue;
2446
2447 /* Restarting syscalls requires that we stop them first */
2448 if (disable_first)
2449 ftrace_set_clr_event(tr, token, 0);
2450
2451 ret = ftrace_set_clr_event(tr, token, 1);
2452 if (ret)
2453 pr_warn("Failed to enable trace event: %s\n", token);
2454
2455 /* Put back the comma to allow this to be called again */
2456 if (buf)
2457 *(buf - 1) = ',';
2458 }
2459}
2460
2397static __init int event_trace_enable(void) 2461static __init int event_trace_enable(void)
2398{ 2462{
2399 struct trace_array *tr = top_trace_array(); 2463 struct trace_array *tr = top_trace_array();
2400 struct ftrace_event_call **iter, *call; 2464 struct ftrace_event_call **iter, *call;
2401 char *buf = bootup_event_buf;
2402 char *token;
2403 int ret; 2465 int ret;
2404 2466
2405 if (!tr) 2467 if (!tr)
@@ -2421,18 +2483,7 @@ static __init int event_trace_enable(void)
2421 */ 2483 */
2422 __trace_early_add_events(tr); 2484 __trace_early_add_events(tr);
2423 2485
2424 while (true) { 2486 early_enable_events(tr, false);
2425 token = strsep(&buf, ",");
2426
2427 if (!token)
2428 break;
2429 if (!*token)
2430 continue;
2431
2432 ret = ftrace_set_clr_event(tr, token, 1);
2433 if (ret)
2434 pr_warn("Failed to enable trace event: %s\n", token);
2435 }
2436 2487
2437 trace_printk_start_comm(); 2488 trace_printk_start_comm();
2438 2489
@@ -2443,6 +2494,31 @@ static __init int event_trace_enable(void)
2443 return 0; 2494 return 0;
2444} 2495}
2445 2496
2497/*
2498 * event_trace_enable() is called from trace_event_init() first to
2499 * initialize events and perhaps start any events that are on the
2500 * command line. Unfortunately, there are some events that will not
2501 * start this early, like the system call tracepoints that need
2502 * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable()
2503 * is called before pid 1 starts, and this flag is never set, making
2504 * the syscall tracepoint never get reached, but the event is enabled
2505 * regardless (and not doing anything).
2506 */
2507static __init int event_trace_enable_again(void)
2508{
2509 struct trace_array *tr;
2510
2511 tr = top_trace_array();
2512 if (!tr)
2513 return -ENODEV;
2514
2515 early_enable_events(tr, true);
2516
2517 return 0;
2518}
2519
2520early_initcall(event_trace_enable_again);
2521
2446static __init int event_trace_init(void) 2522static __init int event_trace_init(void)
2447{ 2523{
2448 struct trace_array *tr; 2524 struct trace_array *tr;
@@ -2477,8 +2553,14 @@ static __init int event_trace_init(void)
2477#endif 2553#endif
2478 return 0; 2554 return 0;
2479} 2555}
2480early_initcall(event_trace_memsetup); 2556
2481core_initcall(event_trace_enable); 2557void __init trace_event_init(void)
2558{
2559 event_trace_memsetup();
2560 init_ftrace_syscalls();
2561 event_trace_enable();
2562}
2563
2482fs_initcall(event_trace_init); 2564fs_initcall(event_trace_init);
2483 2565
2484#ifdef CONFIG_FTRACE_STARTUP_TEST 2566#ifdef CONFIG_FTRACE_STARTUP_TEST
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7a8c1528e141..ced69da0ff55 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -45,6 +45,7 @@ enum filter_op_ids
45 OP_GT, 45 OP_GT,
46 OP_GE, 46 OP_GE,
47 OP_BAND, 47 OP_BAND,
48 OP_NOT,
48 OP_NONE, 49 OP_NONE,
49 OP_OPEN_PAREN, 50 OP_OPEN_PAREN,
50}; 51};
@@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = {
67 { OP_GT, ">", 5 }, 68 { OP_GT, ">", 5 },
68 { OP_GE, ">=", 5 }, 69 { OP_GE, ">=", 5 },
69 { OP_BAND, "&", 6 }, 70 { OP_BAND, "&", 6 },
71 { OP_NOT, "!", 6 },
70 { OP_NONE, "OP_NONE", 0 }, 72 { OP_NONE, "OP_NONE", 0 },
71 { OP_OPEN_PAREN, "(", 0 }, 73 { OP_OPEN_PAREN, "(", 0 },
72}; 74};
@@ -85,6 +87,7 @@ enum {
85 FILT_ERR_MISSING_FIELD, 87 FILT_ERR_MISSING_FIELD,
86 FILT_ERR_INVALID_FILTER, 88 FILT_ERR_INVALID_FILTER,
87 FILT_ERR_IP_FIELD_ONLY, 89 FILT_ERR_IP_FIELD_ONLY,
90 FILT_ERR_ILLEGAL_NOT_OP,
88}; 91};
89 92
90static char *err_text[] = { 93static char *err_text[] = {
@@ -101,6 +104,7 @@ static char *err_text[] = {
101 "Missing field name and/or value", 104 "Missing field name and/or value",
102 "Meaningless filter expression", 105 "Meaningless filter expression",
103 "Only 'ip' field is supported for function trace", 106 "Only 'ip' field is supported for function trace",
107 "Illegal use of '!'",
104}; 108};
105 109
106struct opstack_op { 110struct opstack_op {
@@ -139,6 +143,7 @@ struct pred_stack {
139 int index; 143 int index;
140}; 144};
141 145
146/* If not of not match is equal to not of not, then it is a match */
142#define DEFINE_COMPARISON_PRED(type) \ 147#define DEFINE_COMPARISON_PRED(type) \
143static int filter_pred_##type(struct filter_pred *pred, void *event) \ 148static int filter_pred_##type(struct filter_pred *pred, void *event) \
144{ \ 149{ \
@@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
166 break; \ 171 break; \
167 } \ 172 } \
168 \ 173 \
169 return match; \ 174 return !!match == !pred->not; \
170} 175}
171 176
172#define DEFINE_EQUALITY_PRED(size) \ 177#define DEFINE_EQUALITY_PRED(size) \
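
The new return statement, !!match == !pred->not (and the matching !!match == !op->not in process_ops() in the next hunk), is what the "not of not" comments are describing: normalize the match result to 0/1, normalize the inverted not flag to 0/1, and compare. A tiny standalone check of that identity in plain C, with made-up values:

    #include <assert.h>

    static int apply_not(int match, int not)
    {
            /* not == 0: the result is simply whether the predicate matched;
             * not == 1: the result is the inverted match */
            return !!match == !not;
    }

    int main(void)
    {
            assert(apply_not(5, 0) == 1);   /* match,    no NOT -> match    */
            assert(apply_not(0, 0) == 0);   /* no match, no NOT -> no match */
            assert(apply_not(5, 1) == 0);   /* match,    NOT    -> no match */
            assert(apply_not(0, 1) == 1);   /* no match, NOT    -> match    */
            return 0;
    }
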
@@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds,
484 if (!WARN_ON_ONCE(!pred->fn)) 489 if (!WARN_ON_ONCE(!pred->fn))
485 match = pred->fn(pred, rec); 490 match = pred->fn(pred, rec);
486 if (!!match == type) 491 if (!!match == type)
487 return match; 492 break;
488 } 493 }
489 return match; 494 /* If not of not match is equal to not of not, then it is a match */
495 return !!match == !op->not;
490} 496}
491 497
492struct filter_match_preds_data { 498struct filter_match_preds_data {
@@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter,
735 * then this op can be folded. 741 * then this op can be folded.
736 */ 742 */
737 if (left->index & FILTER_PRED_FOLD && 743 if (left->index & FILTER_PRED_FOLD &&
738 (left->op == dest->op || 744 ((left->op == dest->op && !left->not) ||
739 left->left == FILTER_PRED_INVALID) && 745 left->left == FILTER_PRED_INVALID) &&
740 right->index & FILTER_PRED_FOLD && 746 right->index & FILTER_PRED_FOLD &&
741 (right->op == dest->op || 747 ((right->op == dest->op && !right->not) ||
742 right->left == FILTER_PRED_INVALID)) 748 right->left == FILTER_PRED_INVALID))
743 dest->index |= FILTER_PRED_FOLD; 749 dest->index |= FILTER_PRED_FOLD;
744 750
@@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps,
1028 } 1034 }
1029 1035
1030 if (pred->op == OP_NE) 1036 if (pred->op == OP_NE)
1031 pred->not = 1; 1037 pred->not ^= 1;
1032 1038
1033 pred->fn = fn; 1039 pred->fn = fn;
1034 return 0; 1040 return 0;
@@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call,
1590 continue; 1596 continue;
1591 } 1597 }
1592 1598
1599 if (elt->op == OP_NOT) {
1600 if (!n_preds || operand1 || operand2) {
1601 parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
1602 err = -EINVAL;
1603 goto fail;
1604 }
1605 if (!dry_run)
1606 filter->preds[n_preds - 1].not ^= 1;
1607 continue;
1608 }
1609
1593 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { 1610 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1594 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1611 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1595 err = -ENOSPC; 1612 err = -ENOSPC;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b476a030..8712df9decb4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m,
373{ 373{
374 long count = (long)data; 374 long count = (long)data;
375 375
376 seq_printf(m, "%s", name); 376 seq_puts(m, name);
377 377
378 if (count == -1) 378 if (count == -1)
379 seq_puts(m, ":unlimited"); 379 seq_puts(m, ":unlimited");
@@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m,
383 if (filter_str) 383 if (filter_str)
384 seq_printf(m, " if %s\n", filter_str); 384 seq_printf(m, " if %s\n", filter_str);
385 else 385 else
386 seq_puts(m, "\n"); 386 seq_putc(m, '\n');
387 387
388 return 0; 388 return 0;
389} 389}
@@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
1105 if (data->filter_str) 1105 if (data->filter_str)
1106 seq_printf(m, " if %s\n", data->filter_str); 1106 seq_printf(m, " if %s\n", data->filter_str);
1107 else 1107 else
1108 seq_puts(m, "\n"); 1108 seq_putc(m, '\n');
1109 1109
1110 return 0; 1110 return 0;
1111} 1111}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 57f0ec962d2c..fcd41a166405 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data =
261}; 261};
262 262
263#ifdef CONFIG_DYNAMIC_FTRACE 263#ifdef CONFIG_DYNAMIC_FTRACE
264static int update_count(void **data) 264static void update_traceon_count(void **data, bool on)
265{ 265{
266 unsigned long *count = (long *)data; 266 long *count = (long *)data;
267 long old_count = *count;
267 268
268 if (!*count) 269 /*
269 return 0; 270 * Tracing gets disabled (or enabled) once per count.
271 * This function can be called at the same time on multiple CPUs.
272 * It is fine if both disable (or enable) tracing, as disabling
273 * (or enabling) the second time doesn't do anything as the
274 * state of the tracer is already disabled (or enabled).
275 * What needs to be synchronized in this case is that the count
276 * only gets decremented once, even if the tracer is disabled
277 * (or enabled) twice, as the second one is really a nop.
278 *
279 * The memory barriers guarantee that we only decrement the
280 * counter once. First the count is read to a local variable
281 * and a read barrier is used to make sure that it is loaded
282 * before checking if the tracer is in the state we want.
283 * If the tracer is not in the state we want, then the count
284 * is guaranteed to be the old count.
285 *
286 * Next the tracer is set to the state we want (disabled or enabled)
287 * then a write memory barrier is used to make sure that
288 * the new state is visible before the counter is updated to
289 * one less than the old count. This guarantees that another CPU
290 * executing this code will see the new state before seeing
291 * the new counter value, and would not do anything if the new
292 * counter is seen.
293 *
294 * Note, there is no synchronization between this and a user
295 * setting the tracing_on file. But we currently don't care
296 * about that.
297 */
298 if (!old_count)
299 return;
270 300
271 if (*count != -1) 301 /* Make sure we see count before checking tracing state */
272 (*count)--; 302 smp_rmb();
273 303
274 return 1; 304 if (on == !!tracing_is_on())
305 return;
306
307 if (on)
308 tracing_on();
309 else
310 tracing_off();
311
312 /* unlimited? */
313 if (old_count == -1)
314 return;
315
316 /* Make sure tracing state is visible before updating count */
317 smp_wmb();
318
319 *count = old_count - 1;
275} 320}
276 321
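
These counted handlers back the traceon/traceoff function triggers written through set_ftrace_filter: the count parsed from the trigger is how many times it may fire before going inert, with -1 meaning unlimited (ftrace_probe_print() further down prints exactly that as ":unlimited" or ":count=N"). Typical usage, for illustration with an arbitrary traced function:

    echo 'schedule:traceoff:5' > /sys/kernel/debug/tracing/set_ftrace_filter
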
277static void 322static void
278ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) 323ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
279{ 324{
280 if (tracing_is_on()) 325 update_traceon_count(data, 1);
281 return;
282
283 if (update_count(data))
284 tracing_on();
285} 326}
286 327
287static void 328static void
288ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) 329ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
289{ 330{
290 if (!tracing_is_on()) 331 update_traceon_count(data, 0);
291 return;
292
293 if (update_count(data))
294 tracing_off();
295} 332}
296 333
297static void 334static void
@@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
330static void 367static void
331ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) 368ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
332{ 369{
333 if (!tracing_is_on()) 370 long *count = (long *)data;
334 return; 371 long old_count;
372 long new_count;
335 373
336 if (update_count(data)) 374 /*
337 trace_dump_stack(STACK_SKIP); 375 * Stack traces should only execute the number of times the
376 * user specified in the counter.
377 */
378 do {
379
380 if (!tracing_is_on())
381 return;
382
383 old_count = *count;
384
385 if (!old_count)
386 return;
387
388 /* unlimited? */
389 if (old_count == -1) {
390 trace_dump_stack(STACK_SKIP);
391 return;
392 }
393
394 new_count = old_count - 1;
395 new_count = cmpxchg(count, old_count, new_count);
396 if (new_count == old_count)
397 trace_dump_stack(STACK_SKIP);
398
399 } while (new_count != old_count);
400}
401
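
The cmpxchg() loop above makes the stacktrace trigger safe when several CPUs hit the traced function at once: each successful exchange claims exactly one of the remaining dumps, and a CPU that loses the race retries against the updated count. The same bounded-decrement pattern in portable C11 atomics, with hypothetical names:

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Returns true if the caller claimed one of the remaining slots.
     * A count of -1 means unlimited; 0 means exhausted. */
    static bool claim_slot(_Atomic long *count)
    {
            long old = atomic_load(count);

            while (old != 0) {
                    if (old == -1)
                            return true;    /* unlimited: nothing to decrement */
                    if (atomic_compare_exchange_weak(count, &old, old - 1))
                            return true;    /* we took a slot: do the work */
                    /* the failed exchange reloaded 'old'; try again */
            }
            return false;                   /* no slots left */
    }
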
402static int update_count(void **data)
403{
404 unsigned long *count = (long *)data;
405
406 if (!*count)
407 return 0;
408
409 if (*count != -1)
410 (*count)--;
411
412 return 1;
338} 413}
339 414
340static void 415static void
@@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m,
361 seq_printf(m, "%ps:%s", (void *)ip, name); 436 seq_printf(m, "%ps:%s", (void *)ip, name);
362 437
363 if (count == -1) 438 if (count == -1)
364 seq_printf(m, ":unlimited\n"); 439 seq_puts(m, ":unlimited\n");
365 else 440 else
366 seq_printf(m, ":count=%ld\n", count); 441 seq_printf(m, ":count=%ld\n", count);
367 442
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f0a0c982cde3..ba476009e5de 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -107,7 +107,7 @@ enum {
107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, 107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
108}; 108};
109 109
110static enum print_line_t 110static void
111print_graph_duration(unsigned long long duration, struct trace_seq *s, 111print_graph_duration(unsigned long long duration, struct trace_seq *s,
112 u32 flags); 112 u32 flags);
113 113
@@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr)
483 483
484static int max_bytes_for_cpu; 484static int max_bytes_for_cpu;
485 485
486static enum print_line_t 486static void print_graph_cpu(struct trace_seq *s, int cpu)
487print_graph_cpu(struct trace_seq *s, int cpu)
488{ 487{
489 int ret;
490
491 /* 488 /*
492 * Start with a space character - to make it stand out 489 * Start with a space character - to make it stand out
493 * to the right a bit when trace output is pasted into 490 * to the right a bit when trace output is pasted into
494 * email: 491 * email:
495 */ 492 */
496 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); 493 trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
497 if (!ret)
498 return TRACE_TYPE_PARTIAL_LINE;
499
500 return TRACE_TYPE_HANDLED;
501} 494}
502 495
503#define TRACE_GRAPH_PROCINFO_LENGTH 14 496#define TRACE_GRAPH_PROCINFO_LENGTH 14
504 497
505static enum print_line_t 498static void print_graph_proc(struct trace_seq *s, pid_t pid)
506print_graph_proc(struct trace_seq *s, pid_t pid)
507{ 499{
508 char comm[TASK_COMM_LEN]; 500 char comm[TASK_COMM_LEN];
509 /* sign + log10(MAX_INT) + '\0' */ 501 /* sign + log10(MAX_INT) + '\0' */
510 char pid_str[11]; 502 char pid_str[11];
511 int spaces = 0; 503 int spaces = 0;
512 int ret;
513 int len; 504 int len;
514 int i; 505 int i;
515 506
@@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
524 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; 515 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
525 516
526 /* First spaces to align center */ 517 /* First spaces to align center */
527 for (i = 0; i < spaces / 2; i++) { 518 for (i = 0; i < spaces / 2; i++)
528 ret = trace_seq_putc(s, ' '); 519 trace_seq_putc(s, ' ');
529 if (!ret)
530 return TRACE_TYPE_PARTIAL_LINE;
531 }
532 520
533 ret = trace_seq_printf(s, "%s-%s", comm, pid_str); 521 trace_seq_printf(s, "%s-%s", comm, pid_str);
534 if (!ret)
535 return TRACE_TYPE_PARTIAL_LINE;
536 522
537 /* Last spaces to align center */ 523 /* Last spaces to align center */
538 for (i = 0; i < spaces - (spaces / 2); i++) { 524 for (i = 0; i < spaces - (spaces / 2); i++)
539 ret = trace_seq_putc(s, ' '); 525 trace_seq_putc(s, ' ');
540 if (!ret)
541 return TRACE_TYPE_PARTIAL_LINE;
542 }
543 return TRACE_TYPE_HANDLED;
544} 526}
545 527
546 528
547static enum print_line_t 529static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
548print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
549{ 530{
550 if (!trace_seq_putc(s, ' ')) 531 trace_seq_putc(s, ' ');
551 return 0; 532 trace_print_lat_fmt(s, entry);
552
553 return trace_print_lat_fmt(s, entry);
554} 533}
555 534
556/* If the pid changed since the last trace, output this event */ 535/* If the pid changed since the last trace, output this event */
557static enum print_line_t 536static void
558verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 537verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
559{ 538{
560 pid_t prev_pid; 539 pid_t prev_pid;
561 pid_t *last_pid; 540 pid_t *last_pid;
562 int ret;
563 541
564 if (!data) 542 if (!data)
565 return TRACE_TYPE_HANDLED; 543 return;
566 544
567 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 545 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
568 546
569 if (*last_pid == pid) 547 if (*last_pid == pid)
570 return TRACE_TYPE_HANDLED; 548 return;
571 549
572 prev_pid = *last_pid; 550 prev_pid = *last_pid;
573 *last_pid = pid; 551 *last_pid = pid;
574 552
575 if (prev_pid == -1) 553 if (prev_pid == -1)
576 return TRACE_TYPE_HANDLED; 554 return;
577/* 555/*
578 * Context-switch trace line: 556 * Context-switch trace line:
579 557
@@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
582 ------------------------------------------ 560 ------------------------------------------
583 561
584 */ 562 */
585 ret = trace_seq_puts(s, 563 trace_seq_puts(s, " ------------------------------------------\n");
586 " ------------------------------------------\n"); 564 print_graph_cpu(s, cpu);
587 if (!ret) 565 print_graph_proc(s, prev_pid);
588 return TRACE_TYPE_PARTIAL_LINE; 566 trace_seq_puts(s, " => ");
589 567 print_graph_proc(s, pid);
590 ret = print_graph_cpu(s, cpu); 568 trace_seq_puts(s, "\n ------------------------------------------\n\n");
591 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE;
593
594 ret = print_graph_proc(s, prev_pid);
595 if (ret == TRACE_TYPE_PARTIAL_LINE)
596 return TRACE_TYPE_PARTIAL_LINE;
597
598 ret = trace_seq_puts(s, " => ");
599 if (!ret)
600 return TRACE_TYPE_PARTIAL_LINE;
601
602 ret = print_graph_proc(s, pid);
603 if (ret == TRACE_TYPE_PARTIAL_LINE)
604 return TRACE_TYPE_PARTIAL_LINE;
605
606 ret = trace_seq_puts(s,
607 "\n ------------------------------------------\n\n");
608 if (!ret)
609 return TRACE_TYPE_PARTIAL_LINE;
610
611 return TRACE_TYPE_HANDLED;
612} 569}
613 570
614static struct ftrace_graph_ret_entry * 571static struct ftrace_graph_ret_entry *
@@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter,
682 return next; 639 return next;
683} 640}
684 641
685static int print_graph_abs_time(u64 t, struct trace_seq *s) 642static void print_graph_abs_time(u64 t, struct trace_seq *s)
686{ 643{
687 unsigned long usecs_rem; 644 unsigned long usecs_rem;
688 645
689 usecs_rem = do_div(t, NSEC_PER_SEC); 646 usecs_rem = do_div(t, NSEC_PER_SEC);
690 usecs_rem /= 1000; 647 usecs_rem /= 1000;
691 648
692 return trace_seq_printf(s, "%5lu.%06lu | ", 649 trace_seq_printf(s, "%5lu.%06lu | ",
693 (unsigned long)t, usecs_rem); 650 (unsigned long)t, usecs_rem);
694} 651}
695 652
696static enum print_line_t 653static void
697print_graph_irq(struct trace_iterator *iter, unsigned long addr, 654print_graph_irq(struct trace_iterator *iter, unsigned long addr,
698 enum trace_type type, int cpu, pid_t pid, u32 flags) 655 enum trace_type type, int cpu, pid_t pid, u32 flags)
699{ 656{
700 int ret;
701 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
658 struct trace_entry *ent = iter->ent;
702 659
703 if (addr < (unsigned long)__irqentry_text_start || 660 if (addr < (unsigned long)__irqentry_text_start ||
704 addr >= (unsigned long)__irqentry_text_end) 661 addr >= (unsigned long)__irqentry_text_end)
705 return TRACE_TYPE_UNHANDLED; 662 return;
706 663
707 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 664 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
708 /* Absolute time */ 665 /* Absolute time */
709 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 666 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
710 ret = print_graph_abs_time(iter->ts, s); 667 print_graph_abs_time(iter->ts, s);
711 if (!ret)
712 return TRACE_TYPE_PARTIAL_LINE;
713 }
714 668
715 /* Cpu */ 669 /* Cpu */
716 if (flags & TRACE_GRAPH_PRINT_CPU) { 670 if (flags & TRACE_GRAPH_PRINT_CPU)
717 ret = print_graph_cpu(s, cpu); 671 print_graph_cpu(s, cpu);
718 if (ret == TRACE_TYPE_PARTIAL_LINE)
719 return TRACE_TYPE_PARTIAL_LINE;
720 }
721 672
722 /* Proc */ 673 /* Proc */
723 if (flags & TRACE_GRAPH_PRINT_PROC) { 674 if (flags & TRACE_GRAPH_PRINT_PROC) {
724 ret = print_graph_proc(s, pid); 675 print_graph_proc(s, pid);
725 if (ret == TRACE_TYPE_PARTIAL_LINE) 676 trace_seq_puts(s, " | ");
726 return TRACE_TYPE_PARTIAL_LINE;
727 ret = trace_seq_puts(s, " | ");
728 if (!ret)
729 return TRACE_TYPE_PARTIAL_LINE;
730 } 677 }
678
679 /* Latency format */
680 if (trace_flags & TRACE_ITER_LATENCY_FMT)
681 print_graph_lat_fmt(s, ent);
731 } 682 }
732 683
733 /* No overhead */ 684 /* No overhead */
734 ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); 685 print_graph_duration(0, s, flags | FLAGS_FILL_START);
735 if (ret != TRACE_TYPE_HANDLED)
736 return ret;
737 686
738 if (type == TRACE_GRAPH_ENT) 687 if (type == TRACE_GRAPH_ENT)
739 ret = trace_seq_puts(s, "==========>"); 688 trace_seq_puts(s, "==========>");
740 else 689 else
741 ret = trace_seq_puts(s, "<=========="); 690 trace_seq_puts(s, "<==========");
742
743 if (!ret)
744 return TRACE_TYPE_PARTIAL_LINE;
745
746 ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
747 if (ret != TRACE_TYPE_HANDLED)
748 return ret;
749
750 ret = trace_seq_putc(s, '\n');
751 691
752 if (!ret) 692 print_graph_duration(0, s, flags | FLAGS_FILL_END);
753 return TRACE_TYPE_PARTIAL_LINE; 693 trace_seq_putc(s, '\n');
754 return TRACE_TYPE_HANDLED;
755} 694}
756 695
757enum print_line_t 696void
758trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) 697trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
759{ 698{
760 unsigned long nsecs_rem = do_div(duration, 1000); 699 unsigned long nsecs_rem = do_div(duration, 1000);
761 /* log10(ULONG_MAX) + '\0' */ 700 /* log10(ULONG_MAX) + '\0' */
762 char msecs_str[21]; 701 char usecs_str[21];
763 char nsecs_str[5]; 702 char nsecs_str[5];
764 int ret, len; 703 int len;
765 int i; 704 int i;
766 705
767 sprintf(msecs_str, "%lu", (unsigned long) duration); 706 sprintf(usecs_str, "%lu", (unsigned long) duration);
768 707
769 /* Print msecs */ 708 /* Print msecs */
770 ret = trace_seq_printf(s, "%s", msecs_str); 709 trace_seq_printf(s, "%s", usecs_str);
771 if (!ret)
772 return TRACE_TYPE_PARTIAL_LINE;
773 710
774 len = strlen(msecs_str); 711 len = strlen(usecs_str);
775 712
776 /* Print nsecs (we don't want to exceed 7 numbers) */ 713 /* Print nsecs (we don't want to exceed 7 numbers) */
777 if (len < 7) { 714 if (len < 7) {
778 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); 715 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
779 716
780 snprintf(nsecs_str, slen, "%03lu", nsecs_rem); 717 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
781 ret = trace_seq_printf(s, ".%s", nsecs_str); 718 trace_seq_printf(s, ".%s", nsecs_str);
782 if (!ret)
783 return TRACE_TYPE_PARTIAL_LINE;
784 len += strlen(nsecs_str); 719 len += strlen(nsecs_str);
785 } 720 }
786 721
787 ret = trace_seq_puts(s, " us "); 722 trace_seq_puts(s, " us ");
788 if (!ret)
789 return TRACE_TYPE_PARTIAL_LINE;
790 723
791 /* Print remaining spaces to fit the row's width */ 724 /* Print remaining spaces to fit the row's width */
792 for (i = len; i < 7; i++) { 725 for (i = len; i < 7; i++)
793 ret = trace_seq_putc(s, ' '); 726 trace_seq_putc(s, ' ');
794 if (!ret)
795 return TRACE_TYPE_PARTIAL_LINE;
796 }
797 return TRACE_TYPE_HANDLED;
798} 727}
799 728
800static enum print_line_t 729static void
801print_graph_duration(unsigned long long duration, struct trace_seq *s, 730print_graph_duration(unsigned long long duration, struct trace_seq *s,
802 u32 flags) 731 u32 flags)
803{ 732{
804 int ret = -1;
805
806 if (!(flags & TRACE_GRAPH_PRINT_DURATION) || 733 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
807 !(trace_flags & TRACE_ITER_CONTEXT_INFO)) 734 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
808 return TRACE_TYPE_HANDLED; 735 return;
809 736
810	/* No real data, just filling the column with spaces */ 737	/* No real data, just filling the column with spaces */
811 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { 738 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
812 case FLAGS_FILL_FULL: 739 case FLAGS_FILL_FULL:
813 ret = trace_seq_puts(s, " | "); 740 trace_seq_puts(s, " | ");
814 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 741 return;
815 case FLAGS_FILL_START: 742 case FLAGS_FILL_START:
816 ret = trace_seq_puts(s, " "); 743 trace_seq_puts(s, " ");
817 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 744 return;
818 case FLAGS_FILL_END: 745 case FLAGS_FILL_END:
819 ret = trace_seq_puts(s, " |"); 746 trace_seq_puts(s, " |");
820 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 747 return;
821 } 748 }
822 749
823	/* Signal an overhead of time execution to the output */ 750	/* Signal an overhead of time execution to the output */
824 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { 751 if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
825 /* Duration exceeded 100 msecs */ 752 trace_seq_printf(s, "%c ", trace_find_mark(duration));
826 if (duration > 100000ULL) 753 else
827 ret = trace_seq_puts(s, "! "); 754 trace_seq_puts(s, " ");
828 /* Duration exceeded 10 msecs */
829 else if (duration > 10000ULL)
830 ret = trace_seq_puts(s, "+ ");
831 }
832
833 /*
834 * The -1 means we either did not exceed the duration tresholds
835 * or we dont want to print out the overhead. Either way we need
836 * to fill out the space.
837 */
838 if (ret == -1)
839 ret = trace_seq_puts(s, " ");
840
841 /* Catching here any failure happenned above */
842 if (!ret)
843 return TRACE_TYPE_PARTIAL_LINE;
844
845 ret = trace_print_graph_duration(duration, s);
846 if (ret != TRACE_TYPE_HANDLED)
847 return ret;
848
849 ret = trace_seq_puts(s, "| ");
850 if (!ret)
851 return TRACE_TYPE_PARTIAL_LINE;
852 755
853 return TRACE_TYPE_HANDLED; 756 trace_print_graph_duration(duration, s);
757 trace_seq_puts(s, "| ");
854} 758}
855 759
856/* Case of a leaf function on its call entry */ 760/* Case of a leaf function on its call entry */
@@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
864 struct ftrace_graph_ret *graph_ret; 768 struct ftrace_graph_ret *graph_ret;
865 struct ftrace_graph_ent *call; 769 struct ftrace_graph_ent *call;
866 unsigned long long duration; 770 unsigned long long duration;
867 int ret;
868 int i; 771 int i;
869 772
870 graph_ret = &ret_entry->ret; 773 graph_ret = &ret_entry->ret;
@@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
890 } 793 }
891 794
892 /* Overhead and duration */ 795 /* Overhead and duration */
893 ret = print_graph_duration(duration, s, flags); 796 print_graph_duration(duration, s, flags);
894 if (ret == TRACE_TYPE_PARTIAL_LINE)
895 return TRACE_TYPE_PARTIAL_LINE;
896 797
897 /* Function */ 798 /* Function */
898 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 799 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
899 ret = trace_seq_putc(s, ' '); 800 trace_seq_putc(s, ' ');
900 if (!ret)
901 return TRACE_TYPE_PARTIAL_LINE;
902 }
903 801
904 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); 802 trace_seq_printf(s, "%ps();\n", (void *)call->func);
905 if (!ret)
906 return TRACE_TYPE_PARTIAL_LINE;
907 803
908 return TRACE_TYPE_HANDLED; 804 return trace_handle_return(s);
909} 805}
910 806
911static enum print_line_t 807static enum print_line_t
@@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
915{ 811{
916 struct ftrace_graph_ent *call = &entry->graph_ent; 812 struct ftrace_graph_ent *call = &entry->graph_ent;
917 struct fgraph_data *data = iter->private; 813 struct fgraph_data *data = iter->private;
918 int ret;
919 int i; 814 int i;
920 815
921 if (data) { 816 if (data) {
@@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
931 } 826 }
932 827
933 /* No time */ 828 /* No time */
934 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 829 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
935 if (ret != TRACE_TYPE_HANDLED)
936 return ret;
937 830
938 /* Function */ 831 /* Function */
939 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 832 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
940 ret = trace_seq_putc(s, ' '); 833 trace_seq_putc(s, ' ');
941 if (!ret) 834
942 return TRACE_TYPE_PARTIAL_LINE; 835 trace_seq_printf(s, "%ps() {\n", (void *)call->func);
943 }
944 836
945 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); 837 if (trace_seq_has_overflowed(s))
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE; 838 return TRACE_TYPE_PARTIAL_LINE;
948 839
949 /* 840 /*
@@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter,
953 return TRACE_TYPE_NO_CONSUME; 844 return TRACE_TYPE_NO_CONSUME;
954} 845}
955 846
956static enum print_line_t 847static void
957print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 848print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
958 int type, unsigned long addr, u32 flags) 849 int type, unsigned long addr, u32 flags)
959{ 850{
960 struct fgraph_data *data = iter->private; 851 struct fgraph_data *data = iter->private;
961 struct trace_entry *ent = iter->ent; 852 struct trace_entry *ent = iter->ent;
962 int cpu = iter->cpu; 853 int cpu = iter->cpu;
963 int ret;
964 854
965 /* Pid */ 855 /* Pid */
966 if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) 856 verif_pid(s, ent->pid, cpu, data);
967 return TRACE_TYPE_PARTIAL_LINE;
968 857
969 if (type) { 858 if (type)
970 /* Interrupt */ 859 /* Interrupt */
971 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); 860 print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
972 if (ret == TRACE_TYPE_PARTIAL_LINE)
973 return TRACE_TYPE_PARTIAL_LINE;
974 }
975 861
976 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) 862 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
977 return 0; 863 return;
978 864
979 /* Absolute time */ 865 /* Absolute time */
980 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 866 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
981 ret = print_graph_abs_time(iter->ts, s); 867 print_graph_abs_time(iter->ts, s);
982 if (!ret)
983 return TRACE_TYPE_PARTIAL_LINE;
984 }
985 868
986 /* Cpu */ 869 /* Cpu */
987 if (flags & TRACE_GRAPH_PRINT_CPU) { 870 if (flags & TRACE_GRAPH_PRINT_CPU)
988 ret = print_graph_cpu(s, cpu); 871 print_graph_cpu(s, cpu);
989 if (ret == TRACE_TYPE_PARTIAL_LINE)
990 return TRACE_TYPE_PARTIAL_LINE;
991 }
992 872
993 /* Proc */ 873 /* Proc */
994 if (flags & TRACE_GRAPH_PRINT_PROC) { 874 if (flags & TRACE_GRAPH_PRINT_PROC) {
995 ret = print_graph_proc(s, ent->pid); 875 print_graph_proc(s, ent->pid);
996 if (ret == TRACE_TYPE_PARTIAL_LINE) 876 trace_seq_puts(s, " | ");
997 return TRACE_TYPE_PARTIAL_LINE;
998
999 ret = trace_seq_puts(s, " | ");
1000 if (!ret)
1001 return TRACE_TYPE_PARTIAL_LINE;
1002 } 877 }
1003 878
1004 /* Latency format */ 879 /* Latency format */
1005 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 880 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1006 ret = print_graph_lat_fmt(s, ent); 881 print_graph_lat_fmt(s, ent);
1007 if (ret == TRACE_TYPE_PARTIAL_LINE)
1008 return TRACE_TYPE_PARTIAL_LINE;
1009 }
1010 882
1011 return 0; 883 return;
1012} 884}
1013 885
1014/* 886/*
@@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
1126 if (check_irq_entry(iter, flags, call->func, call->depth)) 998 if (check_irq_entry(iter, flags, call->func, call->depth))
1127 return TRACE_TYPE_HANDLED; 999 return TRACE_TYPE_HANDLED;
1128 1000
1129 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1001 print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
1130 return TRACE_TYPE_PARTIAL_LINE;
1131 1002
1132 leaf_ret = get_return_for_leaf(iter, field); 1003 leaf_ret = get_return_for_leaf(iter, field);
1133 if (leaf_ret) 1004 if (leaf_ret)
@@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1160 pid_t pid = ent->pid; 1031 pid_t pid = ent->pid;
1161 int cpu = iter->cpu; 1032 int cpu = iter->cpu;
1162 int func_match = 1; 1033 int func_match = 1;
1163 int ret;
1164 int i; 1034 int i;
1165 1035
1166 if (check_irq_return(iter, flags, trace->depth)) 1036 if (check_irq_return(iter, flags, trace->depth))
@@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1186 } 1056 }
1187 } 1057 }
1188 1058
1189 if (print_graph_prologue(iter, s, 0, 0, flags)) 1059 print_graph_prologue(iter, s, 0, 0, flags);
1190 return TRACE_TYPE_PARTIAL_LINE;
1191 1060
1192 /* Overhead and duration */ 1061 /* Overhead and duration */
1193 ret = print_graph_duration(duration, s, flags); 1062 print_graph_duration(duration, s, flags);
1194 if (ret == TRACE_TYPE_PARTIAL_LINE)
1195 return TRACE_TYPE_PARTIAL_LINE;
1196 1063
1197 /* Closing brace */ 1064 /* Closing brace */
1198 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1065 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
1199 ret = trace_seq_putc(s, ' '); 1066 trace_seq_putc(s, ' ');
1200 if (!ret)
1201 return TRACE_TYPE_PARTIAL_LINE;
1202 }
1203 1067
1204 /* 1068 /*
1205 * If the return function does not have a matching entry, 1069 * If the return function does not have a matching entry,
@@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1208 * belongs to, write out the function name. Always do 1072 * belongs to, write out the function name. Always do
1209 * that if the funcgraph-tail option is enabled. 1073 * that if the funcgraph-tail option is enabled.
1210 */ 1074 */
1211 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { 1075 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
1212 ret = trace_seq_puts(s, "}\n"); 1076 trace_seq_puts(s, "}\n");
1213 if (!ret) 1077 else
1214 return TRACE_TYPE_PARTIAL_LINE; 1078 trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1215 } else {
1216 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1217 if (!ret)
1218 return TRACE_TYPE_PARTIAL_LINE;
1219 }
1220 1079
1221 /* Overrun */ 1080 /* Overrun */
1222 if (flags & TRACE_GRAPH_PRINT_OVERRUN) { 1081 if (flags & TRACE_GRAPH_PRINT_OVERRUN)
1223 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 1082 trace_seq_printf(s, " (Overruns: %lu)\n",
1224 trace->overrun); 1083 trace->overrun);
1225 if (!ret)
1226 return TRACE_TYPE_PARTIAL_LINE;
1227 }
1228 1084
1229 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, 1085 print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
1230 cpu, pid, flags); 1086 cpu, pid, flags);
1231 if (ret == TRACE_TYPE_PARTIAL_LINE)
1232 return TRACE_TYPE_PARTIAL_LINE;
1233 1087
1234 return TRACE_TYPE_HANDLED; 1088 return trace_handle_return(s);
1235} 1089}
1236 1090
1237static enum print_line_t 1091static enum print_line_t
@@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1248 if (data) 1102 if (data)
1249 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 1103 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
1250 1104
1251 if (print_graph_prologue(iter, s, 0, 0, flags)) 1105 print_graph_prologue(iter, s, 0, 0, flags);
1252 return TRACE_TYPE_PARTIAL_LINE;
1253 1106
1254 /* No time */ 1107 /* No time */
1255 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 1108 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
1256 if (ret != TRACE_TYPE_HANDLED)
1257 return ret;
1258 1109
1259 /* Indentation */ 1110 /* Indentation */
1260 if (depth > 0) 1111 if (depth > 0)
1261 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { 1112 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
1262 ret = trace_seq_putc(s, ' '); 1113 trace_seq_putc(s, ' ');
1263 if (!ret)
1264 return TRACE_TYPE_PARTIAL_LINE;
1265 }
1266 1114
1267 /* The comment */ 1115 /* The comment */
1268 ret = trace_seq_puts(s, "/* "); 1116 trace_seq_puts(s, "/* ");
1269 if (!ret)
1270 return TRACE_TYPE_PARTIAL_LINE;
1271 1117
1272 switch (iter->ent->type) { 1118 switch (iter->ent->type) {
1273 case TRACE_BPRINT: 1119 case TRACE_BPRINT:
@@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1290 return ret; 1136 return ret;
1291 } 1137 }
1292 1138
1139 if (trace_seq_has_overflowed(s))
1140 goto out;
1141
1293 /* Strip ending newline */ 1142 /* Strip ending newline */
1294 if (s->buffer[s->len - 1] == '\n') { 1143 if (s->buffer[s->seq.len - 1] == '\n') {
1295 s->buffer[s->len - 1] = '\0'; 1144 s->buffer[s->seq.len - 1] = '\0';
1296 s->len--; 1145 s->seq.len--;
1297 } 1146 }
1298 1147
1299 ret = trace_seq_puts(s, " */\n"); 1148 trace_seq_puts(s, " */\n");
1300 if (!ret) 1149 out:
1301 return TRACE_TYPE_PARTIAL_LINE; 1150 return trace_handle_return(s);
1302
1303 return TRACE_TYPE_HANDLED;
1304} 1151}
1305 1152
1306 1153
@@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1407 print_lat_header(s, flags); 1254 print_lat_header(s, flags);
1408 1255
1409 /* 1st line */ 1256 /* 1st line */
1410 seq_printf(s, "#"); 1257 seq_putc(s, '#');
1411 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1258 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1412 seq_printf(s, " TIME "); 1259 seq_puts(s, " TIME ");
1413 if (flags & TRACE_GRAPH_PRINT_CPU) 1260 if (flags & TRACE_GRAPH_PRINT_CPU)
1414 seq_printf(s, " CPU"); 1261 seq_puts(s, " CPU");
1415 if (flags & TRACE_GRAPH_PRINT_PROC) 1262 if (flags & TRACE_GRAPH_PRINT_PROC)
1416 seq_printf(s, " TASK/PID "); 1263 seq_puts(s, " TASK/PID ");
1417 if (lat) 1264 if (lat)
1418 seq_printf(s, "||||"); 1265 seq_puts(s, "||||");
1419 if (flags & TRACE_GRAPH_PRINT_DURATION) 1266 if (flags & TRACE_GRAPH_PRINT_DURATION)
1420 seq_printf(s, " DURATION "); 1267 seq_puts(s, " DURATION ");
1421 seq_printf(s, " FUNCTION CALLS\n"); 1268 seq_puts(s, " FUNCTION CALLS\n");
1422 1269
1423 /* 2nd line */ 1270 /* 2nd line */
1424 seq_printf(s, "#"); 1271 seq_putc(s, '#');
1425 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1272 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1426 seq_printf(s, " | "); 1273 seq_puts(s, " | ");
1427 if (flags & TRACE_GRAPH_PRINT_CPU) 1274 if (flags & TRACE_GRAPH_PRINT_CPU)
1428 seq_printf(s, " | "); 1275 seq_puts(s, " | ");
1429 if (flags & TRACE_GRAPH_PRINT_PROC) 1276 if (flags & TRACE_GRAPH_PRINT_PROC)
1430 seq_printf(s, " | | "); 1277 seq_puts(s, " | | ");
1431 if (lat) 1278 if (lat)
1432 seq_printf(s, "||||"); 1279 seq_puts(s, "||||");
1433 if (flags & TRACE_GRAPH_PRINT_DURATION) 1280 if (flags & TRACE_GRAPH_PRINT_DURATION)
1434 seq_printf(s, " | | "); 1281 seq_puts(s, " | | ");
1435 seq_printf(s, " | | | |\n"); 1282 seq_puts(s, " | | | |\n");
1436} 1283}
1437 1284
1438static void print_graph_headers(struct seq_file *s) 1285static void print_graph_headers(struct seq_file *s)
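
The trace_functions_graph.c changes above all follow one pattern: the print_graph_* helpers become void, write into the trace_seq unconditionally, and truncation is checked once at the end via trace_seq_has_overflowed()/trace_handle_return() instead of after every single call. A self-contained model of that "write first, check overflow once" style, assuming a simplified seqbuf in place of struct trace_seq; none of these names are the kernel API:

#include <stdio.h>
#include <string.h>

struct seqbuf {
	char buf[32];
	size_t len;
	int overflow;
};

static void seq_puts_model(struct seqbuf *s, const char *str)
{
	size_t n = strlen(str);

	if (s->overflow || s->len + n >= sizeof(s->buf)) {
		s->overflow = 1;	/* further writes become no-ops */
		return;
	}
	memcpy(s->buf + s->len, str, n);
	s->len += n;
	s->buf[s->len] = '\0';
}

/* Helpers can now return void: there is no status to propagate. */
static void print_cpu_model(struct seqbuf *s, int cpu)
{
	char tmp[16];

	snprintf(tmp, sizeof(tmp), " %3d) ", cpu);
	seq_puts_model(s, tmp);
}

int main(void)
{
	struct seqbuf s = { .len = 0, .overflow = 0 };

	print_cpu_model(&s, 1);
	seq_puts_model(&s, "do_sys_open() {");
	seq_puts_model(&s, " /* long enough to overflow the 32-byte buffer */");

	/* One check at the end replaces a check after every call. */
	if (s.overflow)
		puts("partial line");
	else
		printf("%s\n", s.buf);
	return 0;
}
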
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index bd90e1b06088..3ccf5c2c1320 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
20{ 20{
21 /* use static because iter can be a bit big for the stack */ 21 /* use static because iter can be a bit big for the stack */
22 static struct trace_iterator iter; 22 static struct trace_iterator iter;
23 static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
23 unsigned int old_userobj; 24 unsigned int old_userobj;
24 int cnt = 0, cpu; 25 int cnt = 0, cpu;
25 26
26 trace_init_global_iter(&iter); 27 trace_init_global_iter(&iter);
28 iter.buffer_iter = buffer_iter;
27 29
28 for_each_tracing_cpu(cpu) { 30 for_each_tracing_cpu(cpu) {
29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 31 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 59 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 60 tracing_iter_reset(&iter, cpu_file);
59 } 61 }
60 if (!trace_empty(&iter)) 62
61 trace_find_next_entry_inc(&iter); 63 while (trace_find_next_entry_inc(&iter)) {
62 while (!trace_empty(&iter)) {
63 if (!cnt) 64 if (!cnt)
64 kdb_printf("---------------------------------\n"); 65 kdb_printf("---------------------------------\n");
65 cnt++; 66 cnt++;
66 67
67 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) 68 if (!skip_lines) {
68 print_trace_line(&iter); 69 print_trace_line(&iter);
69 if (!skip_lines)
70 trace_printk_seq(&iter.seq); 70 trace_printk_seq(&iter.seq);
71 else 71 } else {
72 skip_lines--; 72 skip_lines--;
73 }
74
73 if (KDB_FLAG(CMD_INTERRUPT)) 75 if (KDB_FLAG(CMD_INTERRUPT))
74 goto out; 76 goto out;
75 } 77 }
@@ -86,9 +88,12 @@ out:
86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 88 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 89 }
88 90
89 for_each_tracing_cpu(cpu) 91 for_each_tracing_cpu(cpu) {
90 if (iter.buffer_iter[cpu]) 92 if (iter.buffer_iter[cpu]) {
91 ring_buffer_read_finish(iter.buffer_iter[cpu]); 93 ring_buffer_read_finish(iter.buffer_iter[cpu]);
94 iter.buffer_iter[cpu] = NULL;
95 }
96 }
92} 97}
93 98
94/* 99/*
@@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv)
127 132
128static __init int kdb_ftrace_register(void) 133static __init int kdb_ftrace_register(void)
129{ 134{
130 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", 135 kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
131 "Dump ftrace log", 0, KDB_REPEAT_NONE); 136 "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
132 return 0; 137 return 0;
133} 138}
134 139
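
The trace_kdb.c hunks give the kdb iterator its own buffer_iter array and restructure the dump loop: instead of probing trace_empty() and advancing separately, it simply loops while trace_find_next_entry_inc() returns an entry, burning the skip_lines budget before it starts printing. A small user-space sketch of that loop shape; next_entry() and the entries array are invented stand-ins, not kernel code:

#include <stdio.h>

static const char *entries[] = { "e0", "e1", "e2", "e3", "e4", NULL };

/* stand-in for trace_find_next_entry_inc(): next entry or NULL when empty */
static const char *next_entry(int *idx)
{
	return entries[*idx] ? entries[(*idx)++] : NULL;
}

int main(void)
{
	int idx = 0, skip_lines = 2, cnt = 0;
	const char *ent;

	while ((ent = next_entry(&idx)) != NULL) {
		if (!cnt)
			puts("---------------------------------");
		cnt++;

		if (!skip_lines)
			printf("%s\n", ent);	/* print + flush, as in the hunk */
		else
			skip_lines--;		/* consume the skip budget first */
	}
	return 0;
}
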
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 282f6e4e5539..5edb518be345 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
826 struct trace_kprobe *tk = v; 826 struct trace_kprobe *tk = v;
827 int i; 827 int i;
828 828
829 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); 829 seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
830 seq_printf(m, ":%s/%s", tk->tp.call.class->system, 830 seq_printf(m, ":%s/%s", tk->tp.call.class->system,
831 ftrace_event_name(&tk->tp.call)); 831 ftrace_event_name(&tk->tp.call));
832 832
@@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
840 840
841 for (i = 0; i < tk->tp.nr_args; i++) 841 for (i = 0; i < tk->tp.nr_args; i++)
842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); 842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
843 seq_printf(m, "\n"); 843 seq_putc(m, '\n');
844 844
845 return 0; 845 return 0;
846} 846}
@@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1024 field = (struct kprobe_trace_entry_head *)iter->ent; 1024 field = (struct kprobe_trace_entry_head *)iter->ent;
1025 tp = container_of(event, struct trace_probe, call.event); 1025 tp = container_of(event, struct trace_probe, call.event);
1026 1026
1027 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1027 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1028 goto partial;
1029 1028
1030 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 1029 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1031 goto partial; 1030 goto out;
1032 1031
1033 if (!trace_seq_puts(s, ")")) 1032 trace_seq_putc(s, ')');
1034 goto partial;
1035 1033
1036 data = (u8 *)&field[1]; 1034 data = (u8 *)&field[1];
1037 for (i = 0; i < tp->nr_args; i++) 1035 for (i = 0; i < tp->nr_args; i++)
1038 if (!tp->args[i].type->print(s, tp->args[i].name, 1036 if (!tp->args[i].type->print(s, tp->args[i].name,
1039 data + tp->args[i].offset, field)) 1037 data + tp->args[i].offset, field))
1040 goto partial; 1038 goto out;
1041
1042 if (!trace_seq_puts(s, "\n"))
1043 goto partial;
1044 1039
1045 return TRACE_TYPE_HANDLED; 1040 trace_seq_putc(s, '\n');
1046partial: 1041 out:
1047 return TRACE_TYPE_PARTIAL_LINE; 1042 return trace_handle_return(s);
1048} 1043}
1049 1044
1050static enum print_line_t 1045static enum print_line_t
@@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1060 field = (struct kretprobe_trace_entry_head *)iter->ent; 1055 field = (struct kretprobe_trace_entry_head *)iter->ent;
1061 tp = container_of(event, struct trace_probe, call.event); 1056 tp = container_of(event, struct trace_probe, call.event);
1062 1057
1063 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1058 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1064 goto partial;
1065 1059
1066 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) 1060 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1067 goto partial; 1061 goto out;
1068 1062
1069 if (!trace_seq_puts(s, " <- ")) 1063 trace_seq_puts(s, " <- ");
1070 goto partial;
1071 1064
1072 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) 1065 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1073 goto partial; 1066 goto out;
1074 1067
1075 if (!trace_seq_puts(s, ")")) 1068 trace_seq_putc(s, ')');
1076 goto partial;
1077 1069
1078 data = (u8 *)&field[1]; 1070 data = (u8 *)&field[1];
1079 for (i = 0; i < tp->nr_args; i++) 1071 for (i = 0; i < tp->nr_args; i++)
1080 if (!tp->args[i].type->print(s, tp->args[i].name, 1072 if (!tp->args[i].type->print(s, tp->args[i].name,
1081 data + tp->args[i].offset, field)) 1073 data + tp->args[i].offset, field))
1082 goto partial; 1074 goto out;
1083 1075
1084 if (!trace_seq_puts(s, "\n")) 1076 trace_seq_putc(s, '\n');
1085 goto partial;
1086 1077
1087 return TRACE_TYPE_HANDLED; 1078 out:
1088partial: 1079 return trace_handle_return(s);
1089 return TRACE_TYPE_PARTIAL_LINE;
1090} 1080}
1091 1081
1092 1082
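
In the converted print_kprobe_event()/print_kretprobe_event() above, the trace_seq writes are unconditional; the only early exits left are for helpers such as seq_print_ip_sym() that still report failure, and both paths funnel into a single trace_handle_return(s), which maps the sequence's overflow state to HANDLED or PARTIAL_LINE. A compact model of that control flow, reusing the simplified seqbuf idea from the earlier sketch; resolve_ip() is an invented stand-in for seq_print_ip_sym():

#include <stdio.h>

enum print_line_t { PARTIAL_LINE, HANDLED };

struct seqbuf { char buf[64]; int len; int overflow; };

static void sput(struct seqbuf *s, const char *str)
{
	int n = snprintf(s->buf + s->len, sizeof(s->buf) - s->len, "%s", str);

	if (n < 0 || s->len + n >= (int)sizeof(s->buf))
		s->overflow = 1;	/* truncated: mark and stop growing */
	else
		s->len += n;
}

/* stand-in for seq_print_ip_sym(): may still fail in its own right */
static int resolve_ip(struct seqbuf *s, unsigned long ip)
{
	char tmp[32];

	if (!ip)
		return 0;
	snprintf(tmp, sizeof(tmp), "func+0x%lx", ip & 0xfff);
	sput(s, tmp);
	return 1;
}

static enum print_line_t print_event(struct seqbuf *s, unsigned long ip)
{
	sput(s, "my_probe: (");
	if (!resolve_ip(s, ip))
		goto out;		/* bail early, nothing more to format */
	sput(s, ")");
	sput(s, "\n");
 out:
	/* single exit: the overflow state decides HANDLED vs PARTIAL_LINE */
	return s->overflow ? PARTIAL_LINE : HANDLED;
}

int main(void)
{
	struct seqbuf s = { .len = 0 };

	if (print_event(&s, 0x1234) == HANDLED)
		printf("%s", s.buf);
	return 0;
}
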
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b863474..7a9ba62e9fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr)
59 mmio_reset_data(tr); 59 mmio_reset_data(tr);
60} 60}
61 61
62static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 62static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
63{ 63{
64 int ret = 0;
65 int i; 64 int i;
66 resource_size_t start, end; 65 resource_size_t start, end;
67 const struct pci_driver *drv = pci_dev_driver(dev); 66 const struct pci_driver *drv = pci_dev_driver(dev);
68 67
69 /* XXX: incomplete checks for trace_seq_printf() return value */ 68 trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
70 ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", 69 dev->bus->number, dev->devfn,
71 dev->bus->number, dev->devfn, 70 dev->vendor, dev->device, dev->irq);
72 dev->vendor, dev->device, dev->irq);
73 /* 71 /*
74 * XXX: is pci_resource_to_user() appropriate, since we are 72 * XXX: is pci_resource_to_user() appropriate, since we are
75 * supposed to interpret the __ioremap() phys_addr argument based on 73 * supposed to interpret the __ioremap() phys_addr argument based on
@@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
77 */ 75 */
78 for (i = 0; i < 7; i++) { 76 for (i = 0; i < 7; i++) {
79 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 77 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
80 ret += trace_seq_printf(s, " %llx", 78 trace_seq_printf(s, " %llx",
81 (unsigned long long)(start | 79 (unsigned long long)(start |
82 (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); 80 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
83 } 81 }
84 for (i = 0; i < 7; i++) { 82 for (i = 0; i < 7; i++) {
85 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 83 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
86 ret += trace_seq_printf(s, " %llx", 84 trace_seq_printf(s, " %llx",
87 dev->resource[i].start < dev->resource[i].end ? 85 dev->resource[i].start < dev->resource[i].end ?
88 (unsigned long long)(end - start) + 1 : 0); 86 (unsigned long long)(end - start) + 1 : 0);
89 } 87 }
90 if (drv) 88 if (drv)
91 ret += trace_seq_printf(s, " %s\n", drv->name); 89 trace_seq_printf(s, " %s\n", drv->name);
92 else 90 else
93 ret += trace_seq_puts(s, " \n"); 91 trace_seq_puts(s, " \n");
94 return ret;
95} 92}
96 93
97static void destroy_header_iter(struct header_iter *hiter) 94static void destroy_header_iter(struct header_iter *hiter)
@@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
179 unsigned long long t = ns2usecs(iter->ts); 176 unsigned long long t = ns2usecs(iter->ts);
180 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 177 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
181 unsigned secs = (unsigned long)t; 178 unsigned secs = (unsigned long)t;
182 int ret = 1;
183 179
184 trace_assign_type(field, entry); 180 trace_assign_type(field, entry);
185 rw = &field->rw; 181 rw = &field->rw;
186 182
187 switch (rw->opcode) { 183 switch (rw->opcode) {
188 case MMIO_READ: 184 case MMIO_READ:
189 ret = trace_seq_printf(s, 185 trace_seq_printf(s,
190 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 186 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
191 rw->width, secs, usec_rem, rw->map_id, 187 rw->width, secs, usec_rem, rw->map_id,
192 (unsigned long long)rw->phys, 188 (unsigned long long)rw->phys,
193 rw->value, rw->pc, 0); 189 rw->value, rw->pc, 0);
194 break; 190 break;
195 case MMIO_WRITE: 191 case MMIO_WRITE:
196 ret = trace_seq_printf(s, 192 trace_seq_printf(s,
197 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 193 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
198 rw->width, secs, usec_rem, rw->map_id, 194 rw->width, secs, usec_rem, rw->map_id,
199 (unsigned long long)rw->phys, 195 (unsigned long long)rw->phys,
200 rw->value, rw->pc, 0); 196 rw->value, rw->pc, 0);
201 break; 197 break;
202 case MMIO_UNKNOWN_OP: 198 case MMIO_UNKNOWN_OP:
203 ret = trace_seq_printf(s, 199 trace_seq_printf(s,
204 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," 200 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
205 "%02lx 0x%lx %d\n", 201 "%02lx 0x%lx %d\n",
206 secs, usec_rem, rw->map_id, 202 secs, usec_rem, rw->map_id,
@@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
209 (rw->value >> 0) & 0xff, rw->pc, 0); 205 (rw->value >> 0) & 0xff, rw->pc, 0);
210 break; 206 break;
211 default: 207 default:
212 ret = trace_seq_puts(s, "rw what?\n"); 208 trace_seq_puts(s, "rw what?\n");
213 break; 209 break;
214 } 210 }
215 if (ret) 211
216 return TRACE_TYPE_HANDLED; 212 return trace_handle_return(s);
217 return TRACE_TYPE_PARTIAL_LINE;
218} 213}
219 214
220static enum print_line_t mmio_print_map(struct trace_iterator *iter) 215static enum print_line_t mmio_print_map(struct trace_iterator *iter)
@@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
226 unsigned long long t = ns2usecs(iter->ts); 221 unsigned long long t = ns2usecs(iter->ts);
227 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 222 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
228 unsigned secs = (unsigned long)t; 223 unsigned secs = (unsigned long)t;
229 int ret;
230 224
231 trace_assign_type(field, entry); 225 trace_assign_type(field, entry);
232 m = &field->map; 226 m = &field->map;
233 227
234 switch (m->opcode) { 228 switch (m->opcode) {
235 case MMIO_PROBE: 229 case MMIO_PROBE:
236 ret = trace_seq_printf(s, 230 trace_seq_printf(s,
237 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 231 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
238 secs, usec_rem, m->map_id, 232 secs, usec_rem, m->map_id,
239 (unsigned long long)m->phys, m->virt, m->len, 233 (unsigned long long)m->phys, m->virt, m->len,
240 0UL, 0); 234 0UL, 0);
241 break; 235 break;
242 case MMIO_UNPROBE: 236 case MMIO_UNPROBE:
243 ret = trace_seq_printf(s, 237 trace_seq_printf(s,
244 "UNMAP %u.%06lu %d 0x%lx %d\n", 238 "UNMAP %u.%06lu %d 0x%lx %d\n",
245 secs, usec_rem, m->map_id, 0UL, 0); 239 secs, usec_rem, m->map_id, 0UL, 0);
246 break; 240 break;
247 default: 241 default:
248 ret = trace_seq_puts(s, "map what?\n"); 242 trace_seq_puts(s, "map what?\n");
249 break; 243 break;
250 } 244 }
251 if (ret) 245
252 return TRACE_TYPE_HANDLED; 246 return trace_handle_return(s);
253 return TRACE_TYPE_PARTIAL_LINE;
254} 247}
255 248
256static enum print_line_t mmio_print_mark(struct trace_iterator *iter) 249static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
@@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
262 unsigned long long t = ns2usecs(iter->ts); 255 unsigned long long t = ns2usecs(iter->ts);
263 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 256 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
264 unsigned secs = (unsigned long)t; 257 unsigned secs = (unsigned long)t;
265 int ret;
266 258
267 /* The trailing newline must be in the message. */ 259 /* The trailing newline must be in the message. */
268 ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); 260 trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
269 if (!ret)
270 return TRACE_TYPE_PARTIAL_LINE;
271 261
272 return TRACE_TYPE_HANDLED; 262 return trace_handle_return(s);
273} 263}
274 264
275static enum print_line_t mmio_print_line(struct trace_iterator *iter) 265static enum print_line_t mmio_print_line(struct trace_iterator *iter)
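
Each mmio_print_* routine above also splits the iterator timestamp with ns2usecs() and do_div() before formatting it as seconds.microseconds. The plain-C equivalent of that arithmetic, with an arbitrary example timestamp:

#include <stdio.h>

int main(void)
{
	unsigned long long ts_ns = 1234567891234ULL;	/* example timestamp, ns */
	unsigned long long t = ts_ns / 1000;		/* ns2usecs() */
	unsigned long usec_rem = (unsigned long)(t % 1000000);	/* do_div(t, USEC_PER_SEC) remainder */
	unsigned long secs = (unsigned long)(t / 1000000);	/* quotient left in t by do_div() */

	/* same seconds.microseconds layout used by mmio_print_rw()/map()/mark() */
	printf("%lu.%06lu\n", secs, usec_rem);		/* -> 1234.567891 */
	return 0;
}
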
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c6977d5a9b12..b77b9a697619 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
25 struct trace_seq *s = &iter->seq; 25 struct trace_seq *s = &iter->seq;
26 struct trace_entry *entry = iter->ent; 26 struct trace_entry *entry = iter->ent;
27 struct bputs_entry *field; 27 struct bputs_entry *field;
28 int ret;
29 28
30 trace_assign_type(field, entry); 29 trace_assign_type(field, entry);
31 30
32 ret = trace_seq_puts(s, field->str); 31 trace_seq_puts(s, field->str);
33 if (!ret)
34 return TRACE_TYPE_PARTIAL_LINE;
35 32
36 return TRACE_TYPE_HANDLED; 33 return trace_handle_return(s);
37} 34}
38 35
39enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 36enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41 struct trace_seq *s = &iter->seq; 38 struct trace_seq *s = &iter->seq;
42 struct trace_entry *entry = iter->ent; 39 struct trace_entry *entry = iter->ent;
43 struct bprint_entry *field; 40 struct bprint_entry *field;
44 int ret;
45 41
46 trace_assign_type(field, entry); 42 trace_assign_type(field, entry);
47 43
48 ret = trace_seq_bprintf(s, field->fmt, field->buf); 44 trace_seq_bprintf(s, field->fmt, field->buf);
49 if (!ret)
50 return TRACE_TYPE_PARTIAL_LINE;
51 45
52 return TRACE_TYPE_HANDLED; 46 return trace_handle_return(s);
53} 47}
54 48
55enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) 49enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
@@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
57 struct trace_seq *s = &iter->seq; 51 struct trace_seq *s = &iter->seq;
58 struct trace_entry *entry = iter->ent; 52 struct trace_entry *entry = iter->ent;
59 struct print_entry *field; 53 struct print_entry *field;
60 int ret;
61 54
62 trace_assign_type(field, entry); 55 trace_assign_type(field, entry);
63 56
64 ret = trace_seq_puts(s, field->buf); 57 trace_seq_puts(s, field->buf);
65 if (!ret)
66 return TRACE_TYPE_PARTIAL_LINE;
67 58
68 return TRACE_TYPE_HANDLED; 59 return trace_handle_return(s);
69} 60}
70 61
71const char * 62const char *
@@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
124 115
125 if (ret == (const char *)(trace_seq_buffer_ptr(p))) 116 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
126 trace_seq_printf(p, "0x%lx", val); 117 trace_seq_printf(p, "0x%lx", val);
127 118
128 trace_seq_putc(p, 0); 119 trace_seq_putc(p, 0);
129 120
130 return ret; 121 return ret;
@@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
193 struct trace_seq *s = &iter->seq; 184 struct trace_seq *s = &iter->seq;
194 struct trace_seq *p = &iter->tmp_seq; 185 struct trace_seq *p = &iter->tmp_seq;
195 struct trace_entry *entry; 186 struct trace_entry *entry;
196 int ret;
197 187
198 event = container_of(trace_event, struct ftrace_event_call, event); 188 event = container_of(trace_event, struct ftrace_event_call, event);
199 entry = iter->ent; 189 entry = iter->ent;
@@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
204 } 194 }
205 195
206 trace_seq_init(p); 196 trace_seq_init(p);
207 ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); 197 trace_seq_printf(s, "%s: ", ftrace_event_name(event));
208 if (!ret)
209 return TRACE_TYPE_PARTIAL_LINE;
210 198
211 return 0; 199 return trace_handle_return(s);
212} 200}
213EXPORT_SYMBOL(ftrace_raw_output_prep); 201EXPORT_SYMBOL(ftrace_raw_output_prep);
214 202
@@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
216 char *fmt, va_list ap) 204 char *fmt, va_list ap)
217{ 205{
218 struct trace_seq *s = &iter->seq; 206 struct trace_seq *s = &iter->seq;
219 int ret;
220
221 ret = trace_seq_printf(s, "%s: ", name);
222 if (!ret)
223 return TRACE_TYPE_PARTIAL_LINE;
224
225 ret = trace_seq_vprintf(s, fmt, ap);
226 207
227 if (!ret) 208 trace_seq_printf(s, "%s: ", name);
228 return TRACE_TYPE_PARTIAL_LINE; 209 trace_seq_vprintf(s, fmt, ap);
229 210
230 return TRACE_TYPE_HANDLED; 211 return trace_handle_return(s);
231} 212}
232 213
233int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 214int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
@@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name)
260} 241}
261#endif /* CONFIG_KRETPROBES */ 242#endif /* CONFIG_KRETPROBES */
262 243
263static int 244static void
264seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) 245seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
265{ 246{
266#ifdef CONFIG_KALLSYMS 247#ifdef CONFIG_KALLSYMS
@@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
271 252
272 name = kretprobed(str); 253 name = kretprobed(str);
273 254
274 return trace_seq_printf(s, fmt, name); 255 trace_seq_printf(s, fmt, name);
275#endif 256#endif
276 return 1;
277} 257}
278 258
279static int 259static void
280seq_print_sym_offset(struct trace_seq *s, const char *fmt, 260seq_print_sym_offset(struct trace_seq *s, const char *fmt,
281 unsigned long address) 261 unsigned long address)
282{ 262{
@@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
287 sprint_symbol(str, address); 267 sprint_symbol(str, address);
288 name = kretprobed(str); 268 name = kretprobed(str);
289 269
290 return trace_seq_printf(s, fmt, name); 270 trace_seq_printf(s, fmt, name);
291#endif 271#endif
292 return 1;
293} 272}
294 273
295#ifndef CONFIG_64BIT 274#ifndef CONFIG_64BIT
@@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
320 if (file) { 299 if (file) {
321 ret = trace_seq_path(s, &file->f_path); 300 ret = trace_seq_path(s, &file->f_path);
322 if (ret) 301 if (ret)
323 ret = trace_seq_printf(s, "[+0x%lx]", 302 trace_seq_printf(s, "[+0x%lx]",
324 ip - vmstart); 303 ip - vmstart);
325 } 304 }
326 up_read(&mm->mmap_sem); 305 up_read(&mm->mmap_sem);
327 } 306 }
328 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) 307 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
329 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 308 trace_seq_printf(s, " <" IP_FMT ">", ip);
330 return ret; 309 return !trace_seq_has_overflowed(s);
331} 310}
332 311
333int 312int
@@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
335 unsigned long sym_flags) 314 unsigned long sym_flags)
336{ 315{
337 struct mm_struct *mm = NULL; 316 struct mm_struct *mm = NULL;
338 int ret = 1;
339 unsigned int i; 317 unsigned int i;
340 318
341 if (trace_flags & TRACE_ITER_SYM_USEROBJ) { 319 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
@@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
354 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 332 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
355 unsigned long ip = entry->caller[i]; 333 unsigned long ip = entry->caller[i];
356 334
357 if (ip == ULONG_MAX || !ret) 335 if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
358 break; 336 break;
359 if (ret) 337
360 ret = trace_seq_puts(s, " => "); 338 trace_seq_puts(s, " => ");
339
361 if (!ip) { 340 if (!ip) {
362 if (ret) 341 trace_seq_puts(s, "??");
363 ret = trace_seq_puts(s, "??"); 342 trace_seq_putc(s, '\n');
364 if (ret)
365 ret = trace_seq_putc(s, '\n');
366 continue; 343 continue;
367 } 344 }
368 if (!ret) 345
369 break; 346 seq_print_user_ip(s, mm, ip, sym_flags);
370 if (ret) 347 trace_seq_putc(s, '\n');
371 ret = seq_print_user_ip(s, mm, ip, sym_flags);
372 ret = trace_seq_putc(s, '\n');
373 } 348 }
374 349
375 if (mm) 350 if (mm)
376 mmput(mm); 351 mmput(mm);
377 return ret; 352
353 return !trace_seq_has_overflowed(s);
378} 354}
379 355
380int 356int
381seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 357seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
382{ 358{
383 int ret; 359 if (!ip) {
384 360 trace_seq_putc(s, '0');
385 if (!ip) 361 goto out;
386 return trace_seq_putc(s, '0'); 362 }
387 363
388 if (sym_flags & TRACE_ITER_SYM_OFFSET) 364 if (sym_flags & TRACE_ITER_SYM_OFFSET)
389 ret = seq_print_sym_offset(s, "%s", ip); 365 seq_print_sym_offset(s, "%s", ip);
390 else 366 else
391 ret = seq_print_sym_short(s, "%s", ip); 367 seq_print_sym_short(s, "%s", ip);
392
393 if (!ret)
394 return 0;
395 368
396 if (sym_flags & TRACE_ITER_SYM_ADDR) 369 if (sym_flags & TRACE_ITER_SYM_ADDR)
397 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 370 trace_seq_printf(s, " <" IP_FMT ">", ip);
398 return ret; 371
372 out:
373 return !trace_seq_has_overflowed(s);
399} 374}
400 375
401/** 376/**
@@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
413 char irqs_off; 388 char irqs_off;
414 int hardirq; 389 int hardirq;
415 int softirq; 390 int softirq;
416 int ret;
417 391
418 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 392 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
419 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 393 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
445 softirq ? 's' : 419 softirq ? 's' :
446 '.'; 420 '.';
447 421
448 if (!trace_seq_printf(s, "%c%c%c", 422 trace_seq_printf(s, "%c%c%c",
449 irqs_off, need_resched, hardsoft_irq)) 423 irqs_off, need_resched, hardsoft_irq);
450 return 0;
451 424
452 if (entry->preempt_count) 425 if (entry->preempt_count)
453 ret = trace_seq_printf(s, "%x", entry->preempt_count); 426 trace_seq_printf(s, "%x", entry->preempt_count);
454 else 427 else
455 ret = trace_seq_putc(s, '.'); 428 trace_seq_putc(s, '.');
456 429
457 return ret; 430 return !trace_seq_has_overflowed(s);
458} 431}
459 432
460static int 433static int
@@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
464 437
465 trace_find_cmdline(entry->pid, comm); 438 trace_find_cmdline(entry->pid, comm);
466 439
467 if (!trace_seq_printf(s, "%8.8s-%-5d %3d", 440 trace_seq_printf(s, "%8.8s-%-5d %3d",
468 comm, entry->pid, cpu)) 441 comm, entry->pid, cpu);
469 return 0;
470 442
471 return trace_print_lat_fmt(s, entry); 443 return trace_print_lat_fmt(s, entry);
472} 444}
473 445
474static unsigned long preempt_mark_thresh_us = 100; 446#undef MARK
447#define MARK(v, s) {.val = v, .sym = s}
448/* trace overhead mark */
449static const struct trace_mark {
450 unsigned long long val; /* unit: nsec */
451 char sym;
452} mark[] = {
453 MARK(1000000000ULL , '$'), /* 1 sec */
454 MARK(1000000ULL , '#'), /* 1000 usecs */
455 MARK(100000ULL , '!'), /* 100 usecs */
456 MARK(10000ULL , '+'), /* 10 usecs */
457};
458#undef MARK
459
460char trace_find_mark(unsigned long long d)
461{
462 int i;
463 int size = ARRAY_SIZE(mark);
464
465 for (i = 0; i < size; i++) {
466 if (d >= mark[i].val)
467 break;
468 }
469
470 return (i == size) ? ' ' : mark[i].sym;
471}
475 472
476static int 473static int
477lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) 474lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
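
trace_find_mark() above replaces the old two-level '!'/'+' overhead annotation with a four-entry table: a duration gets the symbol of the largest threshold it reaches, or a space if it stays under 10 usecs. The same table can be exercised from user space to see the mapping; the thresholds are copied from the hunk, main() is only an illustration:

#include <stdio.h>

struct trace_mark { unsigned long long val; char sym; };

static const struct trace_mark mark[] = {
	{ 1000000000ULL, '$' },	/* 1 sec  */
	{ 1000000ULL,    '#' },	/* 1 ms   */
	{ 100000ULL,     '!' },	/* 100 us */
	{ 10000ULL,      '+' },	/* 10 us  */
};

static char find_mark(unsigned long long d)
{
	size_t i, size = sizeof(mark) / sizeof(mark[0]);

	for (i = 0; i < size; i++)
		if (d >= mark[i].val)
			break;

	return (i == size) ? ' ' : mark[i].sym;
}

int main(void)
{
	unsigned long long ns[] = { 5000, 50000, 500000, 5000000, 2000000000ULL };
	size_t i;

	for (i = 0; i < sizeof(ns) / sizeof(ns[0]); i++)
		printf("%10lluns -> '%c'\n", ns[i], find_mark(ns[i]));
	/* 5us->' ', 50us->'+', 500us->'!', 5ms->'#', 2s->'$' */
	return 0;
}
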
@@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
493 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); 490 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
494 unsigned long rel_msec = (unsigned long)rel_ts; 491 unsigned long rel_msec = (unsigned long)rel_ts;
495 492
496 return trace_seq_printf( 493 trace_seq_printf(
497 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", 494 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
498 ns2usecs(iter->ts), 495 ns2usecs(iter->ts),
499 abs_msec, abs_usec, 496 abs_msec, abs_usec,
500 rel_msec, rel_usec); 497 rel_msec, rel_usec);
498
501 } else if (verbose && !in_ns) { 499 } else if (verbose && !in_ns) {
502 return trace_seq_printf( 500 trace_seq_printf(
503 s, "[%016llx] %lld (+%lld): ", 501 s, "[%016llx] %lld (+%lld): ",
504 iter->ts, abs_ts, rel_ts); 502 iter->ts, abs_ts, rel_ts);
503
505 } else if (!verbose && in_ns) { 504 } else if (!verbose && in_ns) {
506 return trace_seq_printf( 505 trace_seq_printf(
507 s, " %4lldus%c: ", 506 s, " %4lldus%c: ",
508 abs_ts, 507 abs_ts,
509 rel_ts > preempt_mark_thresh_us ? '!' : 508 trace_find_mark(rel_ts * NSEC_PER_USEC));
510 rel_ts > 1 ? '+' : ' '); 509
511 } else { /* !verbose && !in_ns */ 510 } else { /* !verbose && !in_ns */
512 return trace_seq_printf(s, " %4lld: ", abs_ts); 511 trace_seq_printf(s, " %4lld: ", abs_ts);
513 } 512 }
513
514 return !trace_seq_has_overflowed(s);
514} 515}
515 516
516int trace_print_context(struct trace_iterator *iter) 517int trace_print_context(struct trace_iterator *iter)
@@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter)
520 unsigned long long t; 521 unsigned long long t;
521 unsigned long secs, usec_rem; 522 unsigned long secs, usec_rem;
522 char comm[TASK_COMM_LEN]; 523 char comm[TASK_COMM_LEN];
523 int ret;
524 524
525 trace_find_cmdline(entry->pid, comm); 525 trace_find_cmdline(entry->pid, comm);
526 526
527 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", 527 trace_seq_printf(s, "%16s-%-5d [%03d] ",
528 comm, entry->pid, iter->cpu); 528 comm, entry->pid, iter->cpu);
529 if (!ret)
530 return 0;
531 529
532 if (trace_flags & TRACE_ITER_IRQ_INFO) { 530 if (trace_flags & TRACE_ITER_IRQ_INFO)
533 ret = trace_print_lat_fmt(s, entry); 531 trace_print_lat_fmt(s, entry);
534 if (!ret)
535 return 0;
536 }
537 532
538 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { 533 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
539 t = ns2usecs(iter->ts); 534 t = ns2usecs(iter->ts);
540 usec_rem = do_div(t, USEC_PER_SEC); 535 usec_rem = do_div(t, USEC_PER_SEC);
541 secs = (unsigned long)t; 536 secs = (unsigned long)t;
542 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); 537 trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
543 } else 538 } else
544 return trace_seq_printf(s, " %12llu: ", iter->ts); 539 trace_seq_printf(s, " %12llu: ", iter->ts);
540
541 return !trace_seq_has_overflowed(s);
545} 542}
546 543
547int trace_print_lat_context(struct trace_iterator *iter) 544int trace_print_lat_context(struct trace_iterator *iter)
548{ 545{
549 u64 next_ts; 546 u64 next_ts;
550 int ret;
551 /* trace_find_next_entry will reset ent_size */ 547 /* trace_find_next_entry will reset ent_size */
552 int ent_size = iter->ent_size; 548 int ent_size = iter->ent_size;
553 struct trace_seq *s = &iter->seq; 549 struct trace_seq *s = &iter->seq;
@@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter)
567 563
568 trace_find_cmdline(entry->pid, comm); 564 trace_find_cmdline(entry->pid, comm);
569 565
570 ret = trace_seq_printf( 566 trace_seq_printf(
571 s, "%16s %5d %3d %d %08x %08lx ", 567 s, "%16s %5d %3d %d %08x %08lx ",
572 comm, entry->pid, iter->cpu, entry->flags, 568 comm, entry->pid, iter->cpu, entry->flags,
573 entry->preempt_count, iter->idx); 569 entry->preempt_count, iter->idx);
574 } else { 570 } else {
575 ret = lat_print_generic(s, entry, iter->cpu); 571 lat_print_generic(s, entry, iter->cpu);
576 } 572 }
577 573
578 if (ret) 574 lat_print_timestamp(iter, next_ts);
579 ret = lat_print_timestamp(iter, next_ts);
580 575
581 return ret; 576 return !trace_seq_has_overflowed(s);
582} 577}
583 578
584static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 579static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event)
692 goto out; 687 goto out;
693 688
694 } else { 689 } else {
695 690
696 event->type = next_event_type++; 691 event->type = next_event_type++;
697 list = &ftrace_event_list; 692 list = &ftrace_event_list;
698 } 693 }
@@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
764enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 759enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
765 struct trace_event *event) 760 struct trace_event *event)
766{ 761{
767 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) 762 trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
768 return TRACE_TYPE_PARTIAL_LINE;
769 763
770 return TRACE_TYPE_HANDLED; 764 return trace_handle_return(&iter->seq);
771} 765}
772 766
773/* TRACE_FN */ 767/* TRACE_FN */
@@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
779 773
780 trace_assign_type(field, iter->ent); 774 trace_assign_type(field, iter->ent);
781 775
782 if (!seq_print_ip_sym(s, field->ip, flags)) 776 seq_print_ip_sym(s, field->ip, flags);
783 goto partial;
784 777
785 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { 778 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
786 if (!trace_seq_puts(s, " <-")) 779 trace_seq_puts(s, " <-");
787 goto partial; 780 seq_print_ip_sym(s, field->parent_ip, flags);
788 if (!seq_print_ip_sym(s,
789 field->parent_ip,
790 flags))
791 goto partial;
792 } 781 }
793 if (!trace_seq_putc(s, '\n'))
794 goto partial;
795 782
796 return TRACE_TYPE_HANDLED; 783 trace_seq_putc(s, '\n');
797 784
798 partial: 785 return trace_handle_return(s);
799 return TRACE_TYPE_PARTIAL_LINE;
800} 786}
801 787
802static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, 788static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
@@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
806 792
807 trace_assign_type(field, iter->ent); 793 trace_assign_type(field, iter->ent);
808 794
809 if (!trace_seq_printf(&iter->seq, "%lx %lx\n", 795 trace_seq_printf(&iter->seq, "%lx %lx\n",
810 field->ip, 796 field->ip,
811 field->parent_ip)) 797 field->parent_ip);
812 return TRACE_TYPE_PARTIAL_LINE;
813 798
814 return TRACE_TYPE_HANDLED; 799 return trace_handle_return(&iter->seq);
815} 800}
816 801
817static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, 802static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
@@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
822 807
823 trace_assign_type(field, iter->ent); 808 trace_assign_type(field, iter->ent);
824 809
825 SEQ_PUT_HEX_FIELD_RET(s, field->ip); 810 SEQ_PUT_HEX_FIELD(s, field->ip);
826 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); 811 SEQ_PUT_HEX_FIELD(s, field->parent_ip);
827 812
828 return TRACE_TYPE_HANDLED; 813 return trace_handle_return(s);
829} 814}
830 815
831static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, 816static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
@@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
836 821
837 trace_assign_type(field, iter->ent); 822 trace_assign_type(field, iter->ent);
838 823
839 SEQ_PUT_FIELD_RET(s, field->ip); 824 SEQ_PUT_FIELD(s, field->ip);
840 SEQ_PUT_FIELD_RET(s, field->parent_ip); 825 SEQ_PUT_FIELD(s, field->parent_ip);
841 826
842 return TRACE_TYPE_HANDLED; 827 return trace_handle_return(s);
843} 828}
844 829
845static struct trace_event_functions trace_fn_funcs = { 830static struct trace_event_functions trace_fn_funcs = {
@@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
868 T = task_state_char(field->next_state); 853 T = task_state_char(field->next_state);
869 S = task_state_char(field->prev_state); 854 S = task_state_char(field->prev_state);
870 trace_find_cmdline(field->next_pid, comm); 855 trace_find_cmdline(field->next_pid, comm);
871 if (!trace_seq_printf(&iter->seq, 856 trace_seq_printf(&iter->seq,
872 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 857 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
873 field->prev_pid, 858 field->prev_pid,
874 field->prev_prio, 859 field->prev_prio,
875 S, delim, 860 S, delim,
876 field->next_cpu, 861 field->next_cpu,
877 field->next_pid, 862 field->next_pid,
878 field->next_prio, 863 field->next_prio,
879 T, comm)) 864 T, comm);
880 return TRACE_TYPE_PARTIAL_LINE; 865
881 866 return trace_handle_return(&iter->seq);
882 return TRACE_TYPE_HANDLED;
883} 867}
884 868
885static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, 869static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
@@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
904 if (!S) 888 if (!S)
905 S = task_state_char(field->prev_state); 889 S = task_state_char(field->prev_state);
906 T = task_state_char(field->next_state); 890 T = task_state_char(field->next_state);
907 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 891 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
908 field->prev_pid, 892 field->prev_pid,
909 field->prev_prio, 893 field->prev_prio,
910 S, 894 S,
911 field->next_cpu, 895 field->next_cpu,
912 field->next_pid, 896 field->next_pid,
913 field->next_prio, 897 field->next_prio,
914 T)) 898 T);
915 return TRACE_TYPE_PARTIAL_LINE; 899
916 900 return trace_handle_return(&iter->seq);
917 return TRACE_TYPE_HANDLED;
918} 901}
919 902
920static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, 903static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
@@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
942 S = task_state_char(field->prev_state); 925 S = task_state_char(field->prev_state);
943 T = task_state_char(field->next_state); 926 T = task_state_char(field->next_state);
944 927
945 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 928 SEQ_PUT_HEX_FIELD(s, field->prev_pid);
946 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); 929 SEQ_PUT_HEX_FIELD(s, field->prev_prio);
947 SEQ_PUT_HEX_FIELD_RET(s, S); 930 SEQ_PUT_HEX_FIELD(s, S);
948 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); 931 SEQ_PUT_HEX_FIELD(s, field->next_cpu);
949 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); 932 SEQ_PUT_HEX_FIELD(s, field->next_pid);
950 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); 933 SEQ_PUT_HEX_FIELD(s, field->next_prio);
951 SEQ_PUT_HEX_FIELD_RET(s, T); 934 SEQ_PUT_HEX_FIELD(s, T);
952 935
953 return TRACE_TYPE_HANDLED; 936 return trace_handle_return(s);
954} 937}
955 938
956static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, 939static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
@@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
973 956
974 trace_assign_type(field, iter->ent); 957 trace_assign_type(field, iter->ent);
975 958
976 SEQ_PUT_FIELD_RET(s, field->prev_pid); 959 SEQ_PUT_FIELD(s, field->prev_pid);
977 SEQ_PUT_FIELD_RET(s, field->prev_prio); 960 SEQ_PUT_FIELD(s, field->prev_prio);
978 SEQ_PUT_FIELD_RET(s, field->prev_state); 961 SEQ_PUT_FIELD(s, field->prev_state);
979 SEQ_PUT_FIELD_RET(s, field->next_pid); 962 SEQ_PUT_FIELD(s, field->next_cpu);
980 SEQ_PUT_FIELD_RET(s, field->next_prio); 963 SEQ_PUT_FIELD(s, field->next_pid);
981 SEQ_PUT_FIELD_RET(s, field->next_state); 964 SEQ_PUT_FIELD(s, field->next_prio);
965 SEQ_PUT_FIELD(s, field->next_state);
982 966
983 return TRACE_TYPE_HANDLED; 967 return trace_handle_return(s);
984} 968}
985 969
986static struct trace_event_functions trace_ctx_funcs = { 970static struct trace_event_functions trace_ctx_funcs = {
@@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1020 trace_assign_type(field, iter->ent); 1004 trace_assign_type(field, iter->ent);
1021 end = (unsigned long *)((long)iter->ent + iter->ent_size); 1005 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1022 1006
1023 if (!trace_seq_puts(s, "<stack trace>\n")) 1007 trace_seq_puts(s, "<stack trace>\n");
1024 goto partial;
1025 1008
1026 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { 1009 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1027 if (!trace_seq_puts(s, " => "))
1028 goto partial;
1029 1010
1030 if (!seq_print_ip_sym(s, *p, flags)) 1011 if (trace_seq_has_overflowed(s))
1031 goto partial; 1012 break;
1032 if (!trace_seq_putc(s, '\n'))
1033 goto partial;
1034 }
1035 1013
1036 return TRACE_TYPE_HANDLED; 1014 trace_seq_puts(s, " => ");
1015 seq_print_ip_sym(s, *p, flags);
1016 trace_seq_putc(s, '\n');
1017 }
1037 1018
1038 partial: 1019 return trace_handle_return(s);
1039 return TRACE_TYPE_PARTIAL_LINE;
1040} 1020}
1041 1021
1042static struct trace_event_functions trace_stack_funcs = { 1022static struct trace_event_functions trace_stack_funcs = {
@@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1057 1037
1058 trace_assign_type(field, iter->ent); 1038 trace_assign_type(field, iter->ent);
1059 1039
1060 if (!trace_seq_puts(s, "<user stack trace>\n")) 1040 trace_seq_puts(s, "<user stack trace>\n");
1061 goto partial; 1041 seq_print_userip_objs(field, s, flags);
1062
1063 if (!seq_print_userip_objs(field, s, flags))
1064 goto partial;
1065
1066 return TRACE_TYPE_HANDLED;
1067 1042
1068 partial: 1043 return trace_handle_return(s);
1069 return TRACE_TYPE_PARTIAL_LINE;
1070} 1044}
1071 1045
1072static struct trace_event_functions trace_user_stack_funcs = { 1046static struct trace_event_functions trace_user_stack_funcs = {
@@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags,
1089 1063
1090 trace_assign_type(field, entry); 1064 trace_assign_type(field, entry);
1091 1065
1092 if (!seq_print_ip_sym(s, field->ip, flags)) 1066 seq_print_ip_sym(s, field->ip, flags);
1093 goto partial; 1067 trace_seq_puts(s, ": ");
1068 trace_seq_puts(s, field->str);
1094 1069
1095 if (!trace_seq_puts(s, ": ")) 1070 return trace_handle_return(s);
1096 goto partial;
1097
1098 if (!trace_seq_puts(s, field->str))
1099 goto partial;
1100
1101 return TRACE_TYPE_HANDLED;
1102
1103 partial:
1104 return TRACE_TYPE_PARTIAL_LINE;
1105} 1071}
1106 1072
1107 1073
@@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags,
1114 1080
1115 trace_assign_type(field, iter->ent); 1081 trace_assign_type(field, iter->ent);
1116 1082
1117 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1083 trace_seq_printf(s, ": %lx : ", field->ip);
1118 goto partial; 1084 trace_seq_puts(s, field->str);
1119
1120 if (!trace_seq_puts(s, field->str))
1121 goto partial;
1122 1085
1123 return TRACE_TYPE_HANDLED; 1086 return trace_handle_return(s);
1124
1125 partial:
1126 return TRACE_TYPE_PARTIAL_LINE;
1127} 1087}
1128 1088
1129static struct trace_event_functions trace_bputs_funcs = { 1089static struct trace_event_functions trace_bputs_funcs = {
@@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags,
1147 1107
1148 trace_assign_type(field, entry); 1108 trace_assign_type(field, entry);
1149 1109
1150 if (!seq_print_ip_sym(s, field->ip, flags)) 1110 seq_print_ip_sym(s, field->ip, flags);
1151 goto partial; 1111 trace_seq_puts(s, ": ");
1152 1112 trace_seq_bprintf(s, field->fmt, field->buf);
1153 if (!trace_seq_puts(s, ": "))
1154 goto partial;
1155
1156 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1157 goto partial;
1158 1113
1159 return TRACE_TYPE_HANDLED; 1114 return trace_handle_return(s);
1160
1161 partial:
1162 return TRACE_TYPE_PARTIAL_LINE;
1163} 1115}
1164 1116
1165 1117
@@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags,
1172 1124
1173 trace_assign_type(field, iter->ent); 1125 trace_assign_type(field, iter->ent);
1174 1126
1175 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1127 trace_seq_printf(s, ": %lx : ", field->ip);
1176 goto partial; 1128 trace_seq_bprintf(s, field->fmt, field->buf);
1177
1178 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1179 goto partial;
1180 1129
1181 return TRACE_TYPE_HANDLED; 1130 return trace_handle_return(s);
1182
1183 partial:
1184 return TRACE_TYPE_PARTIAL_LINE;
1185} 1131}
1186 1132
1187static struct trace_event_functions trace_bprint_funcs = { 1133static struct trace_event_functions trace_bprint_funcs = {
@@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1203 1149
1204 trace_assign_type(field, iter->ent); 1150 trace_assign_type(field, iter->ent);
1205 1151
1206 if (!seq_print_ip_sym(s, field->ip, flags)) 1152 seq_print_ip_sym(s, field->ip, flags);
1207 goto partial; 1153 trace_seq_printf(s, ": %s", field->buf);
1208
1209 if (!trace_seq_printf(s, ": %s", field->buf))
1210 goto partial;
1211 1154
1212 return TRACE_TYPE_HANDLED; 1155 return trace_handle_return(s);
1213
1214 partial:
1215 return TRACE_TYPE_PARTIAL_LINE;
1216} 1156}
1217 1157
1218static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, 1158static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
@@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1222 1162
1223 trace_assign_type(field, iter->ent); 1163 trace_assign_type(field, iter->ent);
1224 1164
1225 if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) 1165 trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
1226 goto partial;
1227
1228 return TRACE_TYPE_HANDLED;
1229 1166
1230 partial: 1167 return trace_handle_return(&iter->seq);
1231 return TRACE_TYPE_PARTIAL_LINE;
1232} 1168}
1233 1169
1234static struct trace_event_functions trace_print_funcs = { 1170static struct trace_event_functions trace_print_funcs = {
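The trace_output.c hunks above all apply the same conversion: the trace_seq_*() writers no longer return a length to check, so each event print callback simply emits its output and finishes with trace_handle_return(), which maps the sequence's overflow state to TRACE_TYPE_HANDLED or TRACE_TYPE_PARTIAL_LINE. A minimal sketch of a converted handler under that pattern (the event type, the field names and trace_my_print() are hypothetical; trace_seq_printf(), trace_seq_putc(), seq_print_ip_sym() and trace_handle_return() are the helpers used throughout this series):

static enum print_line_t trace_my_print(struct trace_iterator *iter,
					int flags, struct trace_event *event)
{
	struct my_entry *field;			/* hypothetical event payload */
	struct trace_seq *s = &iter->seq;

	trace_assign_type(field, iter->ent);

	/* Write unconditionally; errors are tracked inside the trace_seq. */
	seq_print_ip_sym(s, field->ip, flags);
	trace_seq_printf(s, ": value=%lu", field->value);
	trace_seq_putc(s, '\n');

	/* Report PARTIAL_LINE only if the trace_seq overflowed on the way. */
	return trace_handle_return(s);
}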
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 80b25b585a70..8ef2c40efb3c 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
35extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
36extern struct rw_semaphore trace_event_sem; 36extern struct rw_semaphore trace_event_sem;
37 37
38#define SEQ_PUT_FIELD_RET(s, x) \ 38#define SEQ_PUT_FIELD(s, x) \
39do { \ 39 trace_seq_putmem(s, &(x), sizeof(x))
40 if (!trace_seq_putmem(s, &(x), sizeof(x))) \ 40
41 return TRACE_TYPE_PARTIAL_LINE; \ 41#define SEQ_PUT_HEX_FIELD(s, x) \
42} while (0) 42 trace_seq_putmem_hex(s, &(x), sizeof(x))
43
44#define SEQ_PUT_HEX_FIELD_RET(s, x) \
45do { \
46 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
47 return TRACE_TYPE_PARTIAL_LINE; \
48} while (0)
49 43
50#endif 44#endif
51 45
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2900817ba65c..c4e70b6bd7fa 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v)
305 seq_puts(m, "\\t"); 305 seq_puts(m, "\\t");
306 break; 306 break;
307 case '\\': 307 case '\\':
308 seq_puts(m, "\\"); 308 seq_putc(m, '\\');
309 break; 309 break;
310 case '"': 310 case '"':
311 seq_puts(m, "\\\""); 311 seq_puts(m, "\\\"");
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d4b9fc22cd27..b983b2fd2ca1 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -40,7 +40,8 @@ const char *reserved_field_names[] = {
40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ 40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
41 void *data, void *ent) \ 41 void *data, void *ent) \
42{ \ 42{ \
43 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ 43 trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
44 return !trace_seq_has_overflowed(s); \
44} \ 45} \
45const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ 46const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
46NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); 47NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
@@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
61 int len = *(u32 *)data >> 16; 62 int len = *(u32 *)data >> 16;
62 63
63 if (!len) 64 if (!len)
64 return trace_seq_printf(s, " %s=(fault)", name); 65 trace_seq_printf(s, " %s=(fault)", name);
65 else 66 else
66 return trace_seq_printf(s, " %s=\"%s\"", name, 67 trace_seq_printf(s, " %s=\"%s\"", name,
67 (const char *)get_loc_data(data, ent)); 68 (const char *)get_loc_data(data, ent));
69 return !trace_seq_has_overflowed(s);
68} 70}
69NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); 71NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
70 72
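The probe print helpers keep an int return because their callers still expect a success flag, so the idiom here differs slightly from the output handlers: write with the now-void trace_seq_printf() and then report success as !trace_seq_has_overflowed(). The sketch below is roughly what the DEFINE_BASIC_PRINT_TYPE_FUNC macro expands to for a 64-bit integer type after this change; it is illustrative, not a copy of the generated code:

int print_type_u64_sketch(struct trace_seq *s, const char *name,
			  void *data, void *ent)
{
	/* Emit " name=value"; any overflow is recorded inside the trace_seq. */
	trace_seq_printf(s, " %s=%Lu", name, *(u64 *)data);

	/* Callers still want a boolean: succeed unless the seq overflowed. */
	return !trace_seq_has_overflowed(s);
}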
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9b40f3..2e293beb186e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -14,122 +14,26 @@
14 14
15#include "trace.h" 15#include "trace.h"
16 16
17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled;
19static int sched_ref; 17static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 18static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped;
22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!call_filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51 19
52static void 20static void
53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) 21probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54{ 22{
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 int cpu;
58 int pc;
59
60 if (unlikely(!sched_ref)) 23 if (unlikely(!sched_ref))
61 return; 24 return;
62 25
63 tracing_record_cmdline(prev); 26 tracing_record_cmdline(prev);
64 tracing_record_cmdline(next); 27 tracing_record_cmdline(next);
65
66 if (!tracer_enabled || sched_stopped)
67 return;
68
69 pc = preempt_count();
70 local_irq_save(flags);
71 cpu = raw_smp_processor_id();
72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73
74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
76
77 local_irq_restore(flags);
78}
79
80void
81tracing_sched_wakeup_trace(struct trace_array *tr,
82 struct task_struct *wakee,
83 struct task_struct *curr,
84 unsigned long flags, int pc)
85{
86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc);
93 if (!event)
94 return;
95 entry = ring_buffer_event_data(event);
96 entry->prev_pid = curr->pid;
97 entry->prev_prio = curr->prio;
98 entry->prev_state = curr->state;
99 entry->next_pid = wakee->pid;
100 entry->next_prio = wakee->prio;
101 entry->next_state = wakee->state;
102 entry->next_cpu = task_cpu(wakee);
103
104 if (!call_filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106} 28}
107 29
108static void 30static void
109probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) 31probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
110{ 32{
111 struct trace_array_cpu *data;
112 unsigned long flags;
113 int cpu, pc;
114
115 if (unlikely(!sched_ref)) 33 if (unlikely(!sched_ref))
116 return; 34 return;
117 35
118 tracing_record_cmdline(current); 36 tracing_record_cmdline(current);
119
120 if (!tracer_enabled || sched_stopped)
121 return;
122
123 pc = preempt_count();
124 local_irq_save(flags);
125 cpu = raw_smp_processor_id();
126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127
128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
130 flags, pc);
131
132 local_irq_restore(flags);
133} 37}
134 38
135static int tracing_sched_register(void) 39static int tracing_sched_register(void)
@@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void)
197{ 101{
198 tracing_stop_sched_switch(); 102 tracing_stop_sched_switch();
199} 103}
200
201/**
202 * tracing_start_sched_switch_record - start tracing context switches
203 *
204 * Turns on context switch tracing for a tracer.
205 */
206void tracing_start_sched_switch_record(void)
207{
208 if (unlikely(!ctx_trace)) {
209 WARN_ON(1);
210 return;
211 }
212
213 tracing_start_sched_switch();
214
215 mutex_lock(&sched_register_mutex);
216 tracer_enabled++;
217 mutex_unlock(&sched_register_mutex);
218}
219
220/**
221 * tracing_stop_sched_switch_record - start tracing context switches
222 *
223 * Turns off context switch tracing for a tracer.
224 */
225void tracing_stop_sched_switch_record(void)
226{
227 mutex_lock(&sched_register_mutex);
228 tracer_enabled--;
229 WARN_ON(tracer_enabled < 0);
230 mutex_unlock(&sched_register_mutex);
231
232 tracing_stop_sched_switch();
233}
234
235/**
236 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
237 * @tr: trace array pointer to assign
238 *
239 * Some tracers might want to record the context switches in their
240 * trace. This function lets those tracers assign the trace array
241 * to use.
242 */
243void tracing_sched_switch_assign_trace(struct trace_array *tr)
244{
245 ctx_trace = tr;
246}
247
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 19bd8928ce94..8fb84b362816 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
365 wakeup_current_cpu = cpu; 365 wakeup_current_cpu = cpu;
366} 366}
367 367
368static void
369tracing_sched_switch_trace(struct trace_array *tr,
370 struct task_struct *prev,
371 struct task_struct *next,
372 unsigned long flags, int pc)
373{
374 struct ftrace_event_call *call = &event_context_switch;
375 struct ring_buffer *buffer = tr->trace_buffer.buffer;
376 struct ring_buffer_event *event;
377 struct ctx_switch_entry *entry;
378
379 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
380 sizeof(*entry), flags, pc);
381 if (!event)
382 return;
383 entry = ring_buffer_event_data(event);
384 entry->prev_pid = prev->pid;
385 entry->prev_prio = prev->prio;
386 entry->prev_state = prev->state;
387 entry->next_pid = next->pid;
388 entry->next_prio = next->prio;
389 entry->next_state = next->state;
390 entry->next_cpu = task_cpu(next);
391
392 if (!call_filter_check_discard(call, entry, buffer, event))
393 trace_buffer_unlock_commit(buffer, event, flags, pc);
394}
395
396static void
397tracing_sched_wakeup_trace(struct trace_array *tr,
398 struct task_struct *wakee,
399 struct task_struct *curr,
400 unsigned long flags, int pc)
401{
402 struct ftrace_event_call *call = &event_wakeup;
403 struct ring_buffer_event *event;
404 struct ctx_switch_entry *entry;
405 struct ring_buffer *buffer = tr->trace_buffer.buffer;
406
407 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
408 sizeof(*entry), flags, pc);
409 if (!event)
410 return;
411 entry = ring_buffer_event_data(event);
412 entry->prev_pid = curr->pid;
413 entry->prev_prio = curr->prio;
414 entry->prev_state = curr->state;
415 entry->next_pid = wakee->pid;
416 entry->next_prio = wakee->prio;
417 entry->next_state = wakee->state;
418 entry->next_cpu = task_cpu(wakee);
419
420 if (!call_filter_check_discard(call, entry, buffer, event))
421 trace_buffer_unlock_commit(buffer, event, flags, pc);
422}
423
368static void notrace 424static void notrace
369probe_wakeup_sched_switch(void *ignore, 425probe_wakeup_sched_switch(void *ignore,
370 struct task_struct *prev, struct task_struct *next) 426 struct task_struct *prev, struct task_struct *next)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1f24ed99dca2..f8b45d8792f9 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -27,10 +27,19 @@
27#include <linux/trace_seq.h> 27#include <linux/trace_seq.h>
28 28
29/* How much buffer is left on the trace_seq? */ 29/* How much buffer is left on the trace_seq? */
30#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) 30#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
31 31
32/* How much buffer is written? */ 32/* How much buffer is written? */
33#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) 33#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
34
35/*
36 * trace_seq should work with being initialized with 0s.
37 */
38static inline void __trace_seq_init(struct trace_seq *s)
39{
40 if (unlikely(!s->seq.size))
41 trace_seq_init(s);
42}
34 43
35/** 44/**
36 * trace_print_seq - move the contents of trace_seq into a seq_file 45 * trace_print_seq - move the contents of trace_seq into a seq_file
@@ -43,10 +52,11 @@
43 */ 52 */
44int trace_print_seq(struct seq_file *m, struct trace_seq *s) 53int trace_print_seq(struct seq_file *m, struct trace_seq *s)
45{ 54{
46 unsigned int len = TRACE_SEQ_BUF_USED(s);
47 int ret; 55 int ret;
48 56
49 ret = seq_write(m, s->buffer, len); 57 __trace_seq_init(s);
58
59 ret = seq_buf_print_seq(m, &s->seq);
50 60
51 /* 61 /*
52 * Only reset this buffer if we successfully wrote to the 62 * Only reset this buffer if we successfully wrote to the
@@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
69 * trace_seq_printf() is used to store strings into a special 79 * trace_seq_printf() is used to store strings into a special
70 * buffer (@s). Then the output may be either used by 80 * buffer (@s). Then the output may be either used by
71 * the sequencer or pulled into another buffer. 81 * the sequencer or pulled into another buffer.
72 *
73 * Returns 1 if we successfully written all the contents to
74 * the buffer.
75 * Returns 0 if we the length to write is bigger than the
76 * reserved buffer space. In this case, nothing gets written.
77 */ 82 */
78int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 83void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
79{ 84{
80 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 85 unsigned int save_len = s->seq.len;
81 va_list ap; 86 va_list ap;
82 int ret;
83 87
84 if (s->full || !len) 88 if (s->full)
85 return 0; 89 return;
90
91 __trace_seq_init(s);
86 92
87 va_start(ap, fmt); 93 va_start(ap, fmt);
88 ret = vsnprintf(s->buffer + s->len, len, fmt, ap); 94 seq_buf_vprintf(&s->seq, fmt, ap);
89 va_end(ap); 95 va_end(ap);
90 96
91 /* If we can't write it all, don't bother writing anything */ 97 /* If we can't write it all, don't bother writing anything */
92 if (ret >= len) { 98 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
99 s->seq.len = save_len;
93 s->full = 1; 100 s->full = 1;
94 return 0;
95 } 101 }
96
97 s->len += ret;
98
99 return 1;
100} 102}
101EXPORT_SYMBOL_GPL(trace_seq_printf); 103EXPORT_SYMBOL_GPL(trace_seq_printf);
102 104
@@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
107 * @nmaskbits: The number of bits that are valid in @maskp 109 * @nmaskbits: The number of bits that are valid in @maskp
108 * 110 *
109 * Writes a ASCII representation of a bitmask string into @s. 111 * Writes a ASCII representation of a bitmask string into @s.
110 *
111 * Returns 1 if we successfully written all the contents to
112 * the buffer.
113 * Returns 0 if we the length to write is bigger than the
114 * reserved buffer space. In this case, nothing gets written.
115 */ 112 */
116int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, 113void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
117 int nmaskbits) 114 int nmaskbits)
118{ 115{
119 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 116 unsigned int save_len = s->seq.len;
120 int ret;
121 117
122 if (s->full || !len) 118 if (s->full)
123 return 0; 119 return;
124 120
125 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); 121 __trace_seq_init(s);
126 s->len += ret;
127 122
128 return 1; 123 seq_buf_bitmask(&s->seq, maskp, nmaskbits);
124
125 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
126 s->seq.len = save_len;
127 s->full = 1;
128 }
129} 129}
130EXPORT_SYMBOL_GPL(trace_seq_bitmask); 130EXPORT_SYMBOL_GPL(trace_seq_bitmask);
131 131
@@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
139 * trace_seq_printf is used to store strings into a special 139 * trace_seq_printf is used to store strings into a special
140 * buffer (@s). Then the output may be either used by 140 * buffer (@s). Then the output may be either used by
141 * the sequencer or pulled into another buffer. 141 * the sequencer or pulled into another buffer.
142 *
143 * Returns how much it wrote to the buffer.
144 */ 142 */
145int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) 143void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
146{ 144{
147 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 145 unsigned int save_len = s->seq.len;
148 int ret;
149 146
150 if (s->full || !len) 147 if (s->full)
151 return 0; 148 return;
152 149
153 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 150 __trace_seq_init(s);
151
152 seq_buf_vprintf(&s->seq, fmt, args);
154 153
155 /* If we can't write it all, don't bother writing anything */ 154 /* If we can't write it all, don't bother writing anything */
156 if (ret >= len) { 155 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
156 s->seq.len = save_len;
157 s->full = 1; 157 s->full = 1;
158 return 0;
159 } 158 }
160
161 s->len += ret;
162
163 return len;
164} 159}
165EXPORT_SYMBOL_GPL(trace_seq_vprintf); 160EXPORT_SYMBOL_GPL(trace_seq_vprintf);
166 161
@@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf);
178 * 173 *
179 * This function will take the format and the binary array and finish 174 * This function will take the format and the binary array and finish
180 * the conversion into the ASCII string within the buffer. 175 * the conversion into the ASCII string within the buffer.
181 *
182 * Returns how much it wrote to the buffer.
183 */ 176 */
184int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 177void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
185{ 178{
186 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 179 unsigned int save_len = s->seq.len;
187 int ret;
188 180
189 if (s->full || !len) 181 if (s->full)
190 return 0; 182 return;
183
184 __trace_seq_init(s);
191 185
192 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 186 seq_buf_bprintf(&s->seq, fmt, binary);
193 187
194 /* If we can't write it all, don't bother writing anything */ 188 /* If we can't write it all, don't bother writing anything */
195 if (ret >= len) { 189 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
190 s->seq.len = save_len;
196 s->full = 1; 191 s->full = 1;
197 return 0; 192 return;
198 } 193 }
199
200 s->len += ret;
201
202 return len;
203} 194}
204EXPORT_SYMBOL_GPL(trace_seq_bprintf); 195EXPORT_SYMBOL_GPL(trace_seq_bprintf);
205 196
@@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf);
212 * copy to user routines. This function records a simple string 203 * copy to user routines. This function records a simple string
213 * into a special buffer (@s) for later retrieval by a sequencer 204 * into a special buffer (@s) for later retrieval by a sequencer
214 * or other mechanism. 205 * or other mechanism.
215 *
216 * Returns how much it wrote to the buffer.
217 */ 206 */
218int trace_seq_puts(struct trace_seq *s, const char *str) 207void trace_seq_puts(struct trace_seq *s, const char *str)
219{ 208{
220 unsigned int len = strlen(str); 209 unsigned int len = strlen(str);
221 210
222 if (s->full) 211 if (s->full)
223 return 0; 212 return;
213
214 __trace_seq_init(s);
224 215
225 if (len > TRACE_SEQ_BUF_LEFT(s)) { 216 if (len > TRACE_SEQ_BUF_LEFT(s)) {
226 s->full = 1; 217 s->full = 1;
227 return 0; 218 return;
228 } 219 }
229 220
230 memcpy(s->buffer + s->len, str, len); 221 seq_buf_putmem(&s->seq, str, len);
231 s->len += len;
232
233 return len;
234} 222}
235EXPORT_SYMBOL_GPL(trace_seq_puts); 223EXPORT_SYMBOL_GPL(trace_seq_puts);
236 224
@@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
243 * copy to user routines. This function records a simple charater 231 * copy to user routines. This function records a simple charater
244 * into a special buffer (@s) for later retrieval by a sequencer 232 * into a special buffer (@s) for later retrieval by a sequencer
245 * or other mechanism. 233 * or other mechanism.
246 *
247 * Returns how much it wrote to the buffer.
248 */ 234 */
249int trace_seq_putc(struct trace_seq *s, unsigned char c) 235void trace_seq_putc(struct trace_seq *s, unsigned char c)
250{ 236{
251 if (s->full) 237 if (s->full)
252 return 0; 238 return;
239
240 __trace_seq_init(s);
253 241
254 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 242 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
255 s->full = 1; 243 s->full = 1;
256 return 0; 244 return;
257 } 245 }
258 246
259 s->buffer[s->len++] = c; 247 seq_buf_putc(&s->seq, c);
260
261 return 1;
262} 248}
263EXPORT_SYMBOL_GPL(trace_seq_putc); 249EXPORT_SYMBOL_GPL(trace_seq_putc);
264 250
@@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc);
271 * There may be cases where raw memory needs to be written into the 257 * There may be cases where raw memory needs to be written into the
272 * buffer and a strcpy() would not work. Using this function allows 258 * buffer and a strcpy() would not work. Using this function allows
273 * for such cases. 259 * for such cases.
274 *
275 * Returns how much it wrote to the buffer.
276 */ 260 */
277int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) 261void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
278{ 262{
279 if (s->full) 263 if (s->full)
280 return 0; 264 return;
265
266 __trace_seq_init(s);
281 267
282 if (len > TRACE_SEQ_BUF_LEFT(s)) { 268 if (len > TRACE_SEQ_BUF_LEFT(s)) {
283 s->full = 1; 269 s->full = 1;
284 return 0; 270 return;
285 } 271 }
286 272
287 memcpy(s->buffer + s->len, mem, len); 273 seq_buf_putmem(&s->seq, mem, len);
288 s->len += len;
289
290 return len;
291} 274}
292EXPORT_SYMBOL_GPL(trace_seq_putmem); 275EXPORT_SYMBOL_GPL(trace_seq_putmem);
293 276
294#define MAX_MEMHEX_BYTES 8U
295#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
296
297/** 277/**
298 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex 278 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
299 * @s: trace sequence descriptor 279 * @s: trace sequence descriptor
@@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem);
303 * This is similar to trace_seq_putmem() except instead of just copying the 283 * This is similar to trace_seq_putmem() except instead of just copying the
304 * raw memory into the buffer it writes its ASCII representation of it 284 * raw memory into the buffer it writes its ASCII representation of it
305 * in hex characters. 285 * in hex characters.
306 *
307 * Returns how much it wrote to the buffer.
308 */ 286 */
309int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, 287void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
310 unsigned int len) 288 unsigned int len)
311{ 289{
312 unsigned char hex[HEX_CHARS]; 290 unsigned int save_len = s->seq.len;
313 const unsigned char *data = mem;
314 unsigned int start_len;
315 int i, j;
316 int cnt = 0;
317 291
318 if (s->full) 292 if (s->full)
319 return 0; 293 return;
320 294
321 while (len) { 295 __trace_seq_init(s);
322 start_len = min(len, HEX_CHARS - 1); 296
323#ifdef __BIG_ENDIAN 297 /* Each byte is represented by two chars */
324 for (i = 0, j = 0; i < start_len; i++) { 298 if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
325#else 299 s->full = 1;
326 for (i = start_len-1, j = 0; i >= 0; i--) { 300 return;
327#endif 301 }
328 hex[j++] = hex_asc_hi(data[i]); 302
329 hex[j++] = hex_asc_lo(data[i]); 303 /* The added spaces can still cause an overflow */
330 } 304 seq_buf_putmem_hex(&s->seq, mem, len);
331 if (WARN_ON_ONCE(j == 0 || j/2 > len)) 305
332 break; 306 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
333 307 s->seq.len = save_len;
334 /* j increments twice per loop */ 308 s->full = 1;
335 len -= j / 2; 309 return;
336 hex[j++] = ' ';
337
338 cnt += trace_seq_putmem(s, hex, j);
339 } 310 }
340 return cnt;
341} 311}
342EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); 312EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
343 313
@@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
355 */ 325 */
356int trace_seq_path(struct trace_seq *s, const struct path *path) 326int trace_seq_path(struct trace_seq *s, const struct path *path)
357{ 327{
358 unsigned char *p; 328 unsigned int save_len = s->seq.len;
359 329
360 if (s->full) 330 if (s->full)
361 return 0; 331 return 0;
362 332
333 __trace_seq_init(s);
334
363 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 335 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
364 s->full = 1; 336 s->full = 1;
365 return 0; 337 return 0;
366 } 338 }
367 339
368 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 340 seq_buf_path(&s->seq, path, "\n");
369 if (!IS_ERR(p)) { 341
370 p = mangle_path(s->buffer + s->len, p, "\n"); 342 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
371 if (p) { 343 s->seq.len = save_len;
372 s->len = p - s->buffer; 344 s->full = 1;
373 return 1; 345 return 0;
374 }
375 } else {
376 s->buffer[s->len++] = '?';
377 return 1;
378 } 346 }
379 347
380 s->full = 1; 348 return 1;
381 return 0;
382} 349}
383EXPORT_SYMBOL_GPL(trace_seq_path); 350EXPORT_SYMBOL_GPL(trace_seq_path);
384 351
@@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
404 */ 371 */
405int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) 372int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
406{ 373{
407 int len; 374 __trace_seq_init(s);
408 int ret; 375 return seq_buf_to_user(&s->seq, ubuf, cnt);
409
410 if (!cnt)
411 return 0;
412
413 if (s->len <= s->readpos)
414 return -EBUSY;
415
416 len = s->len - s->readpos;
417 if (cnt > len)
418 cnt = len;
419 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
420 if (ret == cnt)
421 return -EFAULT;
422
423 cnt -= ret;
424
425 s->readpos += cnt;
426 return cnt;
427} 376}
428EXPORT_SYMBOL_GPL(trace_seq_to_user); 377EXPORT_SYMBOL_GPL(trace_seq_to_user);
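The trace_seq conversion above delegates all buffer management to the embedded seq_buf and keeps only two pieces of local policy: lazy setup via __trace_seq_init() and the save-length/roll-back idiom, where a write that would overflow the buffer is undone so the output never ends mid-string. A small self-contained userspace model of that roll-back idiom follows; it is not the kernel API, just the shape of the logic, with made-up names (mini_seq, mini_seq_printf):

#include <stdarg.h>
#include <stdio.h>

struct mini_seq {
	char	buf[32];
	size_t	len;		/* bytes used */
	int	full;		/* set once an append did not fit */
};

/* Append formatted text; on overflow restore the old length ("all or nothing"). */
static void mini_seq_printf(struct mini_seq *s, const char *fmt, ...)
{
	size_t save_len = s->len;
	va_list ap;
	int ret;

	if (s->full)
		return;

	va_start(ap, fmt);
	ret = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
	va_end(ap);

	if (ret < 0 || (size_t)ret >= sizeof(s->buf) - s->len) {
		s->len = save_len;	/* roll back the partial write */
		s->full = 1;
		return;
	}
	s->len += ret;
}

int main(void)
{
	struct mini_seq s = { .len = 0, .full = 0 };

	mini_seq_printf(&s, "pid=%d ", 42);
	mini_seq_printf(&s, "comm=%s", "a-very-long-command-name");	/* overflows */
	printf("full=%d buf=\"%.*s\"\n", s.full, (int)s.len, s.buf);
	return 0;
}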
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 29228c4d5696..c6ee36fcbf90 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
114 struct trace_entry *ent = iter->ent; 114 struct trace_entry *ent = iter->ent;
115 struct syscall_trace_enter *trace; 115 struct syscall_trace_enter *trace;
116 struct syscall_metadata *entry; 116 struct syscall_metadata *entry;
117 int i, ret, syscall; 117 int i, syscall;
118 118
119 trace = (typeof(trace))ent; 119 trace = (typeof(trace))ent;
120 syscall = trace->nr; 120 syscall = trace->nr;
@@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
128 goto end; 128 goto end;
129 } 129 }
130 130
131 ret = trace_seq_printf(s, "%s(", entry->name); 131 trace_seq_printf(s, "%s(", entry->name);
132 if (!ret)
133 return TRACE_TYPE_PARTIAL_LINE;
134 132
135 for (i = 0; i < entry->nb_args; i++) { 133 for (i = 0; i < entry->nb_args; i++) {
134
135 if (trace_seq_has_overflowed(s))
136 goto end;
137
136 /* parameter types */ 138 /* parameter types */
137 if (trace_flags & TRACE_ITER_VERBOSE) { 139 if (trace_flags & TRACE_ITER_VERBOSE)
138 ret = trace_seq_printf(s, "%s ", entry->types[i]); 140 trace_seq_printf(s, "%s ", entry->types[i]);
139 if (!ret) 141
140 return TRACE_TYPE_PARTIAL_LINE;
141 }
142 /* parameter values */ 142 /* parameter values */
143 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], 143 trace_seq_printf(s, "%s: %lx%s", entry->args[i],
144 trace->args[i], 144 trace->args[i],
145 i == entry->nb_args - 1 ? "" : ", "); 145 i == entry->nb_args - 1 ? "" : ", ");
146 if (!ret)
147 return TRACE_TYPE_PARTIAL_LINE;
148 } 146 }
149 147
150 ret = trace_seq_putc(s, ')'); 148 trace_seq_putc(s, ')');
151 if (!ret)
152 return TRACE_TYPE_PARTIAL_LINE;
153
154end: 149end:
155 ret = trace_seq_putc(s, '\n'); 150 trace_seq_putc(s, '\n');
156 if (!ret)
157 return TRACE_TYPE_PARTIAL_LINE;
158 151
159 return TRACE_TYPE_HANDLED; 152 return trace_handle_return(s);
160} 153}
161 154
162static enum print_line_t 155static enum print_line_t
@@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
168 struct syscall_trace_exit *trace; 161 struct syscall_trace_exit *trace;
169 int syscall; 162 int syscall;
170 struct syscall_metadata *entry; 163 struct syscall_metadata *entry;
171 int ret;
172 164
173 trace = (typeof(trace))ent; 165 trace = (typeof(trace))ent;
174 syscall = trace->nr; 166 syscall = trace->nr;
@@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
176 168
177 if (!entry) { 169 if (!entry) {
178 trace_seq_putc(s, '\n'); 170 trace_seq_putc(s, '\n');
179 return TRACE_TYPE_HANDLED; 171 goto out;
180 } 172 }
181 173
182 if (entry->exit_event->event.type != ent->type) { 174 if (entry->exit_event->event.type != ent->type) {
@@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
184 return TRACE_TYPE_UNHANDLED; 176 return TRACE_TYPE_UNHANDLED;
185 } 177 }
186 178
187 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 179 trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
188 trace->ret); 180 trace->ret);
189 if (!ret)
190 return TRACE_TYPE_PARTIAL_LINE;
191 181
192 return TRACE_TYPE_HANDLED; 182 out:
183 return trace_handle_return(s);
193} 184}
194 185
195extern char *__bad_type_size(void); 186extern char *__bad_type_size(void);
@@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
523 return (unsigned long)sys_call_table[nr]; 514 return (unsigned long)sys_call_table[nr];
524} 515}
525 516
526static int __init init_ftrace_syscalls(void) 517void __init init_ftrace_syscalls(void)
527{ 518{
528 struct syscall_metadata *meta; 519 struct syscall_metadata *meta;
529 unsigned long addr; 520 unsigned long addr;
@@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void)
533 GFP_KERNEL); 524 GFP_KERNEL);
534 if (!syscalls_metadata) { 525 if (!syscalls_metadata) {
535 WARN_ON(1); 526 WARN_ON(1);
536 return -ENOMEM; 527 return;
537 } 528 }
538 529
539 for (i = 0; i < NR_syscalls; i++) { 530 for (i = 0; i < NR_syscalls; i++) {
@@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void)
545 meta->syscall_nr = i; 536 meta->syscall_nr = i;
546 syscalls_metadata[i] = meta; 537 syscalls_metadata[i] = meta;
547 } 538 }
548
549 return 0;
550} 539}
551early_initcall(init_ftrace_syscalls);
552 540
553#ifdef CONFIG_PERF_EVENTS 541#ifdef CONFIG_PERF_EVENTS
554 542
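print_syscall_enter() above also shows how loops are handled after the conversion: instead of checking every writer's return value, the loop tests trace_seq_has_overflowed() once per iteration and bails out early, and the final status still comes from trace_handle_return(). A condensed sketch of just that loop, pulled into a hypothetical helper and with the verbose-types branch dropped (the helpers and field names are the ones used in the hunk):

static enum print_line_t print_args_sketch(struct trace_seq *s,
					   struct syscall_metadata *entry,
					   struct syscall_trace_enter *trace)
{
	int i;

	trace_seq_printf(s, "%s(", entry->name);

	for (i = 0; i < entry->nb_args; i++) {
		/* One cheap check per iteration replaces per-call checks. */
		if (trace_seq_has_overflowed(s))
			break;

		trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				 trace->args[i],
				 i == entry->nb_args - 1 ? "" : ", ");
	}

	/* These become no-ops if the seq is already full. */
	trace_seq_printf(s, ")\n");

	return trace_handle_return(s);
}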
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 33ff6a24b802..8520acc34b18 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -552,8 +552,7 @@ error:
552 return ret; 552 return ret;
553 553
554fail_address_parse: 554fail_address_parse:
555 if (inode) 555 iput(inode);
556 iput(inode);
557 556
558 pr_info("Failed to parse address or file.\n"); 557 pr_info("Failed to parse address or file.\n");
559 558
@@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
606 for (i = 0; i < tu->tp.nr_args; i++) 605 for (i = 0; i < tu->tp.nr_args; i++)
607 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); 606 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
608 607
609 seq_printf(m, "\n"); 608 seq_putc(m, '\n');
610 return 0; 609 return 0;
611} 610}
612 611
@@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
852 tu = container_of(event, struct trace_uprobe, tp.call.event); 851 tu = container_of(event, struct trace_uprobe, tp.call.event);
853 852
854 if (is_ret_probe(tu)) { 853 if (is_ret_probe(tu)) {
855 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", 854 trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
856 ftrace_event_name(&tu->tp.call), 855 ftrace_event_name(&tu->tp.call),
857 entry->vaddr[1], entry->vaddr[0])) 856 entry->vaddr[1], entry->vaddr[0]);
858 goto partial;
859 data = DATAOF_TRACE_ENTRY(entry, true); 857 data = DATAOF_TRACE_ENTRY(entry, true);
860 } else { 858 } else {
861 if (!trace_seq_printf(s, "%s: (0x%lx)", 859 trace_seq_printf(s, "%s: (0x%lx)",
862 ftrace_event_name(&tu->tp.call), 860 ftrace_event_name(&tu->tp.call),
863 entry->vaddr[0])) 861 entry->vaddr[0]);
864 goto partial;
865 data = DATAOF_TRACE_ENTRY(entry, false); 862 data = DATAOF_TRACE_ENTRY(entry, false);
866 } 863 }
867 864
@@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
869 struct probe_arg *parg = &tu->tp.args[i]; 866 struct probe_arg *parg = &tu->tp.args[i];
870 867
871 if (!parg->type->print(s, parg->name, data + parg->offset, entry)) 868 if (!parg->type->print(s, parg->name, data + parg->offset, entry))
872 goto partial; 869 goto out;
873 } 870 }
874 871
875 if (trace_seq_puts(s, "\n")) 872 trace_seq_putc(s, '\n');
876 return TRACE_TYPE_HANDLED;
877 873
878partial: 874 out:
879 return TRACE_TYPE_PARTIAL_LINE; 875 return trace_handle_return(s);
880} 876}
881 877
882typedef bool (*filter_func_t)(struct uprobe_consumer *self, 878typedef bool (*filter_func_t)(struct uprobe_consumer *self,
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bbbceff..d58cc4d8f0d1 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
176 struct group_info *group_info; 176 struct group_info *group_info;
177 int retval; 177 int retval;
178 178
179 if (!ns_capable(current_user_ns(), CAP_SETGID)) 179 if (!may_setgroups())
180 return -EPERM; 180 return -EPERM;
181 if ((unsigned)gidsetsize > NGROUPS_MAX) 181 if ((unsigned)gidsetsize > NGROUPS_MAX)
182 return -EINVAL; 182 return -EINVAL;
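setgroups16() now gates on may_setgroups() rather than a bare CAP_SETGID check. That helper lives in kernel/groups.c, which the diffstat lists but this section does not show; based on the userns_may_setgroups() helper added later in this diff, it presumably combines the old capability test with the new per-namespace switch, roughly as sketched below (an assumption about unshown code, not a quote from this series):

/* Assumed shape of may_setgroups() (kernel/groups.c is not shown here):
 * keep the capability check and additionally require that the current
 * user namespace still permits setgroups(). */
bool may_setgroups(void)
{
	struct user_namespace *user_ns = current_user_ns();

	return ns_capable(user_ns, CAP_SETGID) &&
	       userns_may_setgroups(user_ns);
}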
diff --git a/kernel/user.c b/kernel/user.c
index 4efa39350e44..b069ccbfb0b0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,7 +50,11 @@ struct user_namespace init_user_ns = {
50 .count = ATOMIC_INIT(3), 50 .count = ATOMIC_INIT(3),
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .ns.inum = PROC_USER_INIT_INO,
54#ifdef CONFIG_USER_NS
55 .ns.ops = &userns_operations,
56#endif
57 .flags = USERNS_INIT_FLAGS,
54#ifdef CONFIG_PERSISTENT_KEYRINGS 58#ifdef CONFIG_PERSISTENT_KEYRINGS
55 .persistent_keyring_register_sem = 59 .persistent_keyring_register_sem =
56 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), 60 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa312b0dc3ec..4109f8320684 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
24#include <linux/fs_struct.h> 24#include <linux/fs_struct.h>
25 25
26static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
27static DEFINE_MUTEX(userns_state_mutex);
27 28
28static bool new_idmap_permitted(const struct file *file, 29static bool new_idmap_permitted(const struct file *file,
29 struct user_namespace *ns, int cap_setid, 30 struct user_namespace *ns, int cap_setid,
@@ -86,11 +87,12 @@ int create_user_ns(struct cred *new)
86 if (!ns) 87 if (!ns)
87 return -ENOMEM; 88 return -ENOMEM;
88 89
89 ret = proc_alloc_inum(&ns->proc_inum); 90 ret = ns_alloc_inum(&ns->ns);
90 if (ret) { 91 if (ret) {
91 kmem_cache_free(user_ns_cachep, ns); 92 kmem_cache_free(user_ns_cachep, ns);
92 return ret; 93 return ret;
93 } 94 }
95 ns->ns.ops = &userns_operations;
94 96
95 atomic_set(&ns->count, 1); 97 atomic_set(&ns->count, 1);
96 /* Leave the new->user_ns reference with the new user namespace. */ 98 /* Leave the new->user_ns reference with the new user namespace. */
@@ -99,6 +101,11 @@ int create_user_ns(struct cred *new)
99 ns->owner = owner; 101 ns->owner = owner;
100 ns->group = group; 102 ns->group = group;
101 103
104 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
105 mutex_lock(&userns_state_mutex);
106 ns->flags = parent_ns->flags;
107 mutex_unlock(&userns_state_mutex);
108
102 set_cred_user_ns(new, ns); 109 set_cred_user_ns(new, ns);
103 110
104#ifdef CONFIG_PERSISTENT_KEYRINGS 111#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns)
136#ifdef CONFIG_PERSISTENT_KEYRINGS 143#ifdef CONFIG_PERSISTENT_KEYRINGS
137 key_put(ns->persistent_keyring_register); 144 key_put(ns->persistent_keyring_register);
138#endif 145#endif
139 proc_free_inum(ns->proc_inum); 146 ns_free_inum(&ns->ns);
140 kmem_cache_free(user_ns_cachep, ns); 147 kmem_cache_free(user_ns_cachep, ns);
141 ns = parent; 148 ns = parent;
142 } while (atomic_dec_and_test(&parent->count)); 149 } while (atomic_dec_and_test(&parent->count));
@@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
583 return false; 590 return false;
584} 591}
585 592
586
587static DEFINE_MUTEX(id_map_mutex);
588
589static ssize_t map_write(struct file *file, const char __user *buf, 593static ssize_t map_write(struct file *file, const char __user *buf,
590 size_t count, loff_t *ppos, 594 size_t count, loff_t *ppos,
591 int cap_setid, 595 int cap_setid,
@@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
602 ssize_t ret = -EINVAL; 606 ssize_t ret = -EINVAL;
603 607
604 /* 608 /*
605 * The id_map_mutex serializes all writes to any given map. 609 * The userns_state_mutex serializes all writes to any given map.
606 * 610 *
607 * Any map is only ever written once. 611 * Any map is only ever written once.
608 * 612 *
@@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
620 * order and smp_rmb() is guaranteed that we don't have crazy 624 * order and smp_rmb() is guaranteed that we don't have crazy
621 * architectures returning stale data. 625 * architectures returning stale data.
622 */ 626 */
623 mutex_lock(&id_map_mutex); 627 mutex_lock(&userns_state_mutex);
624 628
625 ret = -EPERM; 629 ret = -EPERM;
626 /* Only allow one successful write to the map */ 630 /* Only allow one successful write to the map */
@@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
640 if (!page) 644 if (!page)
641 goto out; 645 goto out;
642 646
643 /* Only allow <= page size writes at the beginning of the file */ 647 /* Only allow < page size writes at the beginning of the file */
644 ret = -EINVAL; 648 ret = -EINVAL;
645 if ((*ppos != 0) || (count >= PAGE_SIZE)) 649 if ((*ppos != 0) || (count >= PAGE_SIZE))
646 goto out; 650 goto out;
@@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
750 *ppos = count; 754 *ppos = count;
751 ret = count; 755 ret = count;
752out: 756out:
753 mutex_unlock(&id_map_mutex); 757 mutex_unlock(&userns_state_mutex);
754 if (page) 758 if (page)
755 free_page(page); 759 free_page(page);
756 return ret; 760 return ret;
@@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file,
812 struct user_namespace *ns, int cap_setid, 816 struct user_namespace *ns, int cap_setid,
813 struct uid_gid_map *new_map) 817 struct uid_gid_map *new_map)
814{ 818{
815 /* Allow mapping to your own filesystem ids */ 819 const struct cred *cred = file->f_cred;
816 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { 820 /* Don't allow mappings that would allow anything that wouldn't
821 * be allowed without the establishment of unprivileged mappings.
822 */
823 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
824 uid_eq(ns->owner, cred->euid)) {
817 u32 id = new_map->extent[0].lower_first; 825 u32 id = new_map->extent[0].lower_first;
818 if (cap_setid == CAP_SETUID) { 826 if (cap_setid == CAP_SETUID) {
819 kuid_t uid = make_kuid(ns->parent, id); 827 kuid_t uid = make_kuid(ns->parent, id);
820 if (uid_eq(uid, file->f_cred->fsuid)) 828 if (uid_eq(uid, cred->euid))
821 return true; 829 return true;
822 } else if (cap_setid == CAP_SETGID) { 830 } else if (cap_setid == CAP_SETGID) {
823 kgid_t gid = make_kgid(ns->parent, id); 831 kgid_t gid = make_kgid(ns->parent, id);
824 if (gid_eq(gid, file->f_cred->fsgid)) 832 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
833 gid_eq(gid, cred->egid))
825 return true; 834 return true;
826 } 835 }
827 } 836 }
@@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file,
841 return false; 850 return false;
842} 851}
843 852
844static void *userns_get(struct task_struct *task) 853int proc_setgroups_show(struct seq_file *seq, void *v)
854{
855 struct user_namespace *ns = seq->private;
856 unsigned long userns_flags = ACCESS_ONCE(ns->flags);
857
858 seq_printf(seq, "%s\n",
859 (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
860 "allow" : "deny");
861 return 0;
862}
863
864ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
865 size_t count, loff_t *ppos)
866{
867 struct seq_file *seq = file->private_data;
868 struct user_namespace *ns = seq->private;
869 char kbuf[8], *pos;
870 bool setgroups_allowed;
871 ssize_t ret;
872
873 /* Only allow a very narrow range of strings to be written */
874 ret = -EINVAL;
875 if ((*ppos != 0) || (count >= sizeof(kbuf)))
876 goto out;
877
878 /* What was written? */
879 ret = -EFAULT;
880 if (copy_from_user(kbuf, buf, count))
881 goto out;
882 kbuf[count] = '\0';
883 pos = kbuf;
884
885 /* What is being requested? */
886 ret = -EINVAL;
887 if (strncmp(pos, "allow", 5) == 0) {
888 pos += 5;
889 setgroups_allowed = true;
890 }
891 else if (strncmp(pos, "deny", 4) == 0) {
892 pos += 4;
893 setgroups_allowed = false;
894 }
895 else
896 goto out;
897
898 /* Verify there is not trailing junk on the line */
899 pos = skip_spaces(pos);
900 if (*pos != '\0')
901 goto out;
902
903 ret = -EPERM;
904 mutex_lock(&userns_state_mutex);
905 if (setgroups_allowed) {
906 /* Enabling setgroups after setgroups has been disabled
907 * is not allowed.
908 */
909 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
910 goto out_unlock;
911 } else {
912 /* Permanently disabling setgroups after setgroups has
913 * been enabled by writing the gid_map is not allowed.
914 */
915 if (ns->gid_map.nr_extents != 0)
916 goto out_unlock;
917 ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
918 }
919 mutex_unlock(&userns_state_mutex);
920
921 /* Report a successful write */
922 *ppos = count;
923 ret = count;
924out:
925 return ret;
926out_unlock:
927 mutex_unlock(&userns_state_mutex);
928 goto out;
929}
930
931bool userns_may_setgroups(const struct user_namespace *ns)
932{
933 bool allowed;
934
935 mutex_lock(&userns_state_mutex);
936 /* It is not safe to use setgroups until a gid mapping in
937 * the user namespace has been established.
938 */
939 allowed = ns->gid_map.nr_extents != 0;
940 /* Is setgroups allowed? */
941 allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
942 mutex_unlock(&userns_state_mutex);
943
944 return allowed;
945}
946
947static inline struct user_namespace *to_user_ns(struct ns_common *ns)
948{
949 return container_of(ns, struct user_namespace, ns);
950}
951
952static struct ns_common *userns_get(struct task_struct *task)
845{ 953{
846 struct user_namespace *user_ns; 954 struct user_namespace *user_ns;
847 955
@@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task)
849 user_ns = get_user_ns(__task_cred(task)->user_ns); 957 user_ns = get_user_ns(__task_cred(task)->user_ns);
850 rcu_read_unlock(); 958 rcu_read_unlock();
851 959
852 return user_ns; 960 return user_ns ? &user_ns->ns : NULL;
853} 961}
854 962
855static void userns_put(void *ns) 963static void userns_put(struct ns_common *ns)
856{ 964{
857 put_user_ns(ns); 965 put_user_ns(to_user_ns(ns));
858} 966}
859 967
860static int userns_install(struct nsproxy *nsproxy, void *ns) 968static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
861{ 969{
862 struct user_namespace *user_ns = ns; 970 struct user_namespace *user_ns = to_user_ns(ns);
863 struct cred *cred; 971 struct cred *cred;
864 972
865 /* Don't allow gaining capabilities by reentering 973 /* Don't allow gaining capabilities by reentering
@@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
888 return commit_creds(cred); 996 return commit_creds(cred);
889} 997}
890 998
891static unsigned int userns_inum(void *ns)
892{
893 struct user_namespace *user_ns = ns;
894 return user_ns->proc_inum;
895}
896
897const struct proc_ns_operations userns_operations = { 999const struct proc_ns_operations userns_operations = {
898 .name = "user", 1000 .name = "user",
899 .type = CLONE_NEWUSER, 1001 .type = CLONE_NEWUSER,
900 .get = userns_get, 1002 .get = userns_get,
901 .put = userns_put, 1003 .put = userns_put,
902 .install = userns_install, 1004 .install = userns_install,
903 .inum = userns_inum,
904}; 1005};
905 1006
906static __init int user_namespaces_init(void) 1007static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 883aaaa7de8a..831ea7108232 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
42 if (!ns) 42 if (!ns)
43 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
44 44
45 err = proc_alloc_inum(&ns->proc_inum); 45 err = ns_alloc_inum(&ns->ns);
46 if (err) { 46 if (err) {
47 kfree(ns); 47 kfree(ns);
48 return ERR_PTR(err); 48 return ERR_PTR(err);
49 } 49 }
50 50
51 ns->ns.ops = &utsns_operations;
52
51 down_read(&uts_sem); 53 down_read(&uts_sem);
52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 54 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
53 ns->user_ns = get_user_ns(user_ns); 55 ns->user_ns = get_user_ns(user_ns);
@@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref)
84 86
85 ns = container_of(kref, struct uts_namespace, kref); 87 ns = container_of(kref, struct uts_namespace, kref);
86 put_user_ns(ns->user_ns); 88 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum); 89 ns_free_inum(&ns->ns);
88 kfree(ns); 90 kfree(ns);
89} 91}
90 92
91static void *utsns_get(struct task_struct *task) 93static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
94{
95 return container_of(ns, struct uts_namespace, ns);
96}
97
98static struct ns_common *utsns_get(struct task_struct *task)
92{ 99{
93 struct uts_namespace *ns = NULL; 100 struct uts_namespace *ns = NULL;
94 struct nsproxy *nsproxy; 101 struct nsproxy *nsproxy;
@@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task)
101 } 108 }
102 task_unlock(task); 109 task_unlock(task);
103 110
104 return ns; 111 return ns ? &ns->ns : NULL;
105} 112}
106 113
107static void utsns_put(void *ns) 114static void utsns_put(struct ns_common *ns)
108{ 115{
109 put_uts_ns(ns); 116 put_uts_ns(to_uts_ns(ns));
110} 117}
111 118
112static int utsns_install(struct nsproxy *nsproxy, void *new) 119static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
113{ 120{
114 struct uts_namespace *ns = new; 121 struct uts_namespace *ns = to_uts_ns(new);
115 122
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || 123 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 124 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
123 return 0; 130 return 0;
124} 131}
125 132
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
133const struct proc_ns_operations utsns_operations = { 133const struct proc_ns_operations utsns_operations = {
134 .name = "uts", 134 .name = "uts",
135 .type = CLONE_NEWUTS, 135 .type = CLONE_NEWUTS,
136 .get = utsns_get, 136 .get = utsns_get,
137 .put = utsns_put, 137 .put = utsns_put,
138 .install = utsns_install, 138 .install = utsns_install,
139 .inum = utsns_inum,
140}; 139};
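After the change, utsns_operations (like userns_operations earlier in this diff) carries only name, type, get, put and install. A toy version of such an operations table in plain C, with invented tiny_ns / tiny_ns_ops names and only get/put shown, illustrates how generic code can work purely through the common struct and its function pointers:

	#include <stdio.h>

	/* Hypothetical miniature of the proc_ns_operations idea (names are made up):
	 * one table of function pointers per namespace type, all of them working on
	 * a pointer to the common struct rather than on void *. */
	struct tiny_ns;

	struct tiny_ns_ops {
		const char *name;
		struct tiny_ns *(*get)(void *task);
		void (*put)(struct tiny_ns *ns);
	};

	struct tiny_ns {
		unsigned int inum;               /* lives in the common part ...          */
		const struct tiny_ns_ops *ops;   /* ... so no per-type .inum hook needed  */
	};

	/* A "uts"-flavoured implementation. */
	static struct tiny_ns *tiny_uts_get(void *task);
	static void tiny_uts_put(struct tiny_ns *ns);

	static const struct tiny_ns_ops tiny_uts_ops = {
		.name = "uts",
		.get  = tiny_uts_get,
		.put  = tiny_uts_put,
	};

	static struct tiny_ns uts_instance = { .inum = 0xefffffee, .ops = &tiny_uts_ops };

	static struct tiny_ns *tiny_uts_get(void *task) { (void)task; return &uts_instance; }
	static void tiny_uts_put(struct tiny_ns *ns)    { (void)ns; }

	int main(void)
	{
		/* Generic code: fetch via the ops table, read inum from the common part. */
		struct tiny_ns *ns = tiny_uts_ops.get(NULL);
		printf("%s inum=%#x\n", ns->ops->name, ns->inum);
		ns->ops->put(ns);
		return 0;
	}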
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 09b685daee3d..beeeac9e0e3e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1804 struct worker_pool *pool = (void *)__pool; 1804 struct worker_pool *pool = (void *)__pool;
1805 struct work_struct *work; 1805 struct work_struct *work;
1806 1806
1807 spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ 1807 spin_lock_irq(&pool->lock);
1808 spin_lock(&pool->lock); 1808 spin_lock(&wq_mayday_lock); /* for wq->maydays */
1809 1809
1810 if (need_to_create_worker(pool)) { 1810 if (need_to_create_worker(pool)) {
1811 /* 1811 /*
@@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1818 send_mayday(work); 1818 send_mayday(work);
1819 } 1819 }
1820 1820
1821 spin_unlock(&pool->lock); 1821 spin_unlock(&wq_mayday_lock);
1822 spin_unlock_irq(&wq_mayday_lock); 1822 spin_unlock_irq(&pool->lock);
1823 1823
1824 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1824 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1825} 1825}
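This hunk makes pool->lock the outer lock and wq_mayday_lock the inner one; the rescuer hunk further down takes wq_mayday_lock while already holding pool->lock, so every path that nests the two has to agree on that order or an AB-BA deadlock becomes possible. A userspace sketch of the rule, with pthread mutexes standing in for the kernel spinlocks (it ignores the IRQ-disabling aspect of spin_lock_irq(), and both functions are invented stand-ins for the real paths):

	#include <pthread.h>
	#include <stdio.h>

	/* Stand-ins for pool->lock and wq_mayday_lock (illustration only). */
	static pthread_mutex_t pool_lock   = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t mayday_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Every path that needs both locks takes pool_lock first, mayday_lock second.
	 * If one path took them in the opposite order, two threads could each hold
	 * one lock while waiting for the other (AB-BA deadlock). */
	static void mayday_timeout_path(void)
	{
		pthread_mutex_lock(&pool_lock);      /* outer, like spin_lock_irq(&pool->lock) */
		pthread_mutex_lock(&mayday_lock);    /* inner, like spin_lock(&wq_mayday_lock) */

		printf("scanning worklist, sending maydays\n");

		pthread_mutex_unlock(&mayday_lock);  /* release in reverse order */
		pthread_mutex_unlock(&pool_lock);
	}

	static void rescuer_path(void)
	{
		pthread_mutex_lock(&pool_lock);
		pthread_mutex_lock(&mayday_lock);

		printf("requeueing pwq on the mayday list\n");

		pthread_mutex_unlock(&mayday_lock);
		pthread_mutex_unlock(&pool_lock);
	}

	int main(void)
	{
		mayday_timeout_path();
		rescuer_path();
		return 0;
	}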
@@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool)
1841 * spin_lock_irq(pool->lock) which may be released and regrabbed 1841 * spin_lock_irq(pool->lock) which may be released and regrabbed
1842 * multiple times. Does GFP_KERNEL allocations. Called only from 1842 * multiple times. Does GFP_KERNEL allocations. Called only from
1843 * manager. 1843 * manager.
1844 *
1845 * Return:
1846 * %false if no action was taken and pool->lock stayed locked, %true
1847 * otherwise.
1848 */ 1844 */
1849static bool maybe_create_worker(struct worker_pool *pool) 1845static void maybe_create_worker(struct worker_pool *pool)
1850__releases(&pool->lock) 1846__releases(&pool->lock)
1851__acquires(&pool->lock) 1847__acquires(&pool->lock)
1852{ 1848{
1853 if (!need_to_create_worker(pool))
1854 return false;
1855restart: 1849restart:
1856 spin_unlock_irq(&pool->lock); 1850 spin_unlock_irq(&pool->lock);
1857 1851
@@ -1877,7 +1871,6 @@ restart:
1877 */ 1871 */
1878 if (need_to_create_worker(pool)) 1872 if (need_to_create_worker(pool))
1879 goto restart; 1873 goto restart;
1880 return true;
1881} 1874}
1882 1875
1883/** 1876/**
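Apart from losing its boolean return value, maybe_create_worker() keeps its shape: called with pool->lock held, it drops the lock (hence the __releases/__acquires annotations), does the blocking work, retakes the lock, rechecks the condition and loops via goto restart. A generic userspace sketch of that drop-lock/recheck/retry pattern, with an invented need_more_workers() condition and a pthread mutex in place of the spinlock (not the kernel code):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
	static int nr_workers;                   /* hypothetical pool state */

	static bool need_more_workers(void)      /* caller must hold pool_lock */
	{
		return nr_workers < 2;
	}

	static void create_one_worker(void)      /* may block; must not hold pool_lock */
	{
		printf("creating worker\n");
	}

	/* Shape of maybe_create_worker() after the change: no return value, just
	 * "drop the lock, do the blocking work, retake the lock, recheck, retry".
	 * Called with pool_lock held, returns with pool_lock held. */
	static void maybe_create_worker(void)
	{
	restart:
		pthread_mutex_unlock(&pool_lock);    /* blocking work can't run under the lock */

		create_one_worker();

		pthread_mutex_lock(&pool_lock);
		nr_workers++;

		/*
		 * While the lock was released the world may have changed, so the
		 * condition checked before dropping it must be checked again.
		 */
		if (need_more_workers())
			goto restart;
	}

	int main(void)
	{
		pthread_mutex_lock(&pool_lock);
		if (need_more_workers())
			maybe_create_worker();
		pthread_mutex_unlock(&pool_lock);
		printf("workers: %d\n", nr_workers);
		return 0;
	}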
@@ -1897,16 +1890,14 @@ restart:
1897 * multiple times. Does GFP_KERNEL allocations. 1890 * multiple times. Does GFP_KERNEL allocations.
1898 * 1891 *
1899 * Return: 1892 * Return:
1900 * %false if the pool don't need management and the caller can safely start 1893 * %false if the pool doesn't need management and the caller can safely
1901 * processing works, %true indicates that the function released pool->lock 1894 * start processing works, %true if management function was performed and
1902 * and reacquired it to perform some management function and that the 1895 * the conditions that the caller verified before calling the function may
1903 * conditions that the caller verified while holding the lock before 1896 * no longer be true.
1904 * calling the function might no longer be true.
1905 */ 1897 */
1906static bool manage_workers(struct worker *worker) 1898static bool manage_workers(struct worker *worker)
1907{ 1899{
1908 struct worker_pool *pool = worker->pool; 1900 struct worker_pool *pool = worker->pool;
1909 bool ret = false;
1910 1901
1911 /* 1902 /*
1912 * Anyone who successfully grabs manager_arb wins the arbitration 1903 * Anyone who successfully grabs manager_arb wins the arbitration
@@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker)
1919 * actual management, the pool may stall indefinitely. 1910 * actual management, the pool may stall indefinitely.
1920 */ 1911 */
1921 if (!mutex_trylock(&pool->manager_arb)) 1912 if (!mutex_trylock(&pool->manager_arb))
1922 return ret; 1913 return false;
1923 1914
1924 ret |= maybe_create_worker(pool); 1915 maybe_create_worker(pool);
1925 1916
1926 mutex_unlock(&pool->manager_arb); 1917 mutex_unlock(&pool->manager_arb);
1927 return ret; 1918 return true;
1928} 1919}
1929 1920
1930/** 1921/**
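With maybe_create_worker() returning void, manage_workers() reduces to trylock arbitration: whoever wins manager_arb performs the management and reports true, everyone else immediately reports false. A small pthread-based sketch of that shape (a hypothetical single-threaded demo, not the real worker code):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t manager_arb = PTHREAD_MUTEX_INITIALIZER;

	static void maybe_create_worker(void)
	{
		printf("managing the pool\n");   /* placeholder for the management work */
	}

	/*
	 * Shape of manage_workers() after the change: the return value only says
	 * "management ran" (true) or "someone else is the manager" (false), instead
	 * of tracking whether pool->lock was dropped along the way.
	 */
	static bool manage_workers(void)
	{
		if (pthread_mutex_trylock(&manager_arb) != 0)
			return false;                /* lost the arbitration, the winner manages */

		maybe_create_worker();

		pthread_mutex_unlock(&manager_arb);
		return true;
	}

	int main(void)
	{
		printf("first caller managed: %d\n", manage_workers());
		return 0;
	}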
@@ -2248,12 +2239,30 @@ repeat:
2248 * Slurp in all works issued via this workqueue and 2239 * Slurp in all works issued via this workqueue and
2249 * process'em. 2240 * process'em.
2250 */ 2241 */
2251 WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); 2242 WARN_ON_ONCE(!list_empty(scheduled));
2252 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2243 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2253 if (get_work_pwq(work) == pwq) 2244 if (get_work_pwq(work) == pwq)
2254 move_linked_works(work, scheduled, &n); 2245 move_linked_works(work, scheduled, &n);
2255 2246
2256 process_scheduled_works(rescuer); 2247 if (!list_empty(scheduled)) {
2248 process_scheduled_works(rescuer);
2249
2250 /*
2251 * The above execution of rescued work items could
2252 * have created more to rescue through
2253 * pwq_activate_first_delayed() or chained
2254 * queueing. Let's put @pwq back on mayday list so
2255 * that such back-to-back work items, which may be
2256 * being used to relieve memory pressure, don't
2257 * incur MAYDAY_INTERVAL delay inbetween.
2258 */
2259 if (need_to_create_worker(pool)) {
2260 spin_lock(&wq_mayday_lock);
2261 get_pwq(pwq);
2262 list_move_tail(&pwq->mayday_node, &wq->maydays);
2263 spin_unlock(&wq_mayday_lock);
2264 }
2265 }
2257 2266
2258 /* 2267 /*
2259 * Put the reference grabbed by send_mayday(). @pool won't 2268 * Put the reference grabbed by send_mayday(). @pool won't