Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 321
-rw-r--r--  kernel/audit.h | 7
-rw-r--r--  kernel/audit_fsnotify.c | 12
-rw-r--r--  kernel/audit_tree.c | 87
-rw-r--r--  kernel/audit_watch.c | 21
-rw-r--r--  kernel/auditfilter.c | 18
-rw-r--r--  kernel/auditsc.c | 11
-rw-r--r--  kernel/bpf/Makefile | 2
-rw-r--r--  kernel/bpf/arraymap.c | 137
-rw-r--r--  kernel/bpf/bpf_lru_list.c | 2
-rw-r--r--  kernel/bpf/cgroup.c | 5
-rw-r--r--  kernel/bpf/core.c | 21
-rw-r--r--  kernel/bpf/hashtab.c | 185
-rw-r--r--  kernel/bpf/inode.c | 2
-rw-r--r--  kernel/bpf/lpm_trie.c | 14
-rw-r--r--  kernel/bpf/map_in_map.c | 97
-rw-r--r--  kernel/bpf/map_in_map.h | 23
-rw-r--r--  kernel/bpf/stackmap.c | 14
-rw-r--r--  kernel/bpf/syscall.c | 194
-rw-r--r--  kernel/bpf/verifier.c | 364
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 7
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 20
-rw-r--r--  kernel/cgroup/cgroup.c | 40
-rw-r--r--  kernel/cgroup/cpuset.c | 11
-rw-r--r--  kernel/cgroup/namespace.c | 2
-rw-r--r--  kernel/compat.c | 10
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/crash_core.c | 439
-rw-r--r--  kernel/events/callchain.c | 6
-rw-r--r--  kernel/events/core.c | 139
-rw-r--r--  kernel/events/ring_buffer.c | 34
-rw-r--r--  kernel/fork.c | 52
-rw-r--r--  kernel/futex.c | 518
-rw-r--r--  kernel/gcov/base.c | 6
-rw-r--r--  kernel/gcov/gcc_4_7.c | 4
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hung_task.c | 8
-rw-r--r--  kernel/irq/chip.c | 7
-rw-r--r--  kernel/irq/manage.c | 21
-rw-r--r--  kernel/kcov.c | 9
-rw-r--r--  kernel/kexec_core.c | 431
-rw-r--r--  kernel/kprobes.c | 84
-rw-r--r--  kernel/ksysfs.c | 8
-rw-r--r--  kernel/livepatch/Makefile | 2
-rw-r--r--  kernel/livepatch/core.c | 450
-rw-r--r--  kernel/livepatch/core.h | 6
-rw-r--r--  kernel/livepatch/patch.c | 272
-rw-r--r--  kernel/livepatch/patch.h | 33
-rw-r--r--  kernel/livepatch/transition.c | 553
-rw-r--r--  kernel/livepatch/transition.h | 14
-rw-r--r--  kernel/locking/lockdep.c | 341
-rw-r--r--  kernel/locking/rtmutex-debug.c | 18
-rw-r--r--  kernel/locking/rtmutex-debug.h | 3
-rw-r--r--  kernel/locking/rtmutex.c | 390
-rw-r--r--  kernel/locking/rtmutex.h | 2
-rw-r--r--  kernel/locking/rtmutex_common.h | 25
-rw-r--r--  kernel/locking/rwsem.c | 6
-rw-r--r--  kernel/locking/test-ww_mutex.c | 29
-rw-r--r--  kernel/memremap.c | 22
-rw-r--r--  kernel/module.c | 51
-rw-r--r--  kernel/nsproxy.c | 3
-rw-r--r--  kernel/padata.c | 15
-rw-r--r--  kernel/params.c | 52
-rw-r--r--  kernel/pid.c | 4
-rw-r--r--  kernel/pid_namespace.c | 36
-rw-r--r--  kernel/power/process.c | 2
-rw-r--r--  kernel/power/snapshot.c | 5
-rw-r--r--  kernel/power/suspend.c | 29
-rw-r--r--  kernel/printk/braille.c | 15
-rw-r--r--  kernel/printk/braille.h | 13
-rw-r--r--  kernel/printk/printk.c | 111
-rw-r--r--  kernel/rcu/Makefile | 5
-rw-r--r--  kernel/rcu/rcu.h | 153
-rw-r--r--  kernel/rcu/rcu_segcblist.c | 505
-rw-r--r--  kernel/rcu/rcu_segcblist.h | 164
-rw-r--r--  kernel/rcu/rcutorture.c | 43
-rw-r--r--  kernel/rcu/srcu.c | 12
-rw-r--r--  kernel/rcu/srcutiny.c | 216
-rw-r--r--  kernel/rcu/srcutree.c | 1155
-rw-r--r--  kernel/rcu/tiny.c | 20
-rw-r--r--  kernel/rcu/tiny_plugin.h | 13
-rw-r--r--  kernel/rcu/tree.c | 772
-rw-r--r--  kernel/rcu/tree.h | 163
-rw-r--r--  kernel/rcu/tree_exp.h | 25
-rw-r--r--  kernel/rcu/tree_plugin.h | 64
-rw-r--r--  kernel/rcu/tree_trace.c | 26
-rw-r--r--  kernel/rcu/update.c | 53
-rw-r--r--  kernel/relay.c | 1
-rw-r--r--  kernel/sched/core.c | 294
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 85
-rw-r--r--  kernel/sched/cputime.c | 27
-rw-r--r--  kernel/sched/fair.c | 420
-rw-r--r--  kernel/sched/features.h | 7
-rw-r--r--  kernel/sched/idle.c | 6
-rw-r--r--  kernel/sched/rt.c | 81
-rw-r--r--  kernel/sched/sched-pelt.h | 13
-rw-r--r--  kernel/sched/sched.h | 76
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/stacktrace.c | 12
-rw-r--r--  kernel/sys.c | 33
-rw-r--r--  kernel/sysctl.c | 4
-rw-r--r--  kernel/taskstats.c | 14
-rw-r--r--  kernel/time/alarmtimer.c | 27
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/hrtimer.c | 17
-rw-r--r--  kernel/time/posix-clock.c | 10
-rw-r--r--  kernel/time/posix-cpu-timers.c | 77
-rw-r--r--  kernel/time/posix-stubs.c | 20
-rw-r--r--  kernel/time/posix-timers.c | 97
-rw-r--r--  kernel/time/sched_clock.c | 5
-rw-r--r--  kernel/time/tick-sched.c | 12
-rw-r--r--  kernel/time/time.c | 18
-rw-r--r--  kernel/time/timekeeping.c | 3
-rw-r--r--  kernel/time/timer.c | 4
-rw-r--r--  kernel/time/timer_list.c | 6
-rw-r--r--  kernel/trace/Kconfig | 5
-rw-r--r--  kernel/trace/blktrace.c | 39
-rw-r--r--  kernel/trace/bpf_trace.c | 32
-rw-r--r--  kernel/trace/ftrace.c | 1024
-rw-r--r--  kernel/trace/ring_buffer.c | 40
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 2
-rw-r--r--  kernel/trace/trace.c | 284
-rw-r--r--  kernel/trace/trace.h | 82
-rw-r--r--  kernel/trace/trace_benchmark.c | 14
-rw-r--r--  kernel/trace/trace_entries.h | 6
-rw-r--r--  kernel/trace/trace_events.c | 151
-rw-r--r--  kernel/trace/trace_functions.c | 227
-rw-r--r--  kernel/trace/trace_hwlat.c | 14
-rw-r--r--  kernel/trace/trace_kprobe.c | 53
-rw-r--r--  kernel/trace/trace_output.c | 9
-rw-r--r--  kernel/trace/trace_stack.c | 35
-rw-r--r--  kernel/workqueue.c | 28
134 files changed, 8915 insertions, 3800 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index b302b4731d16..72aa080f91f0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_CRASH_CORE) += crash_core.o
 obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
diff --git a/kernel/audit.c b/kernel/audit.c
index a871bf80fde1..4b7d49868ce1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,6 +58,8 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/gfp.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
 
 #include <linux/audit.h>
 
@@ -110,18 +112,19 @@ struct audit_net {
  * @pid: auditd PID
  * @portid: netlink portid
  * @net: the associated network namespace
- * @lock: spinlock to protect write access
+ * @rcu: RCU head
  *
  * Description:
  * This struct is RCU protected; you must either hold the RCU lock for reading
- * or the included spinlock for writing.
+ * or the associated spinlock for writing.
  */
 static struct auditd_connection {
-        int pid;
+        struct pid *pid;
         u32 portid;
         struct net *net;
-        spinlock_t lock;
-} auditd_conn;
+        struct rcu_head rcu;
+} *auditd_conn = NULL;
+static DEFINE_SPINLOCK(auditd_conn_lock);
 
 /* If audit_rate_limit is non-zero, limit the rate of sending audit records
  * to that number per second. This prevents DoS attacks, but results in
@@ -151,12 +154,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
 /* Hash for inode-based rules */
 struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
 
-/* The audit_freelist is a list of pre-allocated audit buffers (if more
- * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
- * being placed on the freelist). */
-static DEFINE_SPINLOCK(audit_freelist_lock);
-static int audit_freelist_count;
-static LIST_HEAD(audit_freelist);
+static struct kmem_cache *audit_buffer_cache;
 
 /* queue msgs to send via kauditd_task */
 static struct sk_buff_head audit_queue;
@@ -191,17 +189,12 @@ DEFINE_MUTEX(audit_cmd_mutex);
  * should be at least that large. */
 #define AUDIT_BUFSIZ 1024
 
-/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
- * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
-#define AUDIT_MAXFREE (2*NR_CPUS)
-
 /* The audit_buffer is used when formatting an audit record. The caller
  * locks briefly to get the record off the freelist or to allocate the
  * buffer, and locks briefly to send the buffer to the netlink layer or
  * to place it on a transmit queue. Multiple audit_buffers can be in
  * use simultaneously. */
 struct audit_buffer {
-        struct list_head list;
         struct sk_buff *skb;    /* formatted skb ready to send */
         struct audit_context *ctx;      /* NULL or associated context */
         gfp_t gfp_mask;
@@ -220,18 +213,42 @@ struct audit_reply {
  * Description:
  * Return 1 if the task is a registered audit daemon, 0 otherwise.
  */
-int auditd_test_task(const struct task_struct *task)
+int auditd_test_task(struct task_struct *task)
 {
         int rc;
+        struct auditd_connection *ac;
 
         rcu_read_lock();
-        rc = (auditd_conn.pid && task->tgid == auditd_conn.pid ? 1 : 0);
+        ac = rcu_dereference(auditd_conn);
+        rc = (ac && ac->pid == task_tgid(task) ? 1 : 0);
         rcu_read_unlock();
 
         return rc;
 }
 
 /**
+ * auditd_pid_vnr - Return the auditd PID relative to the namespace
+ *
+ * Description:
+ * Returns the PID in relation to the namespace, 0 on failure.
+ */
+static pid_t auditd_pid_vnr(void)
+{
+        pid_t pid;
+        const struct auditd_connection *ac;
+
+        rcu_read_lock();
+        ac = rcu_dereference(auditd_conn);
+        if (!ac || !ac->pid)
+                pid = 0;
+        else
+                pid = pid_vnr(ac->pid);
+        rcu_read_unlock();
+
+        return pid;
+}
+
+/**
  * audit_get_sk - Return the audit socket for the given network namespace
  * @net: the destination network namespace
  *
@@ -250,14 +267,6 @@ static struct sock *audit_get_sk(const struct net *net)
         return aunet->sk;
 }
 
-static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
-{
-        if (ab) {
-                struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
-                nlh->nlmsg_pid = portid;
-        }
-}
-
 void audit_panic(const char *message)
 {
         switch (audit_failure) {
@@ -427,6 +436,24 @@ static int audit_set_failure(u32 state)
 }
 
 /**
+ * auditd_conn_free - RCU helper to release an auditd connection struct
+ * @rcu: RCU head
+ *
+ * Description:
+ * Drop any references inside the auditd connection tracking struct and free
+ * the memory.
+ */
+static void auditd_conn_free(struct rcu_head *rcu)
+{
+        struct auditd_connection *ac;
+
+        ac = container_of(rcu, struct auditd_connection, rcu);
+        put_pid(ac->pid);
+        put_net(ac->net);
+        kfree(ac);
+}
+
+/**
  * auditd_set - Set/Reset the auditd connection state
  * @pid: auditd PID
  * @portid: auditd netlink portid
@@ -434,22 +461,33 @@ static int audit_set_failure(u32 state)
  *
  * Description:
  * This function will obtain and drop network namespace references as
- * necessary.
+ * necessary. Returns zero on success, negative values on failure.
  */
-static void auditd_set(int pid, u32 portid, struct net *net)
+static int auditd_set(struct pid *pid, u32 portid, struct net *net)
 {
         unsigned long flags;
+        struct auditd_connection *ac_old, *ac_new;
 
-        spin_lock_irqsave(&auditd_conn.lock, flags);
-        auditd_conn.pid = pid;
-        auditd_conn.portid = portid;
-        if (auditd_conn.net)
-                put_net(auditd_conn.net);
-        if (net)
-                auditd_conn.net = get_net(net);
-        else
-                auditd_conn.net = NULL;
-        spin_unlock_irqrestore(&auditd_conn.lock, flags);
+        if (!pid || !net)
+                return -EINVAL;
+
+        ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
+        if (!ac_new)
+                return -ENOMEM;
+        ac_new->pid = get_pid(pid);
+        ac_new->portid = portid;
+        ac_new->net = get_net(net);
+
+        spin_lock_irqsave(&auditd_conn_lock, flags);
+        ac_old = rcu_dereference_protected(auditd_conn,
+                                           lockdep_is_held(&auditd_conn_lock));
+        rcu_assign_pointer(auditd_conn, ac_new);
+        spin_unlock_irqrestore(&auditd_conn_lock, flags);
+
+        if (ac_old)
+                call_rcu(&ac_old->rcu, auditd_conn_free);
+
+        return 0;
 }
 
 /**
@@ -544,13 +582,19 @@ static void kauditd_retry_skb(struct sk_buff *skb)
  */
 static void auditd_reset(void)
 {
+        unsigned long flags;
         struct sk_buff *skb;
+        struct auditd_connection *ac_old;
 
         /* if it isn't already broken, break the connection */
-        rcu_read_lock();
-        if (auditd_conn.pid)
-                auditd_set(0, 0, NULL);
-        rcu_read_unlock();
+        spin_lock_irqsave(&auditd_conn_lock, flags);
+        ac_old = rcu_dereference_protected(auditd_conn,
+                                           lockdep_is_held(&auditd_conn_lock));
+        rcu_assign_pointer(auditd_conn, NULL);
+        spin_unlock_irqrestore(&auditd_conn_lock, flags);
+
+        if (ac_old)
+                call_rcu(&ac_old->rcu, auditd_conn_free);
 
         /* flush all of the main and retry queues to the hold queue */
         while ((skb = skb_dequeue(&audit_retry_queue)))
@@ -576,6 +620,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
         u32 portid;
         struct net *net;
         struct sock *sk;
+        struct auditd_connection *ac;
 
         /* NOTE: we can't call netlink_unicast while in the RCU section so
          * take a reference to the network namespace and grab local
@@ -585,15 +630,15 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
          * section netlink_unicast() should safely return an error */
 
         rcu_read_lock();
-        if (!auditd_conn.pid) {
+        ac = rcu_dereference(auditd_conn);
+        if (!ac) {
                 rcu_read_unlock();
                 rc = -ECONNREFUSED;
                 goto err;
         }
-        net = auditd_conn.net;
-        get_net(net);
+        net = get_net(ac->net);
         sk = audit_get_sk(net);
-        portid = auditd_conn.portid;
+        portid = ac->portid;
         rcu_read_unlock();
 
         rc = netlink_unicast(sk, skb, portid, 0);
@@ -728,6 +773,7 @@ static int kauditd_thread(void *dummy)
         u32 portid = 0;
         struct net *net = NULL;
         struct sock *sk = NULL;
+        struct auditd_connection *ac;
 
 #define UNICAST_RETRIES 5
 
@@ -735,14 +781,14 @@ static int kauditd_thread(void *dummy)
         while (!kthread_should_stop()) {
                 /* NOTE: see the lock comments in auditd_send_unicast_skb() */
                 rcu_read_lock();
-                if (!auditd_conn.pid) {
+                ac = rcu_dereference(auditd_conn);
+                if (!ac) {
                         rcu_read_unlock();
                         goto main_queue;
                 }
-                net = auditd_conn.net;
-                get_net(net);
+                net = get_net(ac->net);
                 sk = audit_get_sk(net);
-                portid = auditd_conn.portid;
+                portid = ac->portid;
                 rcu_read_unlock();
 
                 /* attempt to flush the hold queue */
@@ -816,7 +862,7 @@ int audit_send_list(void *_dest)
         return 0;
 }
 
-struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
+struct sk_buff *audit_make_reply(int seq, int type, int done,
                                  int multi, const void *payload, int size)
 {
         struct sk_buff *skb;
@@ -829,7 +875,7 @@ struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
         if (!skb)
                 return NULL;
 
-        nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+        nlh = nlmsg_put(skb, 0, seq, t, size, flags);
         if (!nlh)
                 goto out_kfree_skb;
         data = nlmsg_data(nlh);
@@ -873,7 +919,6 @@ static int audit_send_reply_thread(void *arg)
 static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
                              int multi, const void *payload, int size)
 {
-        u32 portid = NETLINK_CB(request_skb).portid;
         struct net *net = sock_net(NETLINK_CB(request_skb).sk);
         struct sk_buff *skb;
         struct task_struct *tsk;
@@ -883,12 +928,12 @@ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int
         if (!reply)
                 return;
 
-        skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
+        skb = audit_make_reply(seq, type, done, multi, payload, size);
         if (!skb)
                 goto out;
 
         reply->net = get_net(net);
-        reply->portid = portid;
+        reply->portid = NETLINK_CB(request_skb).portid;
         reply->skb = skb;
 
         tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -1068,11 +1113,13 @@ static int audit_set_feature(struct sk_buff *skb)
         return 0;
 }
 
-static int audit_replace(pid_t pid)
+static int audit_replace(struct pid *pid)
 {
+        pid_t pvnr;
         struct sk_buff *skb;
 
-        skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid));
+        pvnr = pid_vnr(pid);
+        skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr));
         if (!skb)
                 return -ENOMEM;
         return auditd_send_unicast_skb(skb);
@@ -1102,9 +1149,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                 memset(&s, 0, sizeof(s));
                 s.enabled = audit_enabled;
                 s.failure = audit_failure;
-                rcu_read_lock();
-                s.pid = auditd_conn.pid;
-                rcu_read_unlock();
+                /* NOTE: use pid_vnr() so the PID is relative to the current
+                 *       namespace */
+                s.pid = auditd_pid_vnr();
                 s.rate_limit = audit_rate_limit;
                 s.backlog_limit = audit_backlog_limit;
                 s.lost = atomic_read(&audit_lost);
@@ -1130,51 +1177,61 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                         return err;
                 }
                 if (s.mask & AUDIT_STATUS_PID) {
-                        /* NOTE: we are using task_tgid_vnr() below because
-                         *       the s.pid value is relative to the namespace
-                         *       of the caller; at present this doesn't matter
-                         *       much since you can really only run auditd
-                         *       from the initial pid namespace, but something
-                         *       to keep in mind if this changes */
-                        int new_pid = s.pid;
+                        /* NOTE: we are using the vnr PID functions below
+                         *       because the s.pid value is relative to the
+                         *       namespace of the caller; at present this
+                         *       doesn't matter much since you can really only
+                         *       run auditd from the initial pid namespace, but
+                         *       something to keep in mind if this changes */
+                        pid_t new_pid = s.pid;
                         pid_t auditd_pid;
-                        pid_t requesting_pid = task_tgid_vnr(current);
+                        struct pid *req_pid = task_tgid(current);
+
+                        /* sanity check - PID values must match */
+                        if (new_pid != pid_vnr(req_pid))
+                                return -EINVAL;
 
                         /* test the auditd connection */
-                        audit_replace(requesting_pid);
+                        audit_replace(req_pid);
 
-                        rcu_read_lock();
-                        auditd_pid = auditd_conn.pid;
+                        auditd_pid = auditd_pid_vnr();
                         /* only the current auditd can unregister itself */
-                        if ((!new_pid) && (requesting_pid != auditd_pid)) {
-                                rcu_read_unlock();
+                        if ((!new_pid) && (new_pid != auditd_pid)) {
                                 audit_log_config_change("audit_pid", new_pid,
                                                         auditd_pid, 0);
                                 return -EACCES;
                         }
                         /* replacing a healthy auditd is not allowed */
                         if (auditd_pid && new_pid) {
-                                rcu_read_unlock();
                                 audit_log_config_change("audit_pid", new_pid,
                                                         auditd_pid, 0);
                                 return -EEXIST;
                         }
-                        rcu_read_unlock();
-
-                        if (audit_enabled != AUDIT_OFF)
-                                audit_log_config_change("audit_pid", new_pid,
-                                                        auditd_pid, 1);
 
                         if (new_pid) {
                                 /* register a new auditd connection */
-                                auditd_set(new_pid,
+                                err = auditd_set(req_pid,
                                            NETLINK_CB(skb).portid,
                                            sock_net(NETLINK_CB(skb).sk));
+                                if (audit_enabled != AUDIT_OFF)
+                                        audit_log_config_change("audit_pid",
+                                                                new_pid,
+                                                                auditd_pid,
+                                                                err ? 0 : 1);
+                                if (err)
+                                        return err;
+
                                 /* try to process any backlog */
                                 wake_up_interruptible(&kauditd_wait);
-                        } else
+                        } else {
+                                if (audit_enabled != AUDIT_OFF)
+                                        audit_log_config_change("audit_pid",
+                                                                new_pid,
+                                                                auditd_pid, 1);
+
                                 /* unregister the auditd connection */
                                 auditd_reset();
+                        }
                 }
                 if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
                         err = audit_set_rate_limit(s.rate_limit);
@@ -1242,7 +1299,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                         size--;
                         audit_log_n_untrustedstring(ab, data, size);
                 }
-                audit_set_portid(ab, NETLINK_CB(skb).portid);
                 audit_log_end(ab);
         }
         break;
@@ -1256,8 +1312,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                         audit_log_end(ab);
                         return -EPERM;
                 }
-                err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
-                                        seq, data, nlmsg_len(nlh));
+                err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh));
                 break;
         case AUDIT_LIST_RULES:
                 err = audit_list_rules_send(skb, seq);
@@ -1378,11 +1433,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
         return err < 0 ? err : 0;
 }
 
-/*
- * Get message from skb. Each message is processed by audit_receive_msg.
- * Malformed skbs with wrong length are discarded silently.
+/**
+ * audit_receive - receive messages from a netlink control socket
+ * @skb: the message buffer
+ *
+ * Parse the provided skb and deal with any messages that may be present,
+ * malformed skbs are discarded.
  */
-static void audit_receive_skb(struct sk_buff *skb)
+static void audit_receive(struct sk_buff *skb)
 {
         struct nlmsghdr *nlh;
         /*
@@ -1395,21 +1453,15 @@ static void audit_receive_skb(struct sk_buff *skb)
         nlh = nlmsg_hdr(skb);
         len = skb->len;
 
+        mutex_lock(&audit_cmd_mutex);
         while (nlmsg_ok(nlh, len)) {
                 err = audit_receive_msg(skb, nlh);
                 /* if err or if this message says it wants a response */
                 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
-                        netlink_ack(skb, nlh, err);
+                        netlink_ack(skb, nlh, err, NULL);
 
                 nlh = nlmsg_next(nlh, &len);
         }
-}
-
-/* Receive messages from netlink socket. */
-static void audit_receive(struct sk_buff *skb)
-{
-        mutex_lock(&audit_cmd_mutex);
-        audit_receive_skb(skb);
         mutex_unlock(&audit_cmd_mutex);
 }
 
@@ -1447,10 +1499,11 @@ static void __net_exit audit_net_exit(struct net *net)
 {
         struct audit_net *aunet = net_generic(net, audit_net_id);
 
-        rcu_read_lock();
-        if (net == auditd_conn.net)
-                auditd_reset();
-        rcu_read_unlock();
+        /* NOTE: you would think that we would want to check the auditd
+         * connection and potentially reset it here if it lives in this
+         * namespace, but since the auditd connection tracking struct holds a
+         * reference to this namespace (see auditd_set()) we are only ever
+         * going to get here after that connection has been released */
 
         netlink_kernel_release(aunet->sk);
 }
@@ -1470,8 +1523,9 @@ static int __init audit_init(void)
         if (audit_initialized == AUDIT_DISABLED)
                 return 0;
 
-        memset(&auditd_conn, 0, sizeof(auditd_conn));
-        spin_lock_init(&auditd_conn.lock);
+        audit_buffer_cache = kmem_cache_create("audit_buffer",
+                                               sizeof(struct audit_buffer),
+                                               0, SLAB_PANIC, NULL);
 
         skb_queue_head_init(&audit_queue);
         skb_queue_head_init(&audit_retry_queue);
@@ -1538,60 +1592,33 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set);
 
 static void audit_buffer_free(struct audit_buffer *ab)
 {
-        unsigned long flags;
-
         if (!ab)
                 return;
 
         kfree_skb(ab->skb);
-        spin_lock_irqsave(&audit_freelist_lock, flags);
-        if (audit_freelist_count > AUDIT_MAXFREE)
-                kfree(ab);
-        else {
-                audit_freelist_count++;
-                list_add(&ab->list, &audit_freelist);
-        }
-        spin_unlock_irqrestore(&audit_freelist_lock, flags);
+        kmem_cache_free(audit_buffer_cache, ab);
 }
 
-static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
                                                 gfp_t gfp_mask, int type)
 {
-        unsigned long flags;
-        struct audit_buffer *ab = NULL;
-        struct nlmsghdr *nlh;
-
-        spin_lock_irqsave(&audit_freelist_lock, flags);
-        if (!list_empty(&audit_freelist)) {
-                ab = list_entry(audit_freelist.next,
-                                struct audit_buffer, list);
-                list_del(&ab->list);
-                --audit_freelist_count;
-        }
-        spin_unlock_irqrestore(&audit_freelist_lock, flags);
-
-        if (!ab) {
-                ab = kmalloc(sizeof(*ab), gfp_mask);
-                if (!ab)
-                        goto err;
-        }
+        struct audit_buffer *ab;
 
-        ab->ctx = ctx;
-        ab->gfp_mask = gfp_mask;
+        ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask);
+        if (!ab)
+                return NULL;
 
         ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
         if (!ab->skb)
                 goto err;
+        if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+                goto err;
 
-        nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
-        if (!nlh)
-                goto out_kfree_skb;
+        ab->ctx = ctx;
+        ab->gfp_mask = gfp_mask;
 
         return ab;
 
-out_kfree_skb:
-        kfree_skb(ab->skb);
-        ab->skb = NULL;
 err:
         audit_buffer_free(ab);
         return NULL;
@@ -1622,10 +1649,10 @@ unsigned int audit_serial(void)
 }
 
 static inline void audit_get_stamp(struct audit_context *ctx,
-                                   struct timespec *t, unsigned int *serial)
+                                   struct timespec64 *t, unsigned int *serial)
 {
         if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
-                *t = CURRENT_TIME;
+                ktime_get_real_ts64(t);
                 *serial = audit_serial();
         }
 }
@@ -1649,7 +1676,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
                                      int type)
 {
         struct audit_buffer *ab;
-        struct timespec t;
+        struct timespec64 t;
         unsigned int uninitialized_var(serial);
 
         if (audit_initialized != AUDIT_INITIALIZED)
@@ -1702,8 +1729,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
         }
 
         audit_get_stamp(ab->ctx, &t, &serial);
-        audit_log_format(ab, "audit(%lu.%03lu:%u): ",
-                         t.tv_sec, t.tv_nsec/1000000, serial);
+        audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+                         (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
 
         return ab;
 }
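The audit.c hunks above replace the spinlock-protected static auditd_conn struct with an RCU-managed pointer: readers use rcu_read_lock()/rcu_dereference(), the writer publishes a replacement under a dedicated spinlock with rcu_assign_pointer(), and the old object is released through call_rcu(). Below is a minimal, self-contained sketch of that pattern; the names (my_conn, conn_lock, conn_set, conn_portid) are illustrative only and are not part of the patch.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct conn {
        u32 portid;
        struct rcu_head rcu;
};

static struct conn __rcu *my_conn;      /* RCU-managed singleton */
static DEFINE_SPINLOCK(conn_lock);      /* serializes writers only */

static void conn_free(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct conn, rcu));
}

/* writer: publish a new object and defer freeing the old one */
static int conn_set(u32 portid)
{
        struct conn *ac_new, *ac_old;

        ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
        if (!ac_new)
                return -ENOMEM;
        ac_new->portid = portid;

        spin_lock(&conn_lock);
        ac_old = rcu_dereference_protected(my_conn,
                                           lockdep_is_held(&conn_lock));
        rcu_assign_pointer(my_conn, ac_new);
        spin_unlock(&conn_lock);

        if (ac_old)
                call_rcu(&ac_old->rcu, conn_free);
        return 0;
}

/* reader: dereference only inside an RCU read-side critical section */
static u32 conn_portid(void)
{
        struct conn *c;
        u32 portid = 0;

        rcu_read_lock();
        c = rcu_dereference(my_conn);
        if (c)
                portid = c->portid;
        rcu_read_unlock();
        return portid;
}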
diff --git a/kernel/audit.h b/kernel/audit.h
index 0d87f8ab8778..ddfce2ea4891 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -112,7 +112,7 @@ struct audit_context {
         enum audit_state state, current_state;
         unsigned int serial;    /* serial number for record */
         int major;              /* syscall number */
-        struct timespec ctime;  /* time of syscall entry */
+        struct timespec64 ctime;        /* time of syscall entry */
         unsigned long argv[4];  /* syscall arguments */
         long return_code;/* syscall return code */
         u64 prio;
@@ -218,7 +218,7 @@ extern void audit_log_name(struct audit_context *context,
                            struct audit_names *n, const struct path *path,
                            int record_num, int *call_panic);
 
-extern int auditd_test_task(const struct task_struct *task);
+extern int auditd_test_task(struct task_struct *task);
 
 #define AUDIT_INODE_BUCKETS 32
 extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -237,8 +237,7 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
 extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
 extern int parent_len(const char *path);
 extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
-extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
-                                        int done, int multi,
+extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,
                                         const void *payload, int size);
 extern void audit_panic(const char *message);
 
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 7ea57e516029..52f368b6561e 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -103,15 +103,15 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
                 goto out;
         }
 
-        fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
+        fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group);
         audit_mark->mark.mask = AUDIT_FS_EVENTS;
         audit_mark->path = pathname;
         audit_update_mark(audit_mark, dentry->d_inode);
         audit_mark->rule = krule;
 
-        ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
+        ret = fsnotify_add_mark(&audit_mark->mark, inode, NULL, true);
         if (ret < 0) {
-                audit_fsnotify_mark_free(audit_mark);
+                fsnotify_put_mark(&audit_mark->mark);
                 audit_mark = ERR_PTR(ret);
         }
 out:
@@ -168,7 +168,8 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
                                    u32 mask, const void *data, int data_type,
-                                   const unsigned char *dname, u32 cookie)
+                                   const unsigned char *dname, u32 cookie,
+                                   struct fsnotify_iter_info *iter_info)
 {
         struct audit_fsnotify_mark *audit_mark;
         const struct inode *inode = NULL;
@@ -187,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
         default:
                 BUG();
                 return 0;
-        };
+        }
 
         if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
                 if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
@@ -201,6 +202,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
 
 static const struct fsnotify_ops audit_mark_fsnotify_ops = {
         .handle_event = audit_mark_handle_event,
+        .free_mark = audit_fsnotify_free_mark,
 };
 
 static int __init audit_fsnotify_init(void)
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7b44195da81b..011d46e5f73f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,13 +3,14 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/kthread.h>
+#include <linux/refcount.h>
 #include <linux/slab.h>
 
 struct audit_tree;
 struct audit_chunk;
 
 struct audit_tree {
-        atomic_t count;
+        refcount_t count;
         int goner;
         struct audit_chunk *root;
         struct list_head chunks;
@@ -77,7 +78,7 @@ static struct audit_tree *alloc_tree(const char *s)
 
         tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
         if (tree) {
-                atomic_set(&tree->count, 1);
+                refcount_set(&tree->count, 1);
                 tree->goner = 0;
                 INIT_LIST_HEAD(&tree->chunks);
                 INIT_LIST_HEAD(&tree->rules);
@@ -91,12 +92,12 @@ static struct audit_tree *alloc_tree(const char *s)
 
 static inline void get_tree(struct audit_tree *tree)
 {
-        atomic_inc(&tree->count);
+        refcount_inc(&tree->count);
 }
 
 static inline void put_tree(struct audit_tree *tree)
 {
-        if (atomic_dec_and_test(&tree->count))
+        if (refcount_dec_and_test(&tree->count))
                 kfree_rcu(tree, head);
 }
 
@@ -154,7 +155,7 @@ static struct audit_chunk *alloc_chunk(int count)
                 INIT_LIST_HEAD(&chunk->owners[i].list);
                 chunk->owners[i].index = i;
         }
-        fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+        fsnotify_init_mark(&chunk->mark, audit_tree_group);
         chunk->mark.mask = FS_IN_IGNORED;
         return chunk;
 }
@@ -163,33 +164,54 @@ enum {HASH_SIZE = 128};
 static struct list_head chunk_hash_heads[HASH_SIZE];
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
 
-static inline struct list_head *chunk_hash(const struct inode *inode)
+/* Function to return search key in our hash from inode. */
+static unsigned long inode_to_key(const struct inode *inode)
 {
-        unsigned long n = (unsigned long)inode / L1_CACHE_BYTES;
+        return (unsigned long)inode;
+}
+
+/*
+ * Function to return search key in our hash from chunk. Key 0 is special and
+ * should never be present in the hash.
+ */
+static unsigned long chunk_to_key(struct audit_chunk *chunk)
+{
+        /*
+         * We have a reference to the mark so it should be attached to a
+         * connector.
+         */
+        if (WARN_ON_ONCE(!chunk->mark.connector))
+                return 0;
+        return (unsigned long)chunk->mark.connector->inode;
+}
+
+static inline struct list_head *chunk_hash(unsigned long key)
+{
+        unsigned long n = key / L1_CACHE_BYTES;
         return chunk_hash_heads + n % HASH_SIZE;
 }
 
 /* hash_lock & entry->lock is held by caller */
 static void insert_hash(struct audit_chunk *chunk)
 {
-        struct fsnotify_mark *entry = &chunk->mark;
+        unsigned long key = chunk_to_key(chunk);
         struct list_head *list;
 
-        if (!entry->inode)
+        if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED))
                 return;
-        list = chunk_hash(entry->inode);
+        list = chunk_hash(key);
         list_add_rcu(&chunk->hash, list);
 }
 
 /* called under rcu_read_lock */
 struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 {
-        struct list_head *list = chunk_hash(inode);
+        unsigned long key = inode_to_key(inode);
+        struct list_head *list = chunk_hash(key);
         struct audit_chunk *p;
 
         list_for_each_entry_rcu(p, list, hash) {
-                /* mark.inode may have gone NULL, but who cares? */
-                if (p->mark.inode == inode) {
+                if (chunk_to_key(p) == key) {
                         atomic_long_inc(&p->refs);
                         return p;
                 }
@@ -233,11 +255,15 @@ static void untag_chunk(struct node *p)
 
         mutex_lock(&entry->group->mark_mutex);
         spin_lock(&entry->lock);
-        if (chunk->dead || !entry->inode) {
+        /*
+         * mark_mutex protects mark from getting detached and thus also from
+         * mark->connector->inode getting NULL.
+         */
+        if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                 spin_unlock(&entry->lock);
                 mutex_unlock(&entry->group->mark_mutex);
                 if (new)
-                        free_chunk(new);
+                        fsnotify_put_mark(&new->mark);
                 goto out;
         }
 
@@ -261,7 +287,7 @@ static void untag_chunk(struct node *p)
         if (!new)
                 goto Fallback;
 
-        if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode,
+        if (fsnotify_add_mark_locked(&new->mark, entry->connector->inode,
                                      NULL, 1)) {
                 fsnotify_put_mark(&new->mark);
                 goto Fallback;
@@ -327,7 +353,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
                 return -ENOMEM;
 
         entry = &chunk->mark;
-        if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
+        if (fsnotify_add_mark(entry, inode, NULL, 0)) {
                 fsnotify_put_mark(entry);
                 return -ENOSPC;
         }
@@ -366,7 +392,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
         struct node *p;
         int n;
 
-        old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+        old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks,
+                                       audit_tree_group);
         if (!old_entry)
                 return create_chunk(inode, tree);
 
@@ -393,17 +420,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 
         mutex_lock(&old_entry->group->mark_mutex);
         spin_lock(&old_entry->lock);
-        if (!old_entry->inode) {
+        /*
+         * mark_mutex protects mark from getting detached and thus also from
+         * mark->connector->inode getting NULL.
+         */
+        if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                 /* old_entry is being shot, lets just lie */
                 spin_unlock(&old_entry->lock);
                 mutex_unlock(&old_entry->group->mark_mutex);
                 fsnotify_put_mark(old_entry);
-                free_chunk(chunk);
+                fsnotify_put_mark(&chunk->mark);
                 return -ENOENT;
         }
 
-        if (fsnotify_add_mark_locked(chunk_entry, old_entry->group,
-                                     old_entry->inode, NULL, 1)) {
+        if (fsnotify_add_mark_locked(chunk_entry,
+                                     old_entry->connector->inode, NULL, 1)) {
                 spin_unlock(&old_entry->lock);
                 mutex_unlock(&old_entry->group->mark_mutex);
                 fsnotify_put_mark(chunk_entry);
@@ -588,7 +619,8 @@ int audit_remove_tree_rule(struct audit_krule *rule)
 
 static int compare_root(struct vfsmount *mnt, void *arg)
 {
-        return d_backing_inode(mnt->mnt_root) == arg;
+        return inode_to_key(d_backing_inode(mnt->mnt_root)) ==
+               (unsigned long)arg;
 }
 
 void audit_trim_trees(void)
@@ -623,9 +655,10 @@ void audit_trim_trees(void)
                 list_for_each_entry(node, &tree->chunks, list) {
                         struct audit_chunk *chunk = find_chunk(node);
                         /* this could be NULL if the watch is dying else where... */
-                        struct inode *inode = chunk->mark.inode;
                         node->index |= 1U<<31;
-                        if (iterate_mounts(compare_root, inode, root_mnt))
+                        if (iterate_mounts(compare_root,
+                                           (void *)chunk_to_key(chunk),
+                                           root_mnt))
                                 node->index &= ~(1U<<31);
                 }
                 spin_unlock(&hash_lock);
@@ -958,7 +991,8 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
                                    u32 mask, const void *data, int data_type,
-                                   const unsigned char *file_name, u32 cookie)
+                                   const unsigned char *file_name, u32 cookie,
+                                   struct fsnotify_iter_info *iter_info)
 {
         return 0;
 }
@@ -979,6 +1013,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 static const struct fsnotify_ops audit_tree_ops = {
         .handle_event = audit_tree_handle_event,
         .freeing_mark = audit_tree_freeing_mark,
+        .free_mark = audit_tree_destroy_watch,
 };
 
 static int __init audit_tree_init(void)
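The audit_tree.c hunks above (and the audit_watch.c hunks below) convert open-coded atomic_t reference counts to refcount_t, which saturates on overflow instead of wrapping. A tiny sketch of the conversion on an illustrative object type (not taken from the patch):

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
        refcount_t count;       /* was: atomic_t count */
};

static struct obj *obj_alloc(void)
{
        struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

        if (o)
                refcount_set(&o->count, 1);     /* was: atomic_set() */
        return o;
}

static void obj_get(struct obj *o)
{
        refcount_inc(&o->count);                /* was: atomic_inc() */
}

static void obj_put(struct obj *o)
{
        if (refcount_dec_and_test(&o->count))   /* was: atomic_dec_and_test() */
                kfree(o);
}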
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f79e4658433d..62d686d96581 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -28,6 +28,7 @@
 #include <linux/fsnotify_backend.h>
 #include <linux/namei.h>
 #include <linux/netlink.h>
+#include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/security.h>
@@ -46,7 +47,7 @@
  */
 
 struct audit_watch {
-        atomic_t count;         /* reference count */
+        refcount_t count;       /* reference count */
         dev_t dev;              /* associated superblock device */
         char *path;             /* insertion path */
         unsigned long ino;      /* associated inode number */
@@ -102,7 +103,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
         struct audit_parent *parent = NULL;
         struct fsnotify_mark *entry;
 
-        entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+        entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group);
         if (entry)
                 parent = container_of(entry, struct audit_parent, mark);
 
@@ -111,12 +112,12 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
 
 void audit_get_watch(struct audit_watch *watch)
 {
-        atomic_inc(&watch->count);
+        refcount_inc(&watch->count);
 }
 
 void audit_put_watch(struct audit_watch *watch)
 {
-        if (atomic_dec_and_test(&watch->count)) {
+        if (refcount_dec_and_test(&watch->count)) {
                 WARN_ON(watch->parent);
                 WARN_ON(!list_empty(&watch->rules));
                 kfree(watch->path);
@@ -157,9 +158,9 @@ static struct audit_parent *audit_init_parent(struct path *path)
 
         INIT_LIST_HEAD(&parent->watches);
 
-        fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+        fsnotify_init_mark(&parent->mark, audit_watch_group);
         parent->mark.mask = AUDIT_FS_WATCH;
-        ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+        ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0);
         if (ret < 0) {
                 audit_free_parent(parent);
                 return ERR_PTR(ret);
@@ -178,7 +179,7 @@ static struct audit_watch *audit_init_watch(char *path)
                 return ERR_PTR(-ENOMEM);
 
         INIT_LIST_HEAD(&watch->rules);
-        atomic_set(&watch->count, 1);
+        refcount_set(&watch->count, 1);
         watch->path = path;
         watch->dev = AUDIT_DEV_UNSET;
         watch->ino = AUDIT_INO_UNSET;
@@ -472,7 +473,8 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
                                     struct fsnotify_mark *inode_mark,
                                     struct fsnotify_mark *vfsmount_mark,
                                     u32 mask, const void *data, int data_type,
-                                    const unsigned char *dname, u32 cookie)
+                                    const unsigned char *dname, u32 cookie,
+                                    struct fsnotify_iter_info *iter_info)
 {
         const struct inode *inode;
         struct audit_parent *parent;
@@ -492,7 +494,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
                 BUG();
                 inode = NULL;
                 break;
-        };
+        }
 
         if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
                 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
@@ -506,6 +508,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 
 static const struct fsnotify_ops audit_watch_fsnotify_ops = {
         .handle_event = audit_watch_handle_event,
+        .free_mark = audit_watch_free_mark,
 };
 
 static int __init audit_watch_init(void)
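The audit_fsnotify.c, audit_tree.c and audit_watch.c hunks all track the same fsnotify API change: fsnotify_init_mark() now takes the group instead of a destructor, fsnotify_add_mark() loses its group argument, and the destructor moves into the group's fsnotify_ops as .free_mark. A condensed sketch of the new calling convention, with hypothetical example_* names (group creation and event handling omitted):

#include <linux/fsnotify_backend.h>
#include <linux/slab.h>

/* assumed created elsewhere, e.g. fsnotify_alloc_group(&example_ops) */
static struct fsnotify_group *example_group;

static void example_free_mark(struct fsnotify_mark *mark)
{
        kfree(mark);
}

static const struct fsnotify_ops example_ops = {
        /* destructor now lives in the ops table, not in fsnotify_init_mark() */
        .free_mark = example_free_mark,
};

static int example_watch_inode(struct inode *inode)
{
        struct fsnotify_mark *mark;
        int ret;

        mark = kzalloc(sizeof(*mark), GFP_KERNEL);
        if (!mark)
                return -ENOMEM;

        fsnotify_init_mark(mark, example_group);        /* group, not a destructor */
        ret = fsnotify_add_mark(mark, inode, NULL, 0);  /* no group argument */
        if (ret < 0)
                fsnotify_put_mark(mark);        /* ->free_mark runs once the last ref drops */
        return ret;
}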
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 880519d6cf2a..0b0aa5854dac 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -338,7 +338,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
                     entry->rule.listnr != AUDIT_FILTER_USER)
                         return -EINVAL;
                 break;
-        };
+        }
 
         switch(f->type) {
         default:
@@ -412,7 +412,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
                 if (entry->rule.listnr != AUDIT_FILTER_EXIT)
                         return -EINVAL;
                 break;
-        };
+        }
         return 0;
 }
 
@@ -1033,7 +1033,7 @@ out:
 }
 
 /* List rules using struct audit_rule_data. */
-static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
+static void audit_list_rules(int seq, struct sk_buff_head *q)
 {
         struct sk_buff *skb;
         struct audit_krule *r;
@@ -1048,15 +1048,15 @@ static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
                         data = audit_krule_to_data(r);
                         if (unlikely(!data))
                                 break;
-                        skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
-                                               0, 1, data,
+                        skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
+                                               data,
                                                sizeof(*data) + data->buflen);
                         if (skb)
                                 skb_queue_tail(q, skb);
                         kfree(data);
                 }
         }
-        skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+        skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
         if (skb)
                 skb_queue_tail(q, skb);
 }
@@ -1085,13 +1085,11 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
 /**
  * audit_rule_change - apply all rules to the specified message type
  * @type: audit message type
- * @portid: target port id for netlink audit messages
  * @seq: netlink audit message sequence (serial) number
  * @data: payload data
  * @datasz: size of payload data
  */
-int audit_rule_change(int type, __u32 portid, int seq, void *data,
-                      size_t datasz)
+int audit_rule_change(int type, int seq, void *data, size_t datasz)
 {
         int err = 0;
         struct audit_entry *entry;
@@ -1150,7 +1148,7 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq)
         skb_queue_head_init(&dest->q);
 
         mutex_lock(&audit_filter_mutex);
-        audit_list_rules(portid, seq, &dest->q);
+        audit_list_rules(seq, &dest->q);
         mutex_unlock(&audit_filter_mutex);
 
         tsk = kthread_run(audit_send_list, dest, "audit_send_list");
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c2333155893..bb724baa7ac9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <linux/fsnotify_backend.h>
 #include <uapi/linux/limits.h>
 
 #include "audit.h"
@@ -1532,7 +1533,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
                 return;
 
         context->serial = 0;
-        context->ctime = CURRENT_TIME;
+        ktime_get_real_ts64(&context->ctime);
         context->in_syscall = 1;
         context->current_state = state;
         context->ppid = 0;
@@ -1596,7 +1597,7 @@ static inline void handle_one(const struct inode *inode)
         struct audit_tree_refs *p;
         struct audit_chunk *chunk;
         int count;
-        if (likely(hlist_empty(&inode->i_fsnotify_marks)))
+        if (likely(!inode->i_fsnotify_marks))
                 return;
         context = current->audit_context;
         p = context->trees;
@@ -1639,7 +1640,7 @@ retry:
1639 seq = read_seqbegin(&rename_lock); 1640 seq = read_seqbegin(&rename_lock);
1640 for(;;) { 1641 for(;;) {
1641 struct inode *inode = d_backing_inode(d); 1642 struct inode *inode = d_backing_inode(d);
1642 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { 1643 if (inode && unlikely(inode->i_fsnotify_marks)) {
1643 struct audit_chunk *chunk; 1644 struct audit_chunk *chunk;
1644 chunk = audit_tree_lookup(inode); 1645 chunk = audit_tree_lookup(inode);
1645 if (chunk) { 1646 if (chunk) {
@@ -1941,13 +1942,13 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
1941/** 1942/**
1942 * auditsc_get_stamp - get local copies of audit_context values 1943 * auditsc_get_stamp - get local copies of audit_context values
1943 * @ctx: audit_context for the task 1944 * @ctx: audit_context for the task
1944 * @t: timespec to store time recorded in the audit_context 1945 * @t: timespec64 to store time recorded in the audit_context
1945 * @serial: serial value that is recorded in the audit_context 1946 * @serial: serial value that is recorded in the audit_context
1946 * 1947 *
1947 * Also sets the context as auditable. 1948 * Also sets the context as auditable.
1948 */ 1949 */
1949int auditsc_get_stamp(struct audit_context *ctx, 1950int auditsc_get_stamp(struct audit_context *ctx,
1950 struct timespec *t, unsigned int *serial) 1951 struct timespec64 *t, unsigned int *serial)
1951{ 1952{
1952 if (!ctx->in_syscall) 1953 if (!ctx->in_syscall)
1953 return 0; 1954 return 0;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1ce4f4fd7fd..e1e5e658f2db 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
1obj-y := core.o 1obj-y := core.o
2 2
3obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o 3obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
4obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o 4obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
5ifeq ($(CONFIG_PERF_EVENTS),y) 5ifeq ($(CONFIG_PERF_EVENTS),y)
6obj-$(CONFIG_BPF_SYSCALL) += stackmap.o 6obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
7endif 7endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 6b6f41f0b211..5e00b2333c26 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,4 +1,5 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016,2017 Facebook
2 * 3 *
3 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public 5 * modify it under the terms of version 2 of the GNU General Public
@@ -16,6 +17,8 @@
16#include <linux/filter.h> 17#include <linux/filter.h>
17#include <linux/perf_event.h> 18#include <linux/perf_event.h>
18 19
20#include "map_in_map.h"
21
19static void bpf_array_free_percpu(struct bpf_array *array) 22static void bpf_array_free_percpu(struct bpf_array *array)
20{ 23{
21 int i; 24 int i;
@@ -113,6 +116,30 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
113 return array->value + array->elem_size * index; 116 return array->value + array->elem_size * index;
114} 117}
115 118
119/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
120static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
121{
122 struct bpf_insn *insn = insn_buf;
123 u32 elem_size = round_up(map->value_size, 8);
124 const int ret = BPF_REG_0;
125 const int map_ptr = BPF_REG_1;
126 const int index = BPF_REG_2;
127
128 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
129 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
130 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
131
132 if (is_power_of_2(elem_size)) {
133 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
134 } else {
135 *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
136 }
137 *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
138 *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
139 *insn++ = BPF_MOV64_IMM(ret, 0);
140 return insn - insn_buf;
141}
142
116/* Called from eBPF program */ 143/* Called from eBPF program */
117static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) 144static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
118{ 145{
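
The generated sequence above mirrors array_map_lookup_elem() so the verifier can later splice it straight into the program instead of emitting a helper call. A minimal C rendering of what those instructions compute (illustration only, not part of the patch; "base" stands for the map_ptr register after the offsetof(struct bpf_array, value) adjustment):

#include <stdint.h>
#include <stddef.h>

static void *inlined_array_lookup(char *base, uint32_t max_entries,
				  uint32_t elem_size, const uint32_t *key)
{
	uint32_t index = *key;		/* BPF_LDX_MEM(BPF_W, ret, index, 0) */

	if (index >= max_entries)	/* BPF_JMP_IMM(BPF_JGE, ...) jumps    */
		return NULL;		/* ... to BPF_MOV64_IMM(ret, 0)       */

	/* power-of-2 element sizes take the LSH branch, others the MUL */
	return base + (uint64_t)index * elem_size;	/* BPF_ADD of map_ptr */
}

int main(void)
{
	char storage[4 * 8] = { 0 };
	uint32_t key = 2;

	return inlined_array_lookup(storage, 4, 8, &key) ? 0 : 1;
}
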
@@ -155,7 +182,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
155static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 182static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
156{ 183{
157 struct bpf_array *array = container_of(map, struct bpf_array, map); 184 struct bpf_array *array = container_of(map, struct bpf_array, map);
158 u32 index = *(u32 *)key; 185 u32 index = key ? *(u32 *)key : U32_MAX;
159 u32 *next = (u32 *)next_key; 186 u32 *next = (u32 *)next_key;
160 187
161 if (index >= array->map.max_entries) { 188 if (index >= array->map.max_entries) {
@@ -260,21 +287,17 @@ static void array_map_free(struct bpf_map *map)
260 bpf_map_area_free(array); 287 bpf_map_area_free(array);
261} 288}
262 289
263static const struct bpf_map_ops array_ops = { 290const struct bpf_map_ops array_map_ops = {
264 .map_alloc = array_map_alloc, 291 .map_alloc = array_map_alloc,
265 .map_free = array_map_free, 292 .map_free = array_map_free,
266 .map_get_next_key = array_map_get_next_key, 293 .map_get_next_key = array_map_get_next_key,
267 .map_lookup_elem = array_map_lookup_elem, 294 .map_lookup_elem = array_map_lookup_elem,
268 .map_update_elem = array_map_update_elem, 295 .map_update_elem = array_map_update_elem,
269 .map_delete_elem = array_map_delete_elem, 296 .map_delete_elem = array_map_delete_elem,
297 .map_gen_lookup = array_map_gen_lookup,
270}; 298};
271 299
272static struct bpf_map_type_list array_type __ro_after_init = { 300const struct bpf_map_ops percpu_array_map_ops = {
273 .ops = &array_ops,
274 .type = BPF_MAP_TYPE_ARRAY,
275};
276
277static const struct bpf_map_ops percpu_array_ops = {
278 .map_alloc = array_map_alloc, 301 .map_alloc = array_map_alloc,
279 .map_free = array_map_free, 302 .map_free = array_map_free,
280 .map_get_next_key = array_map_get_next_key, 303 .map_get_next_key = array_map_get_next_key,
@@ -283,19 +306,6 @@ static const struct bpf_map_ops percpu_array_ops = {
283 .map_delete_elem = array_map_delete_elem, 306 .map_delete_elem = array_map_delete_elem,
284}; 307};
285 308
286static struct bpf_map_type_list percpu_array_type __ro_after_init = {
287 .ops = &percpu_array_ops,
288 .type = BPF_MAP_TYPE_PERCPU_ARRAY,
289};
290
291static int __init register_array_map(void)
292{
293 bpf_register_map_type(&array_type);
294 bpf_register_map_type(&percpu_array_type);
295 return 0;
296}
297late_initcall(register_array_map);
298
299static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) 309static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
300{ 310{
301 /* only file descriptors can be stored in this type of map */ 311 /* only file descriptors can be stored in this type of map */
@@ -399,7 +409,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
399 fd_array_map_delete_elem(map, &i); 409 fd_array_map_delete_elem(map, &i);
400} 410}
401 411
402static const struct bpf_map_ops prog_array_ops = { 412const struct bpf_map_ops prog_array_map_ops = {
403 .map_alloc = fd_array_map_alloc, 413 .map_alloc = fd_array_map_alloc,
404 .map_free = fd_array_map_free, 414 .map_free = fd_array_map_free,
405 .map_get_next_key = array_map_get_next_key, 415 .map_get_next_key = array_map_get_next_key,
@@ -409,18 +419,6 @@ static const struct bpf_map_ops prog_array_ops = {
409 .map_fd_put_ptr = prog_fd_array_put_ptr, 419 .map_fd_put_ptr = prog_fd_array_put_ptr,
410}; 420};
411 421
412static struct bpf_map_type_list prog_array_type __ro_after_init = {
413 .ops = &prog_array_ops,
414 .type = BPF_MAP_TYPE_PROG_ARRAY,
415};
416
417static int __init register_prog_array_map(void)
418{
419 bpf_register_map_type(&prog_array_type);
420 return 0;
421}
422late_initcall(register_prog_array_map);
423
424static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, 422static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
425 struct file *map_file) 423 struct file *map_file)
426{ 424{
@@ -511,7 +509,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
511 rcu_read_unlock(); 509 rcu_read_unlock();
512} 510}
513 511
514static const struct bpf_map_ops perf_event_array_ops = { 512const struct bpf_map_ops perf_event_array_map_ops = {
515 .map_alloc = fd_array_map_alloc, 513 .map_alloc = fd_array_map_alloc,
516 .map_free = fd_array_map_free, 514 .map_free = fd_array_map_free,
517 .map_get_next_key = array_map_get_next_key, 515 .map_get_next_key = array_map_get_next_key,
@@ -522,18 +520,6 @@ static const struct bpf_map_ops perf_event_array_ops = {
522 .map_release = perf_event_fd_array_release, 520 .map_release = perf_event_fd_array_release,
523}; 521};
524 522
525static struct bpf_map_type_list perf_event_array_type __ro_after_init = {
526 .ops = &perf_event_array_ops,
527 .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
528};
529
530static int __init register_perf_event_array_map(void)
531{
532 bpf_register_map_type(&perf_event_array_type);
533 return 0;
534}
535late_initcall(register_perf_event_array_map);
536
537#ifdef CONFIG_CGROUPS 523#ifdef CONFIG_CGROUPS
538static void *cgroup_fd_array_get_ptr(struct bpf_map *map, 524static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
539 struct file *map_file /* not used */, 525 struct file *map_file /* not used */,
@@ -554,7 +540,7 @@ static void cgroup_fd_array_free(struct bpf_map *map)
554 fd_array_map_free(map); 540 fd_array_map_free(map);
555} 541}
556 542
557static const struct bpf_map_ops cgroup_array_ops = { 543const struct bpf_map_ops cgroup_array_map_ops = {
558 .map_alloc = fd_array_map_alloc, 544 .map_alloc = fd_array_map_alloc,
559 .map_free = cgroup_fd_array_free, 545 .map_free = cgroup_fd_array_free,
560 .map_get_next_key = array_map_get_next_key, 546 .map_get_next_key = array_map_get_next_key,
@@ -563,16 +549,53 @@ static const struct bpf_map_ops cgroup_array_ops = {
563 .map_fd_get_ptr = cgroup_fd_array_get_ptr, 549 .map_fd_get_ptr = cgroup_fd_array_get_ptr,
564 .map_fd_put_ptr = cgroup_fd_array_put_ptr, 550 .map_fd_put_ptr = cgroup_fd_array_put_ptr,
565}; 551};
552#endif
566 553
567static struct bpf_map_type_list cgroup_array_type __ro_after_init = { 554static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
568 .ops = &cgroup_array_ops, 555{
569 .type = BPF_MAP_TYPE_CGROUP_ARRAY, 556 struct bpf_map *map, *inner_map_meta;
570}; 557
558 inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
559 if (IS_ERR(inner_map_meta))
560 return inner_map_meta;
571 561
572static int __init register_cgroup_array_map(void) 562 map = fd_array_map_alloc(attr);
563 if (IS_ERR(map)) {
564 bpf_map_meta_free(inner_map_meta);
565 return map;
566 }
567
568 map->inner_map_meta = inner_map_meta;
569
570 return map;
571}
572
573static void array_of_map_free(struct bpf_map *map)
573{ 574{
574 bpf_register_map_type(&cgroup_array_type); 575 /* map->inner_map_meta is only accessed by syscall which
575 return 0; 576 * is protected by fdget/fdput.
577 */
578 bpf_map_meta_free(map->inner_map_meta);
579 bpf_fd_array_map_clear(map);
580 fd_array_map_free(map);
576} 581}
577late_initcall(register_cgroup_array_map); 582
578#endif 583static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
584{
585 struct bpf_map **inner_map = array_map_lookup_elem(map, key);
586
587 if (!inner_map)
588 return NULL;
589
590 return READ_ONCE(*inner_map);
591}
592
593const struct bpf_map_ops array_of_maps_map_ops = {
594 .map_alloc = array_of_map_alloc,
595 .map_free = array_of_map_free,
596 .map_get_next_key = array_map_get_next_key,
597 .map_lookup_elem = array_of_map_lookup_elem,
598 .map_delete_elem = fd_array_map_delete_elem,
599 .map_fd_get_ptr = bpf_map_fd_get_ptr,
600 .map_fd_put_ptr = bpf_map_fd_put_ptr,
601};
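
array_of_map_alloc() consumes the new attr->inner_map_fd to capture the inner map's metadata before the outer fd array is sized. A minimal user-space sketch of creating such a map through the bpf(2) syscall (assumes uapi headers that already carry BPF_MAP_TYPE_ARRAY_OF_MAPS and the inner_map_fd attribute from this series; error handling trimmed):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int bpf_sys(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

static int create_array_of_maps(void)
{
	union bpf_attr attr;
	int inner_fd, outer_fd;

	/* the inner map whose shape all later elements must match */
	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = sizeof(int);
	attr.value_size  = sizeof(long long);
	attr.max_entries = 16;
	inner_fd = bpf_sys(BPF_MAP_CREATE, &attr);

	/* outer map: user space sees 4-byte fds, the kernel stores pointers */
	memset(&attr, 0, sizeof(attr));
	attr.map_type     = BPF_MAP_TYPE_ARRAY_OF_MAPS;
	attr.key_size     = sizeof(int);
	attr.value_size   = sizeof(int);
	attr.max_entries  = 8;
	attr.inner_map_fd = inner_fd;	/* only consulted at create time */
	outer_fd = bpf_sys(BPF_MAP_CREATE, &attr);

	return outer_fd;
}

int main(void)
{
	return create_array_of_maps() < 0;
}

Elements are then installed with BPF_MAP_UPDATE_ELEM, passing an inner map fd as the 4-byte value, which bpf_map_fd_get_ptr() converts into a held map reference.
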
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index f62d1d56f41d..e6ef4401a138 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -13,7 +13,7 @@
13#define LOCAL_FREE_TARGET (128) 13#define LOCAL_FREE_TARGET (128)
14#define LOCAL_NR_SCANS LOCAL_FREE_TARGET 14#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
15 15
16#define PERCPU_FREE_TARGET (16) 16#define PERCPU_FREE_TARGET (4)
17#define PERCPU_NR_SCANS PERCPU_FREE_TARGET 17#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
18 18
19/* Helpers to get the local list index */ 19/* Helpers to get the local list index */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index da0f53690295..ea6033cba947 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -154,7 +154,7 @@ int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
154 154
155/** 155/**
156 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering 156 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
157 * @sk: The socken sending or receiving traffic 157 * @sk: The socket sending or receiving traffic
158 * @skb: The skb that is being sent or received 158 * @skb: The skb that is being sent or received
159 * @type: The type of program to be exectuted 159 * @type: The type of program to be exectuted
159 * @type: The type of program to be executed 159 * @type: The type of program to be executed
160 * 160 *
@@ -189,10 +189,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
189 prog = rcu_dereference(cgrp->bpf.effective[type]); 189 prog = rcu_dereference(cgrp->bpf.effective[type]);
190 if (prog) { 190 if (prog) {
191 unsigned int offset = skb->data - skb_network_header(skb); 191 unsigned int offset = skb->data - skb_network_header(skb);
192 struct sock *save_sk = skb->sk;
192 193
194 skb->sk = sk;
193 __skb_push(skb, offset); 195 __skb_push(skb, offset);
194 ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; 196 ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
195 __skb_pull(skb, offset); 197 __skb_pull(skb, offset);
198 skb->sk = save_sk;
196 } 199 }
197 200
198 rcu_read_unlock(); 201 rcu_read_unlock();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b4f1cb0c5ac7..dedf367f59bb 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -76,8 +76,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
76 76
77struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) 77struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
78{ 78{
79 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | 79 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
80 gfp_extra_flags;
81 struct bpf_prog_aux *aux; 80 struct bpf_prog_aux *aux;
82 struct bpf_prog *fp; 81 struct bpf_prog *fp;
83 82
@@ -107,8 +106,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_alloc);
107struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, 106struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
108 gfp_t gfp_extra_flags) 107 gfp_t gfp_extra_flags)
109{ 108{
110 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | 109 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
111 gfp_extra_flags;
112 struct bpf_prog *fp; 110 struct bpf_prog *fp;
113 u32 pages, delta; 111 u32 pages, delta;
114 int ret; 112 int ret;
@@ -394,27 +392,23 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
394 392
395void bpf_prog_kallsyms_add(struct bpf_prog *fp) 393void bpf_prog_kallsyms_add(struct bpf_prog *fp)
396{ 394{
397 unsigned long flags;
398
399 if (!bpf_prog_kallsyms_candidate(fp) || 395 if (!bpf_prog_kallsyms_candidate(fp) ||
400 !capable(CAP_SYS_ADMIN)) 396 !capable(CAP_SYS_ADMIN))
401 return; 397 return;
402 398
403 spin_lock_irqsave(&bpf_lock, flags); 399 spin_lock_bh(&bpf_lock);
404 bpf_prog_ksym_node_add(fp->aux); 400 bpf_prog_ksym_node_add(fp->aux);
405 spin_unlock_irqrestore(&bpf_lock, flags); 401 spin_unlock_bh(&bpf_lock);
406} 402}
407 403
408void bpf_prog_kallsyms_del(struct bpf_prog *fp) 404void bpf_prog_kallsyms_del(struct bpf_prog *fp)
409{ 405{
410 unsigned long flags;
411
412 if (!bpf_prog_kallsyms_candidate(fp)) 406 if (!bpf_prog_kallsyms_candidate(fp))
413 return; 407 return;
414 408
415 spin_lock_irqsave(&bpf_lock, flags); 409 spin_lock_bh(&bpf_lock);
416 bpf_prog_ksym_node_del(fp->aux); 410 bpf_prog_ksym_node_del(fp->aux);
417 spin_unlock_irqrestore(&bpf_lock, flags); 411 spin_unlock_bh(&bpf_lock);
418} 412}
419 413
420static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) 414static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
@@ -659,8 +653,7 @@ out:
659static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, 653static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
660 gfp_t gfp_extra_flags) 654 gfp_t gfp_extra_flags)
661{ 655{
662 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | 656 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
663 gfp_extra_flags;
664 struct bpf_prog *fp; 657 struct bpf_prog *fp;
665 658
666 fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); 659 fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 361a69dfe543..004334ea13ba 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -16,6 +16,7 @@
16#include <linux/rculist_nulls.h> 16#include <linux/rculist_nulls.h>
17#include "percpu_freelist.h" 17#include "percpu_freelist.h"
18#include "bpf_lru_list.h" 18#include "bpf_lru_list.h"
19#include "map_in_map.h"
19 20
20struct bucket { 21struct bucket {
21 struct hlist_nulls_head head; 22 struct hlist_nulls_head head;
@@ -86,6 +87,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size
86 return *(void __percpu **)(l->key + key_size); 87 return *(void __percpu **)(l->key + key_size);
87} 88}
88 89
90static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
91{
92 return *(void **)(l->key + roundup(map->key_size, 8));
93}
94
89static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) 95static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
90{ 96{
91 return (struct htab_elem *) (htab->elems + i * htab->elem_size); 97 return (struct htab_elem *) (htab->elems + i * htab->elem_size);
@@ -426,7 +432,11 @@ again:
426 return NULL; 432 return NULL;
427} 433}
428 434
429/* Called from syscall or from eBPF program */ 435/* Called from syscall or from eBPF program directly, so
436 * arguments have to match bpf_map_lookup_elem() exactly.
437 * The return value is adjusted by BPF instructions
438 * in htab_map_gen_lookup().
439 */
430static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) 440static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
431{ 441{
432 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 442 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -458,6 +468,30 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
458 return NULL; 468 return NULL;
459} 469}
460 470
471/* inline bpf_map_lookup_elem() call.
472 * Instead of:
473 * bpf_prog
474 * bpf_map_lookup_elem
475 * map->ops->map_lookup_elem
476 * htab_map_lookup_elem
477 * __htab_map_lookup_elem
478 * do:
479 * bpf_prog
480 * __htab_map_lookup_elem
481 */
482static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
483{
484 struct bpf_insn *insn = insn_buf;
485 const int ret = BPF_REG_0;
486
487 *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
488 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
489 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
490 offsetof(struct htab_elem, key) +
491 round_up(map->key_size, 8));
492 return insn - insn_buf;
493}
494
461static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) 495static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
462{ 496{
463 struct htab_elem *l = __htab_map_lookup_elem(map, key); 497 struct htab_elem *l = __htab_map_lookup_elem(map, key);
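
htab_map_gen_lookup() lets the verifier turn the helper call into a direct call to __htab_map_lookup_elem() followed by one constant add that converts the returned element pointer into a value pointer. A small standalone sketch of that offset arithmetic (the demo struct only imitates the real htab_elem layout, where bookkeeping precedes key[] and the value follows the key rounded up to 8 bytes):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_htab_elem {
	uint64_t bookkeeping[2];	/* list/rcu state in the real struct */
	char key[];			/* value bytes follow, 8-byte aligned */
};

#define ROUND_UP8(x) (((x) + 7u) & ~7u)

int main(void)
{
	uint32_t key_size = 5;

	/* the constant added by the patched BPF_ALU64_IMM(BPF_ADD, ret, ...)
	 * whenever the call returns a non-NULL element */
	printf("value offset = %zu\n",
	       offsetof(struct demo_htab_elem, key) + ROUND_UP8(key_size));
	return 0;
}
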
@@ -506,12 +540,15 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
506 struct hlist_nulls_head *head; 540 struct hlist_nulls_head *head;
507 struct htab_elem *l, *next_l; 541 struct htab_elem *l, *next_l;
508 u32 hash, key_size; 542 u32 hash, key_size;
509 int i; 543 int i = 0;
510 544
511 WARN_ON_ONCE(!rcu_read_lock_held()); 545 WARN_ON_ONCE(!rcu_read_lock_held());
512 546
513 key_size = map->key_size; 547 key_size = map->key_size;
514 548
549 if (!key)
550 goto find_first_elem;
551
515 hash = htab_map_hash(key, key_size); 552 hash = htab_map_hash(key, key_size);
516 553
517 head = select_bucket(htab, hash); 554 head = select_bucket(htab, hash);
@@ -519,10 +556,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
519 /* lookup the key */ 556 /* lookup the key */
520 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); 557 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
521 558
522 if (!l) { 559 if (!l)
523 i = 0;
524 goto find_first_elem; 560 goto find_first_elem;
525 }
526 561
527 /* key was found, get next key in the same bucket */ 562 /* key was found, get next key in the same bucket */
528 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), 563 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
@@ -582,6 +617,14 @@ static void htab_elem_free_rcu(struct rcu_head *head)
582 617
583static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) 618static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
584{ 619{
620 struct bpf_map *map = &htab->map;
621
622 if (map->ops->map_fd_put_ptr) {
623 void *ptr = fd_htab_map_get_ptr(map, l);
624
625 map->ops->map_fd_put_ptr(ptr);
626 }
627
585 if (htab_is_prealloc(htab)) { 628 if (htab_is_prealloc(htab)) {
586 pcpu_freelist_push(&htab->freelist, &l->fnode); 629 pcpu_freelist_push(&htab->freelist, &l->fnode);
587 } else { 630 } else {
@@ -1027,6 +1070,7 @@ static void delete_all_elements(struct bpf_htab *htab)
1027 } 1070 }
1028 } 1071 }
1029} 1072}
1073
1030/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ 1074/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1031static void htab_map_free(struct bpf_map *map) 1075static void htab_map_free(struct bpf_map *map)
1032{ 1076{
@@ -1053,21 +1097,17 @@ static void htab_map_free(struct bpf_map *map)
1053 kfree(htab); 1097 kfree(htab);
1054} 1098}
1055 1099
1056static const struct bpf_map_ops htab_ops = { 1100const struct bpf_map_ops htab_map_ops = {
1057 .map_alloc = htab_map_alloc, 1101 .map_alloc = htab_map_alloc,
1058 .map_free = htab_map_free, 1102 .map_free = htab_map_free,
1059 .map_get_next_key = htab_map_get_next_key, 1103 .map_get_next_key = htab_map_get_next_key,
1060 .map_lookup_elem = htab_map_lookup_elem, 1104 .map_lookup_elem = htab_map_lookup_elem,
1061 .map_update_elem = htab_map_update_elem, 1105 .map_update_elem = htab_map_update_elem,
1062 .map_delete_elem = htab_map_delete_elem, 1106 .map_delete_elem = htab_map_delete_elem,
1107 .map_gen_lookup = htab_map_gen_lookup,
1063}; 1108};
1064 1109
1065static struct bpf_map_type_list htab_type __ro_after_init = { 1110const struct bpf_map_ops htab_lru_map_ops = {
1066 .ops = &htab_ops,
1067 .type = BPF_MAP_TYPE_HASH,
1068};
1069
1070static const struct bpf_map_ops htab_lru_ops = {
1071 .map_alloc = htab_map_alloc, 1111 .map_alloc = htab_map_alloc,
1072 .map_free = htab_map_free, 1112 .map_free = htab_map_free,
1073 .map_get_next_key = htab_map_get_next_key, 1113 .map_get_next_key = htab_map_get_next_key,
@@ -1076,11 +1116,6 @@ static const struct bpf_map_ops htab_lru_ops = {
1076 .map_delete_elem = htab_lru_map_delete_elem, 1116 .map_delete_elem = htab_lru_map_delete_elem,
1077}; 1117};
1078 1118
1079static struct bpf_map_type_list htab_lru_type __ro_after_init = {
1080 .ops = &htab_lru_ops,
1081 .type = BPF_MAP_TYPE_LRU_HASH,
1082};
1083
1084/* Called from eBPF program */ 1119/* Called from eBPF program */
1085static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) 1120static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1086{ 1121{
@@ -1154,7 +1189,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
1154 return ret; 1189 return ret;
1155} 1190}
1156 1191
1157static const struct bpf_map_ops htab_percpu_ops = { 1192const struct bpf_map_ops htab_percpu_map_ops = {
1158 .map_alloc = htab_map_alloc, 1193 .map_alloc = htab_map_alloc,
1159 .map_free = htab_map_free, 1194 .map_free = htab_map_free,
1160 .map_get_next_key = htab_map_get_next_key, 1195 .map_get_next_key = htab_map_get_next_key,
@@ -1163,12 +1198,7 @@ static const struct bpf_map_ops htab_percpu_ops = {
1163 .map_delete_elem = htab_map_delete_elem, 1198 .map_delete_elem = htab_map_delete_elem,
1164}; 1199};
1165 1200
1166static struct bpf_map_type_list htab_percpu_type __ro_after_init = { 1201const struct bpf_map_ops htab_lru_percpu_map_ops = {
1167 .ops = &htab_percpu_ops,
1168 .type = BPF_MAP_TYPE_PERCPU_HASH,
1169};
1170
1171static const struct bpf_map_ops htab_lru_percpu_ops = {
1172 .map_alloc = htab_map_alloc, 1202 .map_alloc = htab_map_alloc,
1173 .map_free = htab_map_free, 1203 .map_free = htab_map_free,
1174 .map_get_next_key = htab_map_get_next_key, 1204 .map_get_next_key = htab_map_get_next_key,
@@ -1177,17 +1207,102 @@ static const struct bpf_map_ops htab_lru_percpu_ops = {
1177 .map_delete_elem = htab_lru_map_delete_elem, 1207 .map_delete_elem = htab_lru_map_delete_elem,
1178}; 1208};
1179 1209
1180static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { 1210static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
1181 .ops = &htab_lru_percpu_ops, 1211{
1182 .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, 1212 struct bpf_map *map;
1183}; 1213
1214 if (attr->value_size != sizeof(u32))
1215 return ERR_PTR(-EINVAL);
1216
1217 /* pointer is stored internally */
1218 attr->value_size = sizeof(void *);
1219 map = htab_map_alloc(attr);
1220 attr->value_size = sizeof(u32);
1184 1221
1185static int __init register_htab_map(void) 1222 return map;
1223}
1224
1225static void fd_htab_map_free(struct bpf_map *map)
1186{ 1226{
1187 bpf_register_map_type(&htab_type); 1227 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1188 bpf_register_map_type(&htab_percpu_type); 1228 struct hlist_nulls_node *n;
1189 bpf_register_map_type(&htab_lru_type); 1229 struct hlist_nulls_head *head;
1190 bpf_register_map_type(&htab_lru_percpu_type); 1230 struct htab_elem *l;
1191 return 0; 1231 int i;
1232
1233 for (i = 0; i < htab->n_buckets; i++) {
1234 head = select_bucket(htab, i);
1235
1236 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1237 void *ptr = fd_htab_map_get_ptr(map, l);
1238
1239 map->ops->map_fd_put_ptr(ptr);
1240 }
1241 }
1242
1243 htab_map_free(map);
1244}
1245
1246/* only called from syscall */
1247int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
1248 void *key, void *value, u64 map_flags)
1249{
1250 void *ptr;
1251 int ret;
1252 u32 ufd = *(u32 *)value;
1253
1254 ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
1255 if (IS_ERR(ptr))
1256 return PTR_ERR(ptr);
1257
1258 ret = htab_map_update_elem(map, key, &ptr, map_flags);
1259 if (ret)
1260 map->ops->map_fd_put_ptr(ptr);
1261
1262 return ret;
1263}
1264
1265static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
1266{
1267 struct bpf_map *map, *inner_map_meta;
1268
1269 inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
1270 if (IS_ERR(inner_map_meta))
1271 return inner_map_meta;
1272
1273 map = fd_htab_map_alloc(attr);
1274 if (IS_ERR(map)) {
1275 bpf_map_meta_free(inner_map_meta);
1276 return map;
1277 }
1278
1279 map->inner_map_meta = inner_map_meta;
1280
1281 return map;
1192} 1282}
1193late_initcall(register_htab_map); 1283
1284static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
1285{
1286 struct bpf_map **inner_map = htab_map_lookup_elem(map, key);
1287
1288 if (!inner_map)
1289 return NULL;
1290
1291 return READ_ONCE(*inner_map);
1292}
1293
1294static void htab_of_map_free(struct bpf_map *map)
1295{
1296 bpf_map_meta_free(map->inner_map_meta);
1297 fd_htab_map_free(map);
1298}
1299
1300const struct bpf_map_ops htab_of_maps_map_ops = {
1301 .map_alloc = htab_of_map_alloc,
1302 .map_free = htab_of_map_free,
1303 .map_get_next_key = htab_map_get_next_key,
1304 .map_lookup_elem = htab_of_map_lookup_elem,
1305 .map_delete_elem = htab_map_delete_elem,
1306 .map_fd_get_ptr = bpf_map_fd_get_ptr,
1307 .map_fd_put_ptr = bpf_map_fd_put_ptr,
1308};
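
With htab_of_map_lookup_elem() handing back the inner map pointer, a program chains two bpf_map_lookup_elem() calls: the first on the outer BPF_MAP_TYPE_HASH_OF_MAPS map, the second on whatever inner map that key selects. A restricted-C sketch in the style of the samples/bpf programs of this period (the bpf_map_def layout, the inner_map_idx convention and the section names are loader-specific assumptions; only the helper and the new map type come from the kernel side):

/* clang -O2 -target bpf -c hash_of_maps_kern.c */
#include <linux/bpf.h>

/* helper stub as declared by the samples' bpf_helpers.h of the time */
static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	(void *) BPF_FUNC_map_lookup_elem;

struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
	unsigned int map_flags;
	unsigned int inner_map_idx;	/* which ELF map supplies the shape */
};

struct bpf_map_def outer_map __attribute__((section("maps"), used)) = {
	.type          = BPF_MAP_TYPE_HASH_OF_MAPS,
	.key_size      = sizeof(int),
	.value_size    = sizeof(int),	/* an inner map fd, per the patch */
	.max_entries   = 16,
	.inner_map_idx = 0,
};

__attribute__((section("socket"), used))
int two_level_lookup(struct __sk_buff *skb)
{
	int outer_key = 1, inner_key = 0;
	void *inner_map;
	long *value;

	/* first lookup resolves to the inner map pointer (or NULL) */
	inner_map = bpf_map_lookup_elem(&outer_map, &outer_key);
	if (!inner_map)
		return 0;

	/* second lookup is checked using the inner map's saved metadata */
	value = bpf_map_lookup_elem(inner_map, &inner_key);
	return value ? *value != 0 : 0;
}
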
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index fddcae801724..9bbd33497d3d 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -429,7 +429,7 @@ static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
429 429
430static int bpf_fill_super(struct super_block *sb, void *data, int silent) 430static int bpf_fill_super(struct super_block *sb, void *data, int silent)
431{ 431{
432 static struct tree_descr bpf_rfiles[] = { { "" } }; 432 static const struct tree_descr bpf_rfiles[] = { { "" } };
433 struct bpf_mount_opts opts; 433 struct bpf_mount_opts opts;
434 struct inode *inode; 434 struct inode *inode;
435 int ret; 435 int ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b37bd9ab7f57..39cfafd895b8 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -505,7 +505,7 @@ static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
505 return -ENOTSUPP; 505 return -ENOTSUPP;
506} 506}
507 507
508static const struct bpf_map_ops trie_ops = { 508const struct bpf_map_ops trie_map_ops = {
509 .map_alloc = trie_alloc, 509 .map_alloc = trie_alloc,
510 .map_free = trie_free, 510 .map_free = trie_free,
511 .map_get_next_key = trie_get_next_key, 511 .map_get_next_key = trie_get_next_key,
@@ -513,15 +513,3 @@ static const struct bpf_map_ops trie_ops = {
513 .map_update_elem = trie_update_elem, 513 .map_update_elem = trie_update_elem,
514 .map_delete_elem = trie_delete_elem, 514 .map_delete_elem = trie_delete_elem,
515}; 515};
516
517static struct bpf_map_type_list trie_type __ro_after_init = {
518 .ops = &trie_ops,
519 .type = BPF_MAP_TYPE_LPM_TRIE,
520};
521
522static int __init register_trie_map(void)
523{
524 bpf_register_map_type(&trie_type);
525 return 0;
526}
527late_initcall(register_trie_map);
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
new file mode 100644
index 000000000000..59bcdf821ae4
--- /dev/null
+++ b/kernel/bpf/map_in_map.c
@@ -0,0 +1,97 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/slab.h>
8#include <linux/bpf.h>
9
10#include "map_in_map.h"
11
12struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
13{
14 struct bpf_map *inner_map, *inner_map_meta;
15 struct fd f;
16
17 f = fdget(inner_map_ufd);
18 inner_map = __bpf_map_get(f);
19 if (IS_ERR(inner_map))
20 return inner_map;
21
22 /* prog_array->owner_prog_type and owner_jited
23 * is a runtime binding. Doing static check alone
24 * in the verifier is not enough.
25 */
26 if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
27 fdput(f);
28 return ERR_PTR(-ENOTSUPP);
29 }
30
31 /* Does not support >1 level map-in-map */
32 if (inner_map->inner_map_meta) {
33 fdput(f);
34 return ERR_PTR(-EINVAL);
35 }
36
37 inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER);
38 if (!inner_map_meta) {
39 fdput(f);
40 return ERR_PTR(-ENOMEM);
41 }
42
43 inner_map_meta->map_type = inner_map->map_type;
44 inner_map_meta->key_size = inner_map->key_size;
45 inner_map_meta->value_size = inner_map->value_size;
46 inner_map_meta->map_flags = inner_map->map_flags;
47 inner_map_meta->ops = inner_map->ops;
48 inner_map_meta->max_entries = inner_map->max_entries;
49
50 fdput(f);
51 return inner_map_meta;
52}
53
54void bpf_map_meta_free(struct bpf_map *map_meta)
55{
56 kfree(map_meta);
57}
58
59bool bpf_map_meta_equal(const struct bpf_map *meta0,
60 const struct bpf_map *meta1)
61{
62 /* No need to compare ops because it is covered by map_type */
63 return meta0->map_type == meta1->map_type &&
64 meta0->key_size == meta1->key_size &&
65 meta0->value_size == meta1->value_size &&
66 meta0->map_flags == meta1->map_flags &&
67 meta0->max_entries == meta1->max_entries;
68}
69
70void *bpf_map_fd_get_ptr(struct bpf_map *map,
71 struct file *map_file /* not used */,
72 int ufd)
73{
74 struct bpf_map *inner_map;
75 struct fd f;
76
77 f = fdget(ufd);
78 inner_map = __bpf_map_get(f);
79 if (IS_ERR(inner_map))
80 return inner_map;
81
82 if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
83 inner_map = bpf_map_inc(inner_map, false);
84 else
85 inner_map = ERR_PTR(-EINVAL);
86
87 fdput(f);
88 return inner_map;
89}
90
91void bpf_map_fd_put_ptr(void *ptr)
92{
93 /* ptr->ops->map_free() has to go through one
94 * rcu grace period by itself.
95 */
96 bpf_map_put(ptr);
97}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
new file mode 100644
index 000000000000..177fadb689dc
--- /dev/null
+++ b/kernel/bpf/map_in_map.h
@@ -0,0 +1,23 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#ifndef __MAP_IN_MAP_H__
8#define __MAP_IN_MAP_H__
9
10#include <linux/types.h>
11
12struct file;
13struct bpf_map;
14
15struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
16void bpf_map_meta_free(struct bpf_map *map_meta);
17bool bpf_map_meta_equal(const struct bpf_map *meta0,
18 const struct bpf_map *meta1);
19void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
20 int ufd);
21void bpf_map_fd_put_ptr(void *ptr);
22
23#endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 22aa45cd0324..4dfd6f2ec2f9 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -264,7 +264,7 @@ static void stack_map_free(struct bpf_map *map)
264 put_callchain_buffers(); 264 put_callchain_buffers();
265} 265}
266 266
267static const struct bpf_map_ops stack_map_ops = { 267const struct bpf_map_ops stack_map_ops = {
268 .map_alloc = stack_map_alloc, 268 .map_alloc = stack_map_alloc,
269 .map_free = stack_map_free, 269 .map_free = stack_map_free,
270 .map_get_next_key = stack_map_get_next_key, 270 .map_get_next_key = stack_map_get_next_key,
@@ -272,15 +272,3 @@ static const struct bpf_map_ops stack_map_ops = {
272 .map_update_elem = stack_map_update_elem, 272 .map_update_elem = stack_map_update_elem,
273 .map_delete_elem = stack_map_delete_elem, 273 .map_delete_elem = stack_map_delete_elem,
274}; 274};
275
276static struct bpf_map_type_list stack_map_type __ro_after_init = {
277 .ops = &stack_map_ops,
278 .type = BPF_MAP_TYPE_STACK_TRACE,
279};
280
281static int __init register_stack_map(void)
282{
283 bpf_register_map_type(&stack_map_type);
284 return 0;
285}
286late_initcall(register_stack_map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 821f9e807de5..265a0d854e33 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,30 +27,29 @@ DEFINE_PER_CPU(int, bpf_prog_active);
27 27
28int sysctl_unprivileged_bpf_disabled __read_mostly; 28int sysctl_unprivileged_bpf_disabled __read_mostly;
29 29
30static LIST_HEAD(bpf_map_types); 30static const struct bpf_map_ops * const bpf_map_types[] = {
31#define BPF_PROG_TYPE(_id, _ops)
32#define BPF_MAP_TYPE(_id, _ops) \
33 [_id] = &_ops,
34#include <linux/bpf_types.h>
35#undef BPF_PROG_TYPE
36#undef BPF_MAP_TYPE
37};
31 38
32static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 39static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
33{ 40{
34 struct bpf_map_type_list *tl;
35 struct bpf_map *map; 41 struct bpf_map *map;
36 42
37 list_for_each_entry(tl, &bpf_map_types, list_node) { 43 if (attr->map_type >= ARRAY_SIZE(bpf_map_types) ||
38 if (tl->type == attr->map_type) { 44 !bpf_map_types[attr->map_type])
39 map = tl->ops->map_alloc(attr); 45 return ERR_PTR(-EINVAL);
40 if (IS_ERR(map))
41 return map;
42 map->ops = tl->ops;
43 map->map_type = attr->map_type;
44 return map;
45 }
46 }
47 return ERR_PTR(-EINVAL);
48}
49 46
50/* boot time registration of different map implementations */ 47 map = bpf_map_types[attr->map_type]->map_alloc(attr);
51void bpf_register_map_type(struct bpf_map_type_list *tl) 48 if (IS_ERR(map))
52{ 49 return map;
53 list_add(&tl->list_node, &bpf_map_types); 50 map->ops = bpf_map_types[attr->map_type];
51 map->map_type = attr->map_type;
52 return map;
54} 53}
55 54
56void *bpf_map_area_alloc(size_t size) 55void *bpf_map_area_alloc(size_t size)
@@ -68,8 +67,7 @@ void *bpf_map_area_alloc(size_t size)
68 return area; 67 return area;
69 } 68 }
70 69
71 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags, 70 return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);
72 PAGE_KERNEL);
73} 71}
74 72
75void bpf_map_area_free(void *area) 73void bpf_map_area_free(void *area)
@@ -215,7 +213,7 @@ int bpf_map_new_fd(struct bpf_map *map)
215 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ 213 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
216 sizeof(attr->CMD##_LAST_FIELD)) != NULL 214 sizeof(attr->CMD##_LAST_FIELD)) != NULL
217 215
218#define BPF_MAP_CREATE_LAST_FIELD map_flags 216#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
219/* called via syscall */ 217/* called via syscall */
220static int map_create(union bpf_attr *attr) 218static int map_create(union bpf_attr *attr)
221{ 219{
@@ -352,6 +350,9 @@ static int map_lookup_elem(union bpf_attr *attr)
352 err = bpf_percpu_array_copy(map, key, value); 350 err = bpf_percpu_array_copy(map, key, value);
353 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 351 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
354 err = bpf_stackmap_copy(map, key, value); 352 err = bpf_stackmap_copy(map, key, value);
353 } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
354 map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
355 err = -ENOTSUPP;
355 } else { 356 } else {
356 rcu_read_lock(); 357 rcu_read_lock();
357 ptr = map->ops->map_lookup_elem(map, key); 358 ptr = map->ops->map_lookup_elem(map, key);
@@ -438,11 +439,17 @@ static int map_update_elem(union bpf_attr *attr)
438 err = bpf_percpu_array_update(map, key, value, attr->flags); 439 err = bpf_percpu_array_update(map, key, value, attr->flags);
439 } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || 440 } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
440 map->map_type == BPF_MAP_TYPE_PROG_ARRAY || 441 map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
441 map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { 442 map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
443 map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
442 rcu_read_lock(); 444 rcu_read_lock();
443 err = bpf_fd_array_map_update_elem(map, f.file, key, value, 445 err = bpf_fd_array_map_update_elem(map, f.file, key, value,
444 attr->flags); 446 attr->flags);
445 rcu_read_unlock(); 447 rcu_read_unlock();
448 } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
449 rcu_read_lock();
450 err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
451 attr->flags);
452 rcu_read_unlock();
446 } else { 453 } else {
447 rcu_read_lock(); 454 rcu_read_lock();
448 err = map->ops->map_update_elem(map, key, value, attr->flags); 455 err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -528,14 +535,18 @@ static int map_get_next_key(union bpf_attr *attr)
528 if (IS_ERR(map)) 535 if (IS_ERR(map))
529 return PTR_ERR(map); 536 return PTR_ERR(map);
530 537
531 err = -ENOMEM; 538 if (ukey) {
532 key = kmalloc(map->key_size, GFP_USER); 539 err = -ENOMEM;
533 if (!key) 540 key = kmalloc(map->key_size, GFP_USER);
534 goto err_put; 541 if (!key)
542 goto err_put;
535 543
536 err = -EFAULT; 544 err = -EFAULT;
537 if (copy_from_user(key, ukey, map->key_size) != 0) 545 if (copy_from_user(key, ukey, map->key_size) != 0)
538 goto free_key; 546 goto free_key;
547 } else {
548 key = NULL;
549 }
539 550
540 err = -ENOMEM; 551 err = -ENOMEM;
541 next_key = kmalloc(map->key_size, GFP_USER); 552 next_key = kmalloc(map->key_size, GFP_USER);
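
Allowing ukey == NULL means user space no longer has to guess an existing key to start walking a map: a NULL key asks for the first key. A sketch of the resulting iteration loop (a fragment that assumes map_fd refers to a live map with 4-byte keys and headers from this series' uapi):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static __u64 ptr_to_u64(const void *p)
{
	return (__u64)(unsigned long)p;
}

static void dump_keys(int map_fd)
{
	union bpf_attr attr;
	__u32 key, next_key;
	int have_key = 0;			/* first call: key == NULL */

	for (;;) {
		memset(&attr, 0, sizeof(attr));
		attr.map_fd   = map_fd;
		attr.key      = have_key ? ptr_to_u64(&key) : 0;
		attr.next_key = ptr_to_u64(&next_key);

		if (syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr,
			    sizeof(attr)) < 0)
			break;			/* ENOENT: map exhausted */

		printf("key %u\n", next_key);
		key = next_key;
		have_key = 1;
	}
}

The hashtab and arraymap hunks above supply the kernel half: a NULL key falls through to the find-first-element path (hash) or is treated as U32_MAX (array).
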
@@ -564,87 +575,23 @@ err_put:
564 return err; 575 return err;
565} 576}
566 577
567static LIST_HEAD(bpf_prog_types); 578static const struct bpf_verifier_ops * const bpf_prog_types[] = {
579#define BPF_PROG_TYPE(_id, _ops) \
580 [_id] = &_ops,
581#define BPF_MAP_TYPE(_id, _ops)
582#include <linux/bpf_types.h>
583#undef BPF_PROG_TYPE
584#undef BPF_MAP_TYPE
585};
568 586
569static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) 587static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
570{ 588{
571 struct bpf_prog_type_list *tl; 589 if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
572 590 return -EINVAL;
573 list_for_each_entry(tl, &bpf_prog_types, list_node) {
574 if (tl->type == type) {
575 prog->aux->ops = tl->ops;
576 prog->type = type;
577 return 0;
578 }
579 }
580
581 return -EINVAL;
582}
583
584void bpf_register_prog_type(struct bpf_prog_type_list *tl)
585{
586 list_add(&tl->list_node, &bpf_prog_types);
587}
588
589/* fixup insn->imm field of bpf_call instructions:
590 * if (insn->imm == BPF_FUNC_map_lookup_elem)
591 * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
592 * else if (insn->imm == BPF_FUNC_map_update_elem)
593 * insn->imm = bpf_map_update_elem - __bpf_call_base;
594 * else ...
595 *
596 * this function is called after eBPF program passed verification
597 */
598static void fixup_bpf_calls(struct bpf_prog *prog)
599{
600 const struct bpf_func_proto *fn;
601 int i;
602 591
603 for (i = 0; i < prog->len; i++) { 592 prog->aux->ops = bpf_prog_types[type];
604 struct bpf_insn *insn = &prog->insnsi[i]; 593 prog->type = type;
605 594 return 0;
606 if (insn->code == (BPF_JMP | BPF_CALL)) {
607 /* we reach here when program has bpf_call instructions
608 * and it passed bpf_check(), means that
609 * ops->get_func_proto must have been supplied, check it
610 */
611 BUG_ON(!prog->aux->ops->get_func_proto);
612
613 if (insn->imm == BPF_FUNC_get_route_realm)
614 prog->dst_needed = 1;
615 if (insn->imm == BPF_FUNC_get_prandom_u32)
616 bpf_user_rnd_init_once();
617 if (insn->imm == BPF_FUNC_xdp_adjust_head)
618 prog->xdp_adjust_head = 1;
619 if (insn->imm == BPF_FUNC_tail_call) {
620 /* If we tail call into other programs, we
621 * cannot make any assumptions since they
622 * can be replaced dynamically during runtime
623 * in the program array.
624 */
625 prog->cb_access = 1;
626 prog->xdp_adjust_head = 1;
627
628 /* mark bpf_tail_call as different opcode
629 * to avoid conditional branch in
630 * interpeter for every normal call
630 * interpreter for every normal call
631 * and to prevent accidental JITing by
632 * JIT compiler that doesn't support
633 * bpf_tail_call yet
634 */
635 insn->imm = 0;
636 insn->code |= BPF_X;
637 continue;
638 }
639
640 fn = prog->aux->ops->get_func_proto(insn->imm);
641 /* all functions that have prototype and verifier allowed
642 * programs to call them, must be real in-kernel functions
643 */
644 BUG_ON(!fn->func);
645 insn->imm = fn->func - __bpf_call_base;
646 }
647 }
648} 595}
649 596
650/* drop refcnt on maps used by eBPF program and free auxilary data */ 597/* drop refcnt on maps used by eBPF program and free auxilary data */
@@ -836,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
836EXPORT_SYMBOL_GPL(bpf_prog_get_type); 783EXPORT_SYMBOL_GPL(bpf_prog_get_type);
837 784
838/* last field in 'union bpf_attr' used by this command */ 785/* last field in 'union bpf_attr' used by this command */
839#define BPF_PROG_LOAD_LAST_FIELD kern_version 786#define BPF_PROG_LOAD_LAST_FIELD prog_flags
840 787
841static int bpf_prog_load(union bpf_attr *attr) 788static int bpf_prog_load(union bpf_attr *attr)
842{ 789{
@@ -849,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr)
849 if (CHECK_ATTR(BPF_PROG_LOAD)) 796 if (CHECK_ATTR(BPF_PROG_LOAD))
850 return -EINVAL; 797 return -EINVAL;
851 798
799 if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
800 return -EINVAL;
801
852 /* copy eBPF program license from user space */ 802 /* copy eBPF program license from user space */
853 if (strncpy_from_user(license, u64_to_user_ptr(attr->license), 803 if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
854 sizeof(license) - 1) < 0) 804 sizeof(license) - 1) < 0)
@@ -900,9 +850,6 @@ static int bpf_prog_load(union bpf_attr *attr)
900 if (err < 0) 850 if (err < 0)
901 goto free_used_maps; 851 goto free_used_maps;
902 852
903 /* fixup BPF_CALL->imm field */
904 fixup_bpf_calls(prog);
905
906 /* eBPF program is ready to be JITed */ 853 /* eBPF program is ready to be JITed */
907 prog = bpf_prog_select_runtime(prog, &err); 854 prog = bpf_prog_select_runtime(prog, &err);
908 if (err < 0) 855 if (err < 0)
@@ -1028,6 +975,28 @@ static int bpf_prog_detach(const union bpf_attr *attr)
1028} 975}
1029#endif /* CONFIG_CGROUP_BPF */ 976#endif /* CONFIG_CGROUP_BPF */
1030 977
978#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
979
980static int bpf_prog_test_run(const union bpf_attr *attr,
981 union bpf_attr __user *uattr)
982{
983 struct bpf_prog *prog;
984 int ret = -ENOTSUPP;
985
986 if (CHECK_ATTR(BPF_PROG_TEST_RUN))
987 return -EINVAL;
988
989 prog = bpf_prog_get(attr->test.prog_fd);
990 if (IS_ERR(prog))
991 return PTR_ERR(prog);
992
993 if (prog->aux->ops->test_run)
994 ret = prog->aux->ops->test_run(prog, attr, uattr);
995
996 bpf_prog_put(prog);
997 return ret;
998}
999
1031SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 1000SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
1032{ 1001{
1033 union bpf_attr attr = {}; 1002 union bpf_attr attr = {};
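
bpf_prog_test_run() only forwards to a per-program-type test_run callback (wired up elsewhere in this series for the networking program types), so the interesting part for user space is filling in the new attribute block. A fragment showing one invocation (assumes prog_fd is an already-loaded program of a type that implements test_run and that pkt_out is large enough for the possibly rewritten packet; field names follow the matching uapi change):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static __u64 ptr_to_u64(const void *p)
{
	return (__u64)(unsigned long)p;
}

static int test_run_once(int prog_fd, void *pkt_in, __u32 pkt_len,
			 void *pkt_out)
{
	union bpf_attr attr;
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.test.prog_fd      = prog_fd;
	attr.test.data_in      = ptr_to_u64(pkt_in);
	attr.test.data_size_in = pkt_len;
	attr.test.data_out     = ptr_to_u64(pkt_out);
	attr.test.repeat       = 1;		/* run it once */

	err = syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
	if (!err)
		printf("retval=%u duration=%uns out=%u bytes\n",
		       attr.test.retval, attr.test.duration,
		       attr.test.data_size_out);
	return err;
}
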
@@ -1094,7 +1063,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
1094 case BPF_OBJ_GET: 1063 case BPF_OBJ_GET:
1095 err = bpf_obj_get(&attr); 1064 err = bpf_obj_get(&attr);
1096 break; 1065 break;
1097
1098#ifdef CONFIG_CGROUP_BPF 1066#ifdef CONFIG_CGROUP_BPF
1099 case BPF_PROG_ATTACH: 1067 case BPF_PROG_ATTACH:
1100 err = bpf_prog_attach(&attr); 1068 err = bpf_prog_attach(&attr);
@@ -1103,7 +1071,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
1103 err = bpf_prog_detach(&attr); 1071 err = bpf_prog_detach(&attr);
1104 break; 1072 break;
1105#endif 1073#endif
1106 1074 case BPF_PROG_TEST_RUN:
1075 err = bpf_prog_test_run(&attr, uattr);
1076 break;
1107 default: 1077 default:
1108 err = -EINVAL; 1078 err = -EINVAL;
1109 break; 1079 break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a834068a400e..1eddb713b815 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -140,9 +140,11 @@ struct bpf_verifier_stack_elem {
140 struct bpf_verifier_stack_elem *next; 140 struct bpf_verifier_stack_elem *next;
141}; 141};
142 142
143#define BPF_COMPLEXITY_LIMIT_INSNS 65536 143#define BPF_COMPLEXITY_LIMIT_INSNS 98304
144#define BPF_COMPLEXITY_LIMIT_STACK 1024 144#define BPF_COMPLEXITY_LIMIT_STACK 1024
145 145
146#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
147
146struct bpf_call_arg_meta { 148struct bpf_call_arg_meta {
147 struct bpf_map *map_ptr; 149 struct bpf_map *map_ptr;
148 bool raw_mode; 150 bool raw_mode;
@@ -239,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state)
239 if (reg->max_value != BPF_REGISTER_MAX_RANGE) 241 if (reg->max_value != BPF_REGISTER_MAX_RANGE)
240 verbose(",max_value=%llu", 242 verbose(",max_value=%llu",
241 (unsigned long long)reg->max_value); 243 (unsigned long long)reg->max_value);
244 if (reg->min_align)
245 verbose(",min_align=%u", reg->min_align);
246 if (reg->aux_off)
247 verbose(",aux_off=%u", reg->aux_off);
248 if (reg->aux_off_align)
249 verbose(",aux_off_align=%u", reg->aux_off_align);
242 } 250 }
243 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { 251 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
244 if (state->stack_slot_type[i] == STACK_SPILL) 252 if (state->stack_slot_type[i] == STACK_SPILL)
@@ -296,7 +304,8 @@ static const char *const bpf_jmp_string[16] = {
296 [BPF_EXIT >> 4] = "exit", 304 [BPF_EXIT >> 4] = "exit",
297}; 305};
298 306
299static void print_bpf_insn(struct bpf_insn *insn) 307static void print_bpf_insn(const struct bpf_verifier_env *env,
308 const struct bpf_insn *insn)
300{ 309{
301 u8 class = BPF_CLASS(insn->code); 310 u8 class = BPF_CLASS(insn->code);
302 311
@@ -360,9 +369,19 @@ static void print_bpf_insn(struct bpf_insn *insn)
360 insn->code, 369 insn->code,
361 bpf_ldst_string[BPF_SIZE(insn->code) >> 3], 370 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
362 insn->src_reg, insn->imm); 371 insn->src_reg, insn->imm);
363 } else if (BPF_MODE(insn->code) == BPF_IMM) { 372 } else if (BPF_MODE(insn->code) == BPF_IMM &&
364 verbose("(%02x) r%d = 0x%x\n", 373 BPF_SIZE(insn->code) == BPF_DW) {
365 insn->code, insn->dst_reg, insn->imm); 374 /* At this point, we already made sure that the second
375 * part of the ldimm64 insn is accessible.
376 */
377 u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
378 bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
379
380 if (map_ptr && !env->allow_ptr_leaks)
381 imm = 0;
382
383 verbose("(%02x) r%d = 0x%llx\n", insn->code,
384 insn->dst_reg, (unsigned long long)imm);
366 } else { 385 } else {
367 verbose("BUG_ld_%02x\n", insn->code); 386 verbose("BUG_ld_%02x\n", insn->code);
368 return; 387 return;
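
A BPF_LD | BPF_IMM | BPF_DW instruction carries its 64-bit immediate split across two consecutive struct bpf_insn slots, which is why the printer above stitches insn->imm and (insn + 1)->imm back together (and blanks map pointers when pointer leaks are not allowed). A standalone check of that reconstruction (a sketch using the uapi struct bpf_insn; the sample value is arbitrary):

#include <linux/bpf.h>
#include <stdio.h>

int main(void)
{
	/* encode a ldimm64 pair by hand: low 32 bits in the first slot,
	 * high 32 bits in the second, as loader and verifier expect */
	__u64 value = 0x1122334475667788ULL;
	struct bpf_insn pair[2] = {
		{ .code = BPF_LD | BPF_DW | BPF_IMM, .dst_reg = BPF_REG_1,
		  .imm = (__s32)(value & 0xffffffff) },
		{ .imm = (__s32)(value >> 32) },
	};

	/* same reassembly as print_bpf_insn(); a real map reference would
	 * additionally set pair[0].src_reg = BPF_PSEUDO_MAP_FD */
	__u64 imm = ((__u64)pair[1].imm << 32) | (__u32)pair[0].imm;

	printf("0x%llx\n", (unsigned long long)imm);
	return imm != value;
}
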
@@ -453,6 +472,9 @@ static void init_reg_state(struct bpf_reg_state *regs)
453 regs[i].imm = 0; 472 regs[i].imm = 0;
454 regs[i].min_value = BPF_REGISTER_MIN_RANGE; 473 regs[i].min_value = BPF_REGISTER_MIN_RANGE;
455 regs[i].max_value = BPF_REGISTER_MAX_RANGE; 474 regs[i].max_value = BPF_REGISTER_MAX_RANGE;
475 regs[i].min_align = 0;
476 regs[i].aux_off = 0;
477 regs[i].aux_off_align = 0;
456 } 478 }
457 479
458 /* frame pointer */ 480 /* frame pointer */
@@ -479,6 +501,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
479{ 501{
480 regs[regno].min_value = BPF_REGISTER_MIN_RANGE; 502 regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
481 regs[regno].max_value = BPF_REGISTER_MAX_RANGE; 503 regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
504 regs[regno].min_align = 0;
482} 505}
483 506
484static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, 507static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
@@ -766,17 +789,33 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
766} 789}
767 790
768static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, 791static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
769 int off, int size) 792 int off, int size, bool strict)
770{ 793{
771 if (reg->id && size != 1) { 794 int ip_align;
772 verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n"); 795 int reg_off;
773 return -EACCES; 796
797 /* Byte size accesses are always allowed. */
798 if (!strict || size == 1)
799 return 0;
800
801 reg_off = reg->off;
802 if (reg->id) {
803 if (reg->aux_off_align % size) {
804 verbose("Packet access is only %u byte aligned, %d byte access not allowed\n",
805 reg->aux_off_align, size);
806 return -EACCES;
807 }
808 reg_off += reg->aux_off;
774 } 809 }
775 810
776 /* skb->data is NET_IP_ALIGN-ed */ 811 /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking
777 if ((NET_IP_ALIGN + reg->off + off) % size != 0) { 812 * we force this to 2 which is universally what architectures use
813 * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
814 */
815 ip_align = strict ? 2 : NET_IP_ALIGN;
816 if ((ip_align + reg_off + off) % size != 0) {
778 verbose("misaligned packet access off %d+%d+%d size %d\n", 817 verbose("misaligned packet access off %d+%d+%d size %d\n",
779 NET_IP_ALIGN, reg->off, off, size); 818 ip_align, reg_off, off, size);
780 return -EACCES; 819 return -EACCES;
781 } 820 }
782 821
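
Under strict mode the check no longer leans on NET_IP_ALIGN (which is 0 on architectures with efficient unaligned access) but models the 2-byte skb->data offset explicitly, and it folds in the variable part tracked in aux_off/aux_off_align. A standalone rendering of the arithmetic (a sketch; names mirror the function above):

#include <stdio.h>

/* mirrors check_pkt_ptr_alignment() in strict mode */
static int pkt_access_misaligned(int known_off, int aux_off,
				 unsigned int aux_off_align,
				 int access_off, int size, int has_var_off)
{
	int ip_align = 2;	/* strict stand-in for NET_IP_ALIGN */
	int reg_off = known_off;

	if (size == 1)
		return 0;			/* byte loads always OK */
	if (has_var_off) {
		if (aux_off_align % size)	/* variable part too weak */
			return 1;
		reg_off += aux_off;
	}
	return (ip_align + reg_off + access_off) % size != 0;
}

int main(void)
{
	/* e.g. a u16 load at skb->data + 14 (just past the Ethernet header)
	 * passes: 2 + 0 + 14 is a multiple of 2 */
	printf("%s\n", pkt_access_misaligned(0, 0, 0, 14, 2, 0) ?
	       "misaligned" : "ok");
	return 0;
}
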
@@ -784,9 +823,9 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
784} 823}
785 824
786static int check_val_ptr_alignment(const struct bpf_reg_state *reg, 825static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
787 int size) 826 int size, bool strict)
788{ 827{
789 if (size != 1) { 828 if (strict && size != 1) {
790 verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); 829 verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
791 return -EACCES; 830 return -EACCES;
792 } 831 }
@@ -794,16 +833,20 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
794 return 0; 833 return 0;
795} 834}
796 835
797static int check_ptr_alignment(const struct bpf_reg_state *reg, 836static int check_ptr_alignment(struct bpf_verifier_env *env,
837 const struct bpf_reg_state *reg,
798 int off, int size) 838 int off, int size)
799{ 839{
840 bool strict = env->strict_alignment;
841
842 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
843 strict = true;
844
800 switch (reg->type) { 845 switch (reg->type) {
801 case PTR_TO_PACKET: 846 case PTR_TO_PACKET:
802 return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : 847 return check_pkt_ptr_alignment(reg, off, size, strict);
803 check_pkt_ptr_alignment(reg, off, size);
804 case PTR_TO_MAP_VALUE_ADJ: 848 case PTR_TO_MAP_VALUE_ADJ:
805 return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : 849 return check_val_ptr_alignment(reg, size, strict);
806 check_val_ptr_alignment(reg, size);
807 default: 850 default:
808 if (off % size != 0) { 851 if (off % size != 0) {
809 verbose("misaligned access off %d size %d\n", 852 verbose("misaligned access off %d size %d\n",
@@ -836,7 +879,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
836 if (size < 0) 879 if (size < 0)
837 return size; 880 return size;
838 881
839 err = check_ptr_alignment(reg, off, size); 882 err = check_ptr_alignment(env, reg, off, size);
840 if (err) 883 if (err)
841 return err; 884 return err;
842 885
@@ -870,6 +913,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
870 value_regno); 913 value_regno);
871 /* note that reg.[id|off|range] == 0 */ 914 /* note that reg.[id|off|range] == 0 */
872 state->regs[value_regno].type = reg_type; 915 state->regs[value_regno].type = reg_type;
916 state->regs[value_regno].aux_off = 0;
917 state->regs[value_regno].aux_off_align = 0;
873 } 918 }
874 919
875 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { 920 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
@@ -1215,6 +1260,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
1215 func_id != BPF_FUNC_current_task_under_cgroup) 1260 func_id != BPF_FUNC_current_task_under_cgroup)
1216 goto error; 1261 goto error;
1217 break; 1262 break;
1263 case BPF_MAP_TYPE_ARRAY_OF_MAPS:
1264 case BPF_MAP_TYPE_HASH_OF_MAPS:
1265 if (func_id != BPF_FUNC_map_lookup_elem)
1266 goto error;
1218 default: 1267 default:
1219 break; 1268 break;
1220 } 1269 }
@@ -1291,7 +1340,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
1291 } 1340 }
1292} 1341}
1293 1342
1294static int check_call(struct bpf_verifier_env *env, int func_id) 1343static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1295{ 1344{
1296 struct bpf_verifier_state *state = &env->cur_state; 1345 struct bpf_verifier_state *state = &env->cur_state;
1297 const struct bpf_func_proto *fn = NULL; 1346 const struct bpf_func_proto *fn = NULL;
@@ -1375,6 +1424,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
1375 } else if (fn->ret_type == RET_VOID) { 1424 } else if (fn->ret_type == RET_VOID) {
1376 regs[BPF_REG_0].type = NOT_INIT; 1425 regs[BPF_REG_0].type = NOT_INIT;
1377 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { 1426 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
1427 struct bpf_insn_aux_data *insn_aux;
1428
1378 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; 1429 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
1379 regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; 1430 regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
1380 /* remember map_ptr, so that check_map_access() 1431 /* remember map_ptr, so that check_map_access()
@@ -1387,6 +1438,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
1387 } 1438 }
1388 regs[BPF_REG_0].map_ptr = meta.map_ptr; 1439 regs[BPF_REG_0].map_ptr = meta.map_ptr;
1389 regs[BPF_REG_0].id = ++env->id_gen; 1440 regs[BPF_REG_0].id = ++env->id_gen;
1441 insn_aux = &env->insn_aux_data[insn_idx];
1442 if (!insn_aux->map_ptr)
1443 insn_aux->map_ptr = meta.map_ptr;
1444 else if (insn_aux->map_ptr != meta.map_ptr)
1445 insn_aux->map_ptr = BPF_MAP_PTR_POISON;
1390 } else { 1446 } else {
1391 verbose("unknown return type %d of func %s#%d\n", 1447 verbose("unknown return type %d of func %s#%d\n",
1392 fn->ret_type, func_id_name(func_id), func_id); 1448 fn->ret_type, func_id_name(func_id), func_id);
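The insn_aux_data bookkeeping added here is what later lets fixup_bpf_calls() inline a lookup: a call site may only be specialised for a single map. A small illustrative sketch of the rule (the helper is hypothetical; the field names follow the diff):

static void record_lookup_site(struct bpf_insn_aux_data *aux,
                               struct bpf_map *map)
{
        if (!aux->map_ptr)
                aux->map_ptr = map;                     /* first map seen here */
        else if (aux->map_ptr != map)
                aux->map_ptr = BPF_MAP_PTR_POISON;      /* two different maps reach
                                                         * this call: never inline */
}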
@@ -1431,6 +1487,8 @@ add_imm:
1431 */ 1487 */
1432 dst_reg->off += imm; 1488 dst_reg->off += imm;
1433 } else { 1489 } else {
1490 bool had_id;
1491
1434 if (src_reg->type == PTR_TO_PACKET) { 1492 if (src_reg->type == PTR_TO_PACKET) {
1435 /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ 1493 /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
1436 tmp_reg = *dst_reg; /* save r7 state */ 1494 tmp_reg = *dst_reg; /* save r7 state */
@@ -1464,14 +1522,23 @@ add_imm:
1464 src_reg->imm); 1522 src_reg->imm);
1465 return -EACCES; 1523 return -EACCES;
1466 } 1524 }
1525
1526 had_id = (dst_reg->id != 0);
1527
1467 /* dst_reg stays as pkt_ptr type and since some positive 1528 /* dst_reg stays as pkt_ptr type and since some positive
1468 * integer value was added to the pointer, increment its 'id' 1529 * integer value was added to the pointer, increment its 'id'
1469 */ 1530 */
1470 dst_reg->id = ++env->id_gen; 1531 dst_reg->id = ++env->id_gen;
1471 1532
1472 /* something was added to pkt_ptr, set range and off to zero */ 1533 /* something was added to pkt_ptr, set range to zero */
1534 dst_reg->aux_off += dst_reg->off;
1473 dst_reg->off = 0; 1535 dst_reg->off = 0;
1474 dst_reg->range = 0; 1536 dst_reg->range = 0;
1537 if (had_id)
1538 dst_reg->aux_off_align = min(dst_reg->aux_off_align,
1539 src_reg->min_align);
1540 else
1541 dst_reg->aux_off_align = src_reg->min_align;
1475 } 1542 }
1476 return 0; 1543 return 0;
1477} 1544}
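A worked example of the new bookkeeping, using the register states from the diff's comments: if R7 is a packet pointer with off=14 and a register known to be a multiple of 4 (min_align=4, no prior id) is added to it, the constant 14 moves into aux_off, off and range reset to zero, and aux_off_align records the 4-byte guarantee. A compact sketch (hypothetical helper mirroring the hunk, not the kernel code):

static void fold_variable_pkt_add(struct bpf_reg_state *dst,
                                  u32 src_min_align, bool had_id)
{
        dst->aux_off += dst->off;       /* e.g. 14 moves into aux_off */
        dst->off = 0;
        dst->range = 0;                 /* packet range must be re-proven */
        if (had_id)                     /* several variable adds: keep the
                                         * weaker of the two guarantees */
                dst->aux_off_align = min(dst->aux_off_align, src_min_align);
        else
                dst->aux_off_align = src_min_align;
}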
@@ -1645,6 +1712,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg)
1645 reg->min_value = BPF_REGISTER_MIN_RANGE; 1712 reg->min_value = BPF_REGISTER_MIN_RANGE;
1646} 1713}
1647 1714
1715static u32 calc_align(u32 imm)
1716{
1717 if (!imm)
1718 return 1U << 31;
1719 return imm - ((imm - 1) & imm);
1720}
1721
1648static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, 1722static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1649 struct bpf_insn *insn) 1723 struct bpf_insn *insn)
1650{ 1724{
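calc_align() returns the largest power of two that divides imm, i.e. the alignment a constant is known to have: (imm - 1) & imm clears the lowest set bit, so the subtraction leaves exactly that bit. For example calc_align(6) == 2, calc_align(12) == 4, calc_align(64) == 64, and calc_align(0) is treated as "aligned to anything" (1 << 31). An equivalent one-liner using the common bit trick, valid for imm != 0:

static u32 lowest_set_bit(u32 imm)
{
        return imm & -imm;      /* same result as imm - ((imm - 1) & imm) */
}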
@@ -1652,8 +1726,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1652 s64 min_val = BPF_REGISTER_MIN_RANGE; 1726 s64 min_val = BPF_REGISTER_MIN_RANGE;
1653 u64 max_val = BPF_REGISTER_MAX_RANGE; 1727 u64 max_val = BPF_REGISTER_MAX_RANGE;
1654 u8 opcode = BPF_OP(insn->code); 1728 u8 opcode = BPF_OP(insn->code);
1729 u32 dst_align, src_align;
1655 1730
1656 dst_reg = &regs[insn->dst_reg]; 1731 dst_reg = &regs[insn->dst_reg];
1732 src_align = 0;
1657 if (BPF_SRC(insn->code) == BPF_X) { 1733 if (BPF_SRC(insn->code) == BPF_X) {
1658 check_reg_overflow(&regs[insn->src_reg]); 1734 check_reg_overflow(&regs[insn->src_reg]);
1659 min_val = regs[insn->src_reg].min_value; 1735 min_val = regs[insn->src_reg].min_value;
@@ -1669,12 +1745,18 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1669 regs[insn->src_reg].type != UNKNOWN_VALUE) { 1745 regs[insn->src_reg].type != UNKNOWN_VALUE) {
1670 min_val = BPF_REGISTER_MIN_RANGE; 1746 min_val = BPF_REGISTER_MIN_RANGE;
1671 max_val = BPF_REGISTER_MAX_RANGE; 1747 max_val = BPF_REGISTER_MAX_RANGE;
1748 src_align = 0;
1749 } else {
1750 src_align = regs[insn->src_reg].min_align;
1672 } 1751 }
1673 } else if (insn->imm < BPF_REGISTER_MAX_RANGE && 1752 } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
1674 (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { 1753 (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
1675 min_val = max_val = insn->imm; 1754 min_val = max_val = insn->imm;
1755 src_align = calc_align(insn->imm);
1676 } 1756 }
1677 1757
1758 dst_align = dst_reg->min_align;
1759
1678 /* We don't know anything about what was done to this register, mark it 1760 /* We don't know anything about what was done to this register, mark it
1679 * as unknown. 1761 * as unknown.
1680 */ 1762 */
@@ -1699,18 +1781,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1699 dst_reg->min_value += min_val; 1781 dst_reg->min_value += min_val;
1700 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1782 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1701 dst_reg->max_value += max_val; 1783 dst_reg->max_value += max_val;
1784 dst_reg->min_align = min(src_align, dst_align);
1702 break; 1785 break;
1703 case BPF_SUB: 1786 case BPF_SUB:
1704 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1787 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1705 dst_reg->min_value -= min_val; 1788 dst_reg->min_value -= min_val;
1706 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1789 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1707 dst_reg->max_value -= max_val; 1790 dst_reg->max_value -= max_val;
1791 dst_reg->min_align = min(src_align, dst_align);
1708 break; 1792 break;
1709 case BPF_MUL: 1793 case BPF_MUL:
1710 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1794 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1711 dst_reg->min_value *= min_val; 1795 dst_reg->min_value *= min_val;
1712 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1796 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1713 dst_reg->max_value *= max_val; 1797 dst_reg->max_value *= max_val;
1798 dst_reg->min_align = max(src_align, dst_align);
1714 break; 1799 break;
1715 case BPF_AND: 1800 case BPF_AND:
1716 /* Disallow AND'ing of negative numbers, ain't nobody got time 1801 /* Disallow AND'ing of negative numbers, ain't nobody got time
@@ -1722,17 +1807,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1722 else 1807 else
1723 dst_reg->min_value = 0; 1808 dst_reg->min_value = 0;
1724 dst_reg->max_value = max_val; 1809 dst_reg->max_value = max_val;
1810 dst_reg->min_align = max(src_align, dst_align);
1725 break; 1811 break;
1726 case BPF_LSH: 1812 case BPF_LSH:
1727 /* Gotta have special overflow logic here, if we're shifting 1813 /* Gotta have special overflow logic here, if we're shifting
1728 * more than MAX_RANGE then just assume we have an invalid 1814 * more than MAX_RANGE then just assume we have an invalid
1729 * range. 1815 * range.
1730 */ 1816 */
1731 if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) 1817 if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {
1732 dst_reg->min_value = BPF_REGISTER_MIN_RANGE; 1818 dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
1733 else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1819 dst_reg->min_align = 1;
1734 dst_reg->min_value <<= min_val; 1820 } else {
1735 1821 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1822 dst_reg->min_value <<= min_val;
1823 if (!dst_reg->min_align)
1824 dst_reg->min_align = 1;
1825 dst_reg->min_align <<= min_val;
1826 }
1736 if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) 1827 if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
1737 dst_reg->max_value = BPF_REGISTER_MAX_RANGE; 1828 dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
1738 else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1829 else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
@@ -1742,11 +1833,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1742 /* RSH by a negative number is undefined, and the BPF_RSH is an 1833 /* RSH by a negative number is undefined, and the BPF_RSH is an
1743 * unsigned shift, so make the appropriate casts. 1834 * unsigned shift, so make the appropriate casts.
1744 */ 1835 */
1745 if (min_val < 0 || dst_reg->min_value < 0) 1836 if (min_val < 0 || dst_reg->min_value < 0) {
1746 dst_reg->min_value = BPF_REGISTER_MIN_RANGE; 1837 dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
1747 else 1838 } else {
1748 dst_reg->min_value = 1839 dst_reg->min_value =
1749 (u64)(dst_reg->min_value) >> min_val; 1840 (u64)(dst_reg->min_value) >> min_val;
1841 }
1842 if (min_val < 0) {
1843 dst_reg->min_align = 1;
1844 } else {
1845 dst_reg->min_align >>= (u64) min_val;
1846 if (!dst_reg->min_align)
1847 dst_reg->min_align = 1;
1848 }
1750 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1849 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1751 dst_reg->max_value >>= max_val; 1850 dst_reg->max_value >>= max_val;
1752 break; 1851 break;
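The shift cases propagate min_align alongside the value range: a left shift can only improve the known alignment, a right shift can only weaken it, and anything suspicious (a possibly negative shift, or a shift past the tracked range) collapses it to 1. A hedged sketch of the two directions (illustrative helpers, not the kernel code):

static u32 align_after_lsh(u32 align, u32 shift)
{
        if (!align)
                align = 1;
        return align << shift;  /* e.g. 4-byte aligned << 3 => 32-byte aligned */
}

static u32 align_after_rsh(u32 align, u32 shift)
{
        align >>= shift;        /* e.g. 32 >> 5 => 0, clamped to 1: nothing known */
        return align ? align : 1;
}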
@@ -1848,6 +1947,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
1848 regs[insn->dst_reg].imm = insn->imm; 1947 regs[insn->dst_reg].imm = insn->imm;
1849 regs[insn->dst_reg].max_value = insn->imm; 1948 regs[insn->dst_reg].max_value = insn->imm;
1850 regs[insn->dst_reg].min_value = insn->imm; 1949 regs[insn->dst_reg].min_value = insn->imm;
1950 regs[insn->dst_reg].min_align = calc_align(insn->imm);
1851 } 1951 }
1852 1952
1853 } else if (opcode > BPF_END) { 1953 } else if (opcode > BPF_END) {
@@ -1911,6 +2011,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
1911 return 0; 2011 return 0;
1912 } else if (opcode == BPF_ADD && 2012 } else if (opcode == BPF_ADD &&
1913 BPF_CLASS(insn->code) == BPF_ALU64 && 2013 BPF_CLASS(insn->code) == BPF_ALU64 &&
2014 dst_reg->type == PTR_TO_STACK &&
2015 ((BPF_SRC(insn->code) == BPF_X &&
2016 regs[insn->src_reg].type == CONST_IMM) ||
2017 BPF_SRC(insn->code) == BPF_K)) {
2018 if (BPF_SRC(insn->code) == BPF_X)
2019 dst_reg->imm += regs[insn->src_reg].imm;
2020 else
2021 dst_reg->imm += insn->imm;
2022 return 0;
2023 } else if (opcode == BPF_ADD &&
2024 BPF_CLASS(insn->code) == BPF_ALU64 &&
1914 (dst_reg->type == PTR_TO_PACKET || 2025 (dst_reg->type == PTR_TO_PACKET ||
1915 (BPF_SRC(insn->code) == BPF_X && 2026 (BPF_SRC(insn->code) == BPF_X &&
1916 regs[insn->src_reg].type == PTR_TO_PACKET))) { 2027 regs[insn->src_reg].type == PTR_TO_PACKET))) {
@@ -2112,14 +2223,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
2112 struct bpf_reg_state *reg = &regs[regno]; 2223 struct bpf_reg_state *reg = &regs[regno];
2113 2224
2114 if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { 2225 if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
2115 reg->type = type; 2226 if (type == UNKNOWN_VALUE) {
2227 __mark_reg_unknown_value(regs, regno);
2228 } else if (reg->map_ptr->inner_map_meta) {
2229 reg->type = CONST_PTR_TO_MAP;
2230 reg->map_ptr = reg->map_ptr->inner_map_meta;
2231 } else {
2232 reg->type = type;
2233 }
2116 /* We don't need id from this point onwards anymore, thus we 2234 /* We don't need id from this point onwards anymore, thus we
2117 * should better reset it, so that state pruning has chances 2235 * should better reset it, so that state pruning has chances
2118 * to take effect. 2236 * to take effect.
2119 */ 2237 */
2120 reg->id = 0; 2238 reg->id = 0;
2121 if (type == UNKNOWN_VALUE)
2122 __mark_reg_unknown_value(regs, regno);
2123 } 2239 }
2124} 2240}
2125 2241
@@ -2524,6 +2640,7 @@ peek_stack:
2524 env->explored_states[t + 1] = STATE_LIST_MARK; 2640 env->explored_states[t + 1] = STATE_LIST_MARK;
2525 } else { 2641 } else {
2526 /* conditional jump with two edges */ 2642 /* conditional jump with two edges */
2643 env->explored_states[t] = STATE_LIST_MARK;
2527 ret = push_insn(t, t + 1, FALLTHROUGH, env); 2644 ret = push_insn(t, t + 1, FALLTHROUGH, env);
2528 if (ret == 1) 2645 if (ret == 1)
2529 goto peek_stack; 2646 goto peek_stack;
@@ -2682,6 +2799,12 @@ static bool states_equal(struct bpf_verifier_env *env,
2682 rcur->type != NOT_INIT)) 2799 rcur->type != NOT_INIT))
2683 continue; 2800 continue;
2684 2801
2802 /* Don't care about the reg->id in this case. */
2803 if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
2804 rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
2805 rold->map_ptr == rcur->map_ptr)
2806 continue;
2807
2685 if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && 2808 if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
2686 compare_ptrs_to_packet(rold, rcur)) 2809 compare_ptrs_to_packet(rold, rcur))
2687 continue; 2810 continue;
@@ -2816,15 +2939,22 @@ static int do_check(struct bpf_verifier_env *env)
2816 goto process_bpf_exit; 2939 goto process_bpf_exit;
2817 } 2940 }
2818 2941
2819 if (log_level && do_print_state) { 2942 if (need_resched())
2820 verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); 2943 cond_resched();
2944
2945 if (log_level > 1 || (log_level && do_print_state)) {
2946 if (log_level > 1)
2947 verbose("%d:", insn_idx);
2948 else
2949 verbose("\nfrom %d to %d:",
2950 prev_insn_idx, insn_idx);
2821 print_verifier_state(&env->cur_state); 2951 print_verifier_state(&env->cur_state);
2822 do_print_state = false; 2952 do_print_state = false;
2823 } 2953 }
2824 2954
2825 if (log_level) { 2955 if (log_level) {
2826 verbose("%d: ", insn_idx); 2956 verbose("%d: ", insn_idx);
2827 print_bpf_insn(insn); 2957 print_bpf_insn(env, insn);
2828 } 2958 }
2829 2959
2830 err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); 2960 err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
@@ -2960,7 +3090,7 @@ static int do_check(struct bpf_verifier_env *env)
2960 return -EINVAL; 3090 return -EINVAL;
2961 } 3091 }
2962 3092
2963 err = check_call(env, insn->imm); 3093 err = check_call(env, insn->imm, insn_idx);
2964 if (err) 3094 if (err)
2965 return err; 3095 return err;
2966 3096
@@ -3044,16 +3174,33 @@ process_bpf_exit:
3044 return 0; 3174 return 0;
3045} 3175}
3046 3176
3177static int check_map_prealloc(struct bpf_map *map)
3178{
3179 return (map->map_type != BPF_MAP_TYPE_HASH &&
3180 map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
3181 map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
3182 !(map->map_flags & BPF_F_NO_PREALLOC);
3183}
3184
3047static int check_map_prog_compatibility(struct bpf_map *map, 3185static int check_map_prog_compatibility(struct bpf_map *map,
3048 struct bpf_prog *prog) 3186 struct bpf_prog *prog)
3049 3187
3050{ 3188{
3051 if (prog->type == BPF_PROG_TYPE_PERF_EVENT && 3189 /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
3052 (map->map_type == BPF_MAP_TYPE_HASH || 3190 * preallocated hash maps, since doing memory allocation
3053 map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && 3191 * in overflow_handler can crash depending on where nmi got
3054 (map->map_flags & BPF_F_NO_PREALLOC)) { 3192 * triggered.
3055 verbose("perf_event programs can only use preallocated hash map\n"); 3193 */
3056 return -EINVAL; 3194 if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
3195 if (!check_map_prealloc(map)) {
3196 verbose("perf_event programs can only use preallocated hash map\n");
3197 return -EINVAL;
3198 }
3199 if (map->inner_map_meta &&
3200 !check_map_prealloc(map->inner_map_meta)) {
3201 verbose("perf_event programs can only use preallocated inner hash map\n");
3202 return -EINVAL;
3203 }
3057 } 3204 }
3058 return 0; 3205 return 0;
3059} 3206}
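The rule generalises the old two-type check: any hash-flavoured map (including the new HASH_OF_MAPS and the inner map it carries) must be preallocated before a perf_event program may use it, because element allocation from NMI context is unsafe. A hedged libbpf-sample-style sketch of an accepted and a rejected map for such a program (macro and struct names follow the common sample conventions, not this patch):

struct bpf_map_def SEC("maps") ok_for_perf = {
        .type        = BPF_MAP_TYPE_HASH,
        .key_size    = sizeof(__u64),
        .value_size  = sizeof(__u64),
        .max_entries = 1024,
        /* no BPF_F_NO_PREALLOC: all elements exist up front, NMI-safe */
};

struct bpf_map_def SEC("maps") rejected_for_perf = {
        .type        = BPF_MAP_TYPE_HASH,
        .key_size    = sizeof(__u64),
        .value_size  = sizeof(__u64),
        .max_entries = 1024,
        .map_flags   = BPF_F_NO_PREALLOC,       /* would allocate on update ->
                                                 * "can only use preallocated
                                                 * hash map" at load time */
};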
@@ -3182,6 +3329,41 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
3182 insn->src_reg = 0; 3329 insn->src_reg = 0;
3183} 3330}
3184 3331
3332/* single env->prog->insnsi[off] instruction was replaced with the range
 3333 * insnsi[off, off + cnt). Adjust corresponding insn_aux_data by copying
3334 * [0, off) and [off, end) to new locations, so the patched range stays zero
3335 */
3336static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
3337 u32 off, u32 cnt)
3338{
3339 struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
3340
3341 if (cnt == 1)
3342 return 0;
3343 new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
3344 if (!new_data)
3345 return -ENOMEM;
3346 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
3347 memcpy(new_data + off + cnt - 1, old_data + off,
3348 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
3349 env->insn_aux_data = new_data;
3350 vfree(old_data);
3351 return 0;
3352}
3353
3354static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
3355 const struct bpf_insn *patch, u32 len)
3356{
3357 struct bpf_prog *new_prog;
3358
3359 new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
3360 if (!new_prog)
3361 return NULL;
3362 if (adjust_insn_aux_data(env, new_prog->len, off, len))
3363 return NULL;
3364 return new_prog;
3365}
3366
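A worked example of the index arithmetic in adjust_insn_aux_data(): a 10-insn program whose insn 3 is replaced by a 4-insn patch ends up with prog_len = 13, and the aux array is rebuilt as:

        /* 10-insn program, insn 3 replaced by a 4-insn patch => prog_len = 13 */
        memcpy(new, old, 3 * sizeof(*new));             /* insns 0..2 unchanged  */
        memcpy(new + 3 + 4 - 1, old + 3,                /* old 3..9 -> new 6..12 */
               (13 - 3 - 4 + 1) * sizeof(*new));        /* = 7 entries           */
        /* new[3..5] stay zeroed: the aux data of the freshly inserted insns */

which is what the comment in the diff means by the patched range staying zero.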
3185/* convert load instructions that access fields of 'struct __sk_buff' 3367/* convert load instructions that access fields of 'struct __sk_buff'
3186 * into sequence of instructions that access fields of 'struct sk_buff' 3368 * into sequence of instructions that access fields of 'struct sk_buff'
3187 */ 3369 */
@@ -3201,10 +3383,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3201 verbose("bpf verifier is misconfigured\n"); 3383 verbose("bpf verifier is misconfigured\n");
3202 return -EINVAL; 3384 return -EINVAL;
3203 } else if (cnt) { 3385 } else if (cnt) {
3204 new_prog = bpf_patch_insn_single(env->prog, 0, 3386 new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
3205 insn_buf, cnt);
3206 if (!new_prog) 3387 if (!new_prog)
3207 return -ENOMEM; 3388 return -ENOMEM;
3389
3208 env->prog = new_prog; 3390 env->prog = new_prog;
3209 delta += cnt - 1; 3391 delta += cnt - 1;
3210 } 3392 }
@@ -3229,7 +3411,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3229 else 3411 else
3230 continue; 3412 continue;
3231 3413
3232 if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) 3414 if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
3233 continue; 3415 continue;
3234 3416
3235 cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); 3417 cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
@@ -3238,8 +3420,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3238 return -EINVAL; 3420 return -EINVAL;
3239 } 3421 }
3240 3422
3241 new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, 3423 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
3242 cnt);
3243 if (!new_prog) 3424 if (!new_prog)
3244 return -ENOMEM; 3425 return -ENOMEM;
3245 3426
@@ -3253,6 +3434,89 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3253 return 0; 3434 return 0;
3254} 3435}
3255 3436
3437/* fixup insn->imm field of bpf_call instructions
3438 * and inline eligible helpers as explicit sequence of BPF instructions
3439 *
3440 * this function is called after eBPF program passed verification
3441 */
3442static int fixup_bpf_calls(struct bpf_verifier_env *env)
3443{
3444 struct bpf_prog *prog = env->prog;
3445 struct bpf_insn *insn = prog->insnsi;
3446 const struct bpf_func_proto *fn;
3447 const int insn_cnt = prog->len;
3448 struct bpf_insn insn_buf[16];
3449 struct bpf_prog *new_prog;
3450 struct bpf_map *map_ptr;
3451 int i, cnt, delta = 0;
3452
3453 for (i = 0; i < insn_cnt; i++, insn++) {
3454 if (insn->code != (BPF_JMP | BPF_CALL))
3455 continue;
3456
3457 if (insn->imm == BPF_FUNC_get_route_realm)
3458 prog->dst_needed = 1;
3459 if (insn->imm == BPF_FUNC_get_prandom_u32)
3460 bpf_user_rnd_init_once();
3461 if (insn->imm == BPF_FUNC_tail_call) {
3462 /* If we tail call into other programs, we
3463 * cannot make any assumptions since they can
3464 * be replaced dynamically during runtime in
3465 * the program array.
3466 */
3467 prog->cb_access = 1;
3468
3469 /* mark bpf_tail_call as different opcode to avoid
 3470 * conditional branch in the interpreter for every normal
3471 * call and to prevent accidental JITing by JIT compiler
3472 * that doesn't support bpf_tail_call yet
3473 */
3474 insn->imm = 0;
3475 insn->code |= BPF_X;
3476 continue;
3477 }
3478
3479 if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) {
3480 map_ptr = env->insn_aux_data[i + delta].map_ptr;
3481 if (map_ptr == BPF_MAP_PTR_POISON ||
3482 !map_ptr->ops->map_gen_lookup)
3483 goto patch_call_imm;
3484
3485 cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
3486 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
3487 verbose("bpf verifier is misconfigured\n");
3488 return -EINVAL;
3489 }
3490
3491 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
3492 cnt);
3493 if (!new_prog)
3494 return -ENOMEM;
3495
3496 delta += cnt - 1;
3497
3498 /* keep walking new program and skip insns we just inserted */
3499 env->prog = prog = new_prog;
3500 insn = new_prog->insnsi + i + delta;
3501 continue;
3502 }
3503
3504patch_call_imm:
3505 fn = prog->aux->ops->get_func_proto(insn->imm);
3506 /* all functions that have prototype and verifier allowed
3507 * programs to call them, must be real in-kernel functions
3508 */
3509 if (!fn->func) {
3510 verbose("kernel subsystem misconfigured func %s#%d\n",
3511 func_id_name(insn->imm), insn->imm);
3512 return -EFAULT;
3513 }
3514 insn->imm = fn->func - __bpf_call_base;
3515 }
3516
3517 return 0;
3518}
3519
3256static void free_states(struct bpf_verifier_env *env) 3520static void free_states(struct bpf_verifier_env *env)
3257{ 3521{
3258 struct bpf_verifier_state_list *sl, *sln; 3522 struct bpf_verifier_state_list *sl, *sln;
@@ -3320,6 +3584,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
3320 } else { 3584 } else {
3321 log_level = 0; 3585 log_level = 0;
3322 } 3586 }
3587 if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT)
3588 env->strict_alignment = true;
3589 else
3590 env->strict_alignment = false;
3323 3591
3324 ret = replace_map_fd_with_map_ptr(env); 3592 ret = replace_map_fd_with_map_ptr(env);
3325 if (ret < 0) 3593 if (ret < 0)
@@ -3348,6 +3616,9 @@ skip_full_check:
3348 /* program is valid, convert *(u32*)(ctx + off) accesses */ 3616 /* program is valid, convert *(u32*)(ctx + off) accesses */
3349 ret = convert_ctx_accesses(env); 3617 ret = convert_ctx_accesses(env);
3350 3618
3619 if (ret == 0)
3620 ret = fixup_bpf_calls(env);
3621
3351 if (log_level && log_len >= log_size - 1) { 3622 if (log_level && log_len >= log_size - 1) {
3352 BUG_ON(log_len >= log_size); 3623 BUG_ON(log_len >= log_size);
3353 /* verifier log exceeded user supplied buffer */ 3624 /* verifier log exceeded user supplied buffer */
@@ -3422,6 +3693,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
3422 mutex_lock(&bpf_verifier_lock); 3693 mutex_lock(&bpf_verifier_lock);
3423 3694
3424 log_level = 0; 3695 log_level = 0;
3696 env->strict_alignment = false;
3425 3697
3426 env->explored_states = kcalloc(env->prog->len, 3698 env->explored_states = kcalloc(env->prog->len,
3427 sizeof(struct bpf_verifier_state_list *), 3699 sizeof(struct bpf_verifier_state_list *),
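From userspace, the stricter checking is opted into through prog_flags at load time. A hedged sketch of a raw bpf(2) loader setting BPF_F_STRICT_ALIGNMENT (assumes a uapi linux/bpf.h that already carries the new flag and the prog_flags member; error handling trimmed):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_prog_strict(const struct bpf_insn *insns, int insn_cnt,
                            char *log, unsigned int log_sz)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.prog_type  = BPF_PROG_TYPE_SOCKET_FILTER;
        attr.insns      = (unsigned long)insns;
        attr.insn_cnt   = insn_cnt;
        attr.license    = (unsigned long)"GPL";
        attr.log_buf    = (unsigned long)log;
        attr.log_size   = log_sz;
        attr.log_level  = 1;
        attr.prog_flags = BPF_F_STRICT_ALIGNMENT;       /* new in this series */

        return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}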
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 9203bfb05603..00f4d6bf048f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -5,6 +5,7 @@
5#include <linux/kernfs.h> 5#include <linux/kernfs.h>
6#include <linux/workqueue.h> 6#include <linux/workqueue.h>
7#include <linux/list.h> 7#include <linux/list.h>
8#include <linux/refcount.h>
8 9
9/* 10/*
10 * A cgroup can be associated with multiple css_sets as different tasks may 11 * A cgroup can be associated with multiple css_sets as different tasks may
@@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset)
134 * can see it. Similar to atomic_dec_and_lock(), but for an 135 * can see it. Similar to atomic_dec_and_lock(), but for an
135 * rwlock 136 * rwlock
136 */ 137 */
137 if (atomic_add_unless(&cset->refcount, -1, 1)) 138 if (refcount_dec_not_one(&cset->refcount))
138 return; 139 return;
139 140
140 spin_lock_irqsave(&css_set_lock, flags); 141 spin_lock_irqsave(&css_set_lock, flags);
@@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset)
147 */ 148 */
148static inline void get_css_set(struct css_set *cset) 149static inline void get_css_set(struct css_set *cset)
149{ 150{
150 atomic_inc(&cset->refcount); 151 refcount_inc(&cset->refcount);
151} 152}
152 153
153bool cgroup_ssid_enabled(int ssid); 154bool cgroup_ssid_enabled(int ssid);
@@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
163 164
164void cgroup_free_root(struct cgroup_root *root); 165void cgroup_free_root(struct cgroup_root *root);
165void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); 166void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
166int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); 167int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
167int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); 168int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
168struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, 169struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
169 struct cgroup_root *root, unsigned long magic, 170 struct cgroup_root *root, unsigned long magic,
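The atomic_t to refcount_t conversion keeps the same locking pattern; refcount_dec_not_one() is the saturating replacement for atomic_add_unless(&count, -1, 1). A minimal sketch of the pattern outside of cgroup (illustrative struct, not kernel code):

#include <linux/refcount.h>
#include <linux/spinlock.h>

struct obj {
        refcount_t      ref;
        spinlock_t      *lock;          /* serialises the final teardown */
};

static void obj_put(struct obj *o)
{
        unsigned long flags;

        /* fast path: drop the reference unless we would be the last holder */
        if (refcount_dec_not_one(&o->ref))
                return;

        /* slow path: take the lock first, so readers under the lock never
         * observe the object mid-destruction */
        spin_lock_irqsave(o->lock, flags);
        if (refcount_dec_and_test(&o->ref)) {
                /* unlink and free the object here */
        }
        spin_unlock_irqrestore(o->lock, flags);
}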
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1dc22f6b49f5..85d75152402d 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
346 346
347 spin_lock_irq(&css_set_lock); 347 spin_lock_irq(&css_set_lock);
348 list_for_each_entry(link, &cgrp->cset_links, cset_link) 348 list_for_each_entry(link, &cgrp->cset_links, cset_link)
349 count += atomic_read(&link->cset->refcount); 349 count += refcount_read(&link->cset->refcount);
350 spin_unlock_irq(&css_set_lock); 350 spin_unlock_irq(&css_set_lock);
351 return count; 351 return count;
352} 352}
@@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
1072 struct cgroup_subsys *ss; 1072 struct cgroup_subsys *ss;
1073 struct dentry *dentry; 1073 struct dentry *dentry;
1074 int i, ret; 1074 int i, ret;
1075 bool new_root = false;
1075 1076
1076 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); 1077 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1077 1078
@@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
1181 ret = -ENOMEM; 1182 ret = -ENOMEM;
1182 goto out_unlock; 1183 goto out_unlock;
1183 } 1184 }
1185 new_root = true;
1184 1186
1185 init_cgroup_root(root, &opts); 1187 init_cgroup_root(root, &opts);
1186 1188
1187 ret = cgroup_setup_root(root, opts.subsys_mask); 1189 ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
1188 if (ret) 1190 if (ret)
1189 cgroup_free_root(root); 1191 cgroup_free_root(root);
1190 1192
@@ -1201,6 +1203,18 @@ out_free:
1201 CGROUP_SUPER_MAGIC, ns); 1203 CGROUP_SUPER_MAGIC, ns);
1202 1204
1203 /* 1205 /*
1206 * There's a race window after we release cgroup_mutex and before
1207 * allocating a superblock. Make sure a concurrent process won't
1208 * be able to re-use the root during this window by delaying the
1209 * initialization of root refcnt.
1210 */
1211 if (new_root) {
1212 mutex_lock(&cgroup_mutex);
1213 percpu_ref_reinit(&root->cgrp.self.refcnt);
1214 mutex_unlock(&cgroup_mutex);
1215 }
1216
1217 /*
1204 * If @pinned_sb, we're reusing an existing root and holding an 1218 * If @pinned_sb, we're reusing an existing root and holding an
1205 * extra ref on its sb. Mount is complete. Put the extra ref. 1219 * extra ref on its sb. Mount is complete. Put the extra ref.
1206 */ 1220 */
@@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
1286 u64 count; 1300 u64 count;
1287 1301
1288 rcu_read_lock(); 1302 rcu_read_lock();
1289 count = atomic_read(&task_css_set(current)->refcount); 1303 count = refcount_read(&task_css_set(current)->refcount);
1290 rcu_read_unlock(); 1304 rcu_read_unlock();
1291 return count; 1305 return count;
1292} 1306}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 687f5e0194ef..c3c9a0e1b3c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly;
189 189
190/* cgroup namespace for init task */ 190/* cgroup namespace for init task */
191struct cgroup_namespace init_cgroup_ns = { 191struct cgroup_namespace init_cgroup_ns = {
192 .count = { .counter = 2, }, 192 .count = REFCOUNT_INIT(2),
193 .user_ns = &init_user_ns, 193 .user_ns = &init_user_ns,
194 .ns.ops = &cgroupns_operations, 194 .ns.ops = &cgroupns_operations,
195 .ns.inum = PROC_CGROUP_INIT_INO, 195 .ns.inum = PROC_CGROUP_INIT_INO,
@@ -436,7 +436,12 @@ out_unlock:
436 return css; 436 return css;
437} 437}
438 438
439static void cgroup_get(struct cgroup *cgrp) 439static void __maybe_unused cgroup_get(struct cgroup *cgrp)
440{
441 css_get(&cgrp->self);
442}
443
444static void cgroup_get_live(struct cgroup *cgrp)
440{ 445{
441 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 446 WARN_ON_ONCE(cgroup_is_dead(cgrp));
442 css_get(&cgrp->self); 447 css_get(&cgrp->self);
@@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css);
554 * haven't been created. 559 * haven't been created.
555 */ 560 */
556struct css_set init_css_set = { 561struct css_set init_css_set = {
557 .refcount = ATOMIC_INIT(1), 562 .refcount = REFCOUNT_INIT(1),
558 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 563 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
559 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 564 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
560 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), 565 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
@@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset)
724 729
725 lockdep_assert_held(&css_set_lock); 730 lockdep_assert_held(&css_set_lock);
726 731
727 if (!atomic_dec_and_test(&cset->refcount)) 732 if (!refcount_dec_and_test(&cset->refcount))
728 return; 733 return;
729 734
730 /* This css_set is dead. unlink it and release cgroup and css refs */ 735 /* This css_set is dead. unlink it and release cgroup and css refs */
@@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
932 list_add_tail(&link->cgrp_link, &cset->cgrp_links); 937 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
933 938
934 if (cgroup_parent(cgrp)) 939 if (cgroup_parent(cgrp))
935 cgroup_get(cgrp); 940 cgroup_get_live(cgrp);
936} 941}
937 942
938/** 943/**
@@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
977 return NULL; 982 return NULL;
978 } 983 }
979 984
980 atomic_set(&cset->refcount, 1); 985 refcount_set(&cset->refcount, 1);
981 INIT_LIST_HEAD(&cset->tasks); 986 INIT_LIST_HEAD(&cset->tasks);
982 INIT_LIST_HEAD(&cset->mg_tasks); 987 INIT_LIST_HEAD(&cset->mg_tasks);
983 INIT_LIST_HEAD(&cset->task_iters); 988 INIT_LIST_HEAD(&cset->task_iters);
@@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1640 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1645 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1641} 1646}
1642 1647
1643int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) 1648int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
1644{ 1649{
1645 LIST_HEAD(tmp_links); 1650 LIST_HEAD(tmp_links);
1646 struct cgroup *root_cgrp = &root->cgrp; 1651 struct cgroup *root_cgrp = &root->cgrp;
@@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
1656 root_cgrp->id = ret; 1661 root_cgrp->id = ret;
1657 root_cgrp->ancestor_ids[0] = ret; 1662 root_cgrp->ancestor_ids[0] = ret;
1658 1663
1659 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, 1664 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
1660 GFP_KERNEL); 1665 ref_flags, GFP_KERNEL);
1661 if (ret) 1666 if (ret)
1662 goto out; 1667 goto out;
1663 1668
@@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1802 return ERR_PTR(-EINVAL); 1807 return ERR_PTR(-EINVAL);
1803 } 1808 }
1804 cgrp_dfl_visible = true; 1809 cgrp_dfl_visible = true;
1805 cgroup_get(&cgrp_dfl_root.cgrp); 1810 cgroup_get_live(&cgrp_dfl_root.cgrp);
1806 1811
1807 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, 1812 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
1808 CGROUP2_SUPER_MAGIC, ns); 1813 CGROUP2_SUPER_MAGIC, ns);
@@ -2576,7 +2581,7 @@ restart:
2576 if (!css || !percpu_ref_is_dying(&css->refcnt)) 2581 if (!css || !percpu_ref_is_dying(&css->refcnt))
2577 continue; 2582 continue;
2578 2583
2579 cgroup_get(dsct); 2584 cgroup_get_live(dsct);
2580 prepare_to_wait(&dsct->offline_waitq, &wait, 2585 prepare_to_wait(&dsct->offline_waitq, &wait,
2581 TASK_UNINTERRUPTIBLE); 2586 TASK_UNINTERRUPTIBLE);
2582 2587
@@ -3947,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
3947{ 3952{
3948 lockdep_assert_held(&cgroup_mutex); 3953 lockdep_assert_held(&cgroup_mutex);
3949 3954
3950 cgroup_get(cgrp); 3955 cgroup_get_live(cgrp);
3951 3956
3952 memset(css, 0, sizeof(*css)); 3957 memset(css, 0, sizeof(*css));
3953 css->cgroup = cgrp; 3958 css->cgroup = cgrp;
@@ -4123,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
4123 /* allocation complete, commit to creation */ 4128 /* allocation complete, commit to creation */
4124 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); 4129 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
4125 atomic_inc(&root->nr_cgrps); 4130 atomic_inc(&root->nr_cgrps);
4126 cgroup_get(parent); 4131 cgroup_get_live(parent);
4127 4132
4128 /* 4133 /*
4129 * @cgrp is now fully operational. If something fails after this 4134 * @cgrp is now fully operational. If something fails after this
@@ -4513,7 +4518,7 @@ int __init cgroup_init(void)
4513 hash_add(css_set_table, &init_css_set.hlist, 4518 hash_add(css_set_table, &init_css_set.hlist,
4514 css_set_hash(init_css_set.subsys)); 4519 css_set_hash(init_css_set.subsys));
4515 4520
4516 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4521 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
4517 4522
4518 mutex_unlock(&cgroup_mutex); 4523 mutex_unlock(&cgroup_mutex);
4519 4524
@@ -4947,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path)
4947 if (kn) { 4952 if (kn) {
4948 if (kernfs_type(kn) == KERNFS_DIR) { 4953 if (kernfs_type(kn) == KERNFS_DIR) {
4949 cgrp = kn->priv; 4954 cgrp = kn->priv;
4950 cgroup_get(cgrp); 4955 cgroup_get_live(cgrp);
4951 } else { 4956 } else {
4952 cgrp = ERR_PTR(-ENOTDIR); 4957 cgrp = ERR_PTR(-ENOTDIR);
4953 } 4958 }
@@ -5027,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
5027 5032
5028 /* Socket clone path */ 5033 /* Socket clone path */
5029 if (skcd->val) { 5034 if (skcd->val) {
5035 /*
5036 * We might be cloning a socket which is left in an empty
5037 * cgroup and the cgroup might have already been rmdir'd.
5038 * Don't use cgroup_get_live().
5039 */
5030 cgroup_get(sock_cgroup_ptr(skcd)); 5040 cgroup_get(sock_cgroup_ptr(skcd));
5031 return; 5041 return;
5032 } 5042 }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f41292be0fb..f6501f4f6040 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2121,10 +2121,8 @@ int __init cpuset_init(void)
2121{ 2121{
2122 int err = 0; 2122 int err = 0;
2123 2123
2124 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) 2124 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2125 BUG(); 2125 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2126 if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2127 BUG();
2128 2126
2129 cpumask_setall(top_cpuset.cpus_allowed); 2127 cpumask_setall(top_cpuset.cpus_allowed);
2130 nodes_setall(top_cpuset.mems_allowed); 2128 nodes_setall(top_cpuset.mems_allowed);
@@ -2139,8 +2137,7 @@ int __init cpuset_init(void)
2139 if (err < 0) 2137 if (err < 0)
2140 return err; 2138 return err;
2141 2139
2142 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) 2140 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
2143 BUG();
2144 2141
2145 return 0; 2142 return 0;
2146} 2143}
@@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2354 rebuild_sched_domains(); 2351 rebuild_sched_domains();
2355} 2352}
2356 2353
2357void cpuset_update_active_cpus(bool cpu_online) 2354void cpuset_update_active_cpus(void)
2358{ 2355{
2359 /* 2356 /*
2360 * We're inside cpu hotplug critical region which usually nests 2357 * We're inside cpu hotplug critical region which usually nests
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 96d38dab6fb2..66129eb4371d 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
31 kfree(new_ns); 31 kfree(new_ns);
32 return ERR_PTR(ret); 32 return ERR_PTR(ret);
33 } 33 }
34 atomic_set(&new_ns->count, 1); 34 refcount_set(&new_ns->count, 1);
35 new_ns->ns.ops = &cgroupns_operations; 35 new_ns->ns.ops = &cgroupns_operations;
36 return new_ns; 36 return new_ns;
37} 37}
diff --git a/kernel/compat.c b/kernel/compat.c
index 19aec5d98108..933bcb31ae10 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
108COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, 108COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
109 struct timezone __user *, tz) 109 struct timezone __user *, tz)
110{ 110{
111 struct timespec64 new_ts;
111 struct timeval user_tv; 112 struct timeval user_tv;
112 struct timespec new_ts;
113 struct timezone new_tz; 113 struct timezone new_tz;
114 114
115 if (tv) { 115 if (tv) {
@@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
123 return -EFAULT; 123 return -EFAULT;
124 } 124 }
125 125
126 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 126 return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
127} 127}
128 128
129static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) 129static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
@@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
240 struct compat_timespec __user *, rmtp) 240 struct compat_timespec __user *, rmtp)
241{ 241{
242 struct timespec tu, rmt; 242 struct timespec tu, rmt;
243 struct timespec64 tu64;
243 mm_segment_t oldfs; 244 mm_segment_t oldfs;
244 long ret; 245 long ret;
245 246
246 if (compat_get_timespec(&tu, rqtp)) 247 if (compat_get_timespec(&tu, rqtp))
247 return -EFAULT; 248 return -EFAULT;
248 249
249 if (!timespec_valid(&tu)) 250 tu64 = timespec_to_timespec64(tu);
251 if (!timespec64_valid(&tu64))
250 return -EINVAL; 252 return -EINVAL;
251 253
252 oldfs = get_fs(); 254 oldfs = get_fs();
253 set_fs(KERNEL_DS); 255 set_fs(KERNEL_DS);
254 ret = hrtimer_nanosleep(&tu, 256 ret = hrtimer_nanosleep(&tu64,
255 rmtp ? (struct timespec __user *)&rmt : NULL, 257 rmtp ? (struct timespec __user *)&rmt : NULL,
256 HRTIMER_MODE_REL, CLOCK_MONOTONIC); 258 HRTIMER_MODE_REL, CLOCK_MONOTONIC);
257 set_fs(oldfs); 259 set_fs(oldfs);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 37b223e4fc05..9ae6fbe5b5cf 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
1125 1125
1126#endif /* CONFIG_PM_SLEEP_SMP */ 1126#endif /* CONFIG_PM_SLEEP_SMP */
1127 1127
1128int __boot_cpu_id;
1129
1128#endif /* CONFIG_SMP */ 1130#endif /* CONFIG_SMP */
1129 1131
1130/* Boot processor state steps */ 1132/* Boot processor state steps */
@@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void)
1815 set_cpu_active(cpu, true); 1817 set_cpu_active(cpu, true);
1816 set_cpu_present(cpu, true); 1818 set_cpu_present(cpu, true);
1817 set_cpu_possible(cpu, true); 1819 set_cpu_possible(cpu, true);
1820
1821#ifdef CONFIG_SMP
1822 __boot_cpu_id = cpu;
1823#endif
1818} 1824}
1819 1825
1820/* 1826/*
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
new file mode 100644
index 000000000000..fcbd568f1e95
--- /dev/null
+++ b/kernel/crash_core.c
@@ -0,0 +1,439 @@
1/*
2 * crash.c - kernel crash support code.
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/crash_core.h>
10#include <linux/utsname.h>
11#include <linux/vmalloc.h>
12
13#include <asm/page.h>
14#include <asm/sections.h>
15
16/* vmcoreinfo stuff */
17static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
18u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
19size_t vmcoreinfo_size;
20size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
21
22/*
23 * parsing the "crashkernel" commandline
24 *
25 * this code is intended to be called from architecture specific code
26 */
27
28
29/*
30 * This function parses command lines in the format
31 *
32 * crashkernel=ramsize-range:size[,...][@offset]
33 *
34 * The function returns 0 on success and -EINVAL on failure.
35 */
36static int __init parse_crashkernel_mem(char *cmdline,
37 unsigned long long system_ram,
38 unsigned long long *crash_size,
39 unsigned long long *crash_base)
40{
41 char *cur = cmdline, *tmp;
42
43 /* for each entry of the comma-separated list */
44 do {
45 unsigned long long start, end = ULLONG_MAX, size;
46
47 /* get the start of the range */
48 start = memparse(cur, &tmp);
49 if (cur == tmp) {
50 pr_warn("crashkernel: Memory value expected\n");
51 return -EINVAL;
52 }
53 cur = tmp;
54 if (*cur != '-') {
55 pr_warn("crashkernel: '-' expected\n");
56 return -EINVAL;
57 }
58 cur++;
59
 60 /* if no ':' is here, then we read the end */
61 if (*cur != ':') {
62 end = memparse(cur, &tmp);
63 if (cur == tmp) {
64 pr_warn("crashkernel: Memory value expected\n");
65 return -EINVAL;
66 }
67 cur = tmp;
68 if (end <= start) {
69 pr_warn("crashkernel: end <= start\n");
70 return -EINVAL;
71 }
72 }
73
74 if (*cur != ':') {
75 pr_warn("crashkernel: ':' expected\n");
76 return -EINVAL;
77 }
78 cur++;
79
80 size = memparse(cur, &tmp);
81 if (cur == tmp) {
82 pr_warn("Memory value expected\n");
83 return -EINVAL;
84 }
85 cur = tmp;
86 if (size >= system_ram) {
87 pr_warn("crashkernel: invalid size\n");
88 return -EINVAL;
89 }
90
91 /* match ? */
92 if (system_ram >= start && system_ram < end) {
93 *crash_size = size;
94 break;
95 }
96 } while (*cur++ == ',');
97
98 if (*crash_size > 0) {
99 while (*cur && *cur != ' ' && *cur != '@')
100 cur++;
101 if (*cur == '@') {
102 cur++;
103 *crash_base = memparse(cur, &tmp);
104 if (cur == tmp) {
105 pr_warn("Memory value expected after '@'\n");
106 return -EINVAL;
107 }
108 }
109 }
110
111 return 0;
112}
113
114/*
115 * That function parses "simple" (old) crashkernel command lines like
116 *
117 * crashkernel=size[@offset]
118 *
119 * It returns 0 on success and -EINVAL on failure.
120 */
121static int __init parse_crashkernel_simple(char *cmdline,
122 unsigned long long *crash_size,
123 unsigned long long *crash_base)
124{
125 char *cur = cmdline;
126
127 *crash_size = memparse(cmdline, &cur);
128 if (cmdline == cur) {
129 pr_warn("crashkernel: memory value expected\n");
130 return -EINVAL;
131 }
132
133 if (*cur == '@')
134 *crash_base = memparse(cur+1, &cur);
135 else if (*cur != ' ' && *cur != '\0') {
136 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
137 return -EINVAL;
138 }
139
140 return 0;
141}
142
143#define SUFFIX_HIGH 0
144#define SUFFIX_LOW 1
145#define SUFFIX_NULL 2
146static __initdata char *suffix_tbl[] = {
147 [SUFFIX_HIGH] = ",high",
148 [SUFFIX_LOW] = ",low",
149 [SUFFIX_NULL] = NULL,
150};
151
152/*
 153 * This function parses "suffix" crashkernel command lines like
154 *
155 * crashkernel=size,[high|low]
156 *
157 * It returns 0 on success and -EINVAL on failure.
158 */
159static int __init parse_crashkernel_suffix(char *cmdline,
160 unsigned long long *crash_size,
161 const char *suffix)
162{
163 char *cur = cmdline;
164
165 *crash_size = memparse(cmdline, &cur);
166 if (cmdline == cur) {
167 pr_warn("crashkernel: memory value expected\n");
168 return -EINVAL;
169 }
170
171 /* check with suffix */
172 if (strncmp(cur, suffix, strlen(suffix))) {
173 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
174 return -EINVAL;
175 }
176 cur += strlen(suffix);
177 if (*cur != ' ' && *cur != '\0') {
178 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
179 return -EINVAL;
180 }
181
182 return 0;
183}
184
185static __init char *get_last_crashkernel(char *cmdline,
186 const char *name,
187 const char *suffix)
188{
189 char *p = cmdline, *ck_cmdline = NULL;
190
191 /* find crashkernel and use the last one if there are more */
192 p = strstr(p, name);
193 while (p) {
194 char *end_p = strchr(p, ' ');
195 char *q;
196
197 if (!end_p)
198 end_p = p + strlen(p);
199
200 if (!suffix) {
201 int i;
202
203 /* skip the one with any known suffix */
204 for (i = 0; suffix_tbl[i]; i++) {
205 q = end_p - strlen(suffix_tbl[i]);
206 if (!strncmp(q, suffix_tbl[i],
207 strlen(suffix_tbl[i])))
208 goto next;
209 }
210 ck_cmdline = p;
211 } else {
212 q = end_p - strlen(suffix);
213 if (!strncmp(q, suffix, strlen(suffix)))
214 ck_cmdline = p;
215 }
216next:
217 p = strstr(p+1, name);
218 }
219
220 if (!ck_cmdline)
221 return NULL;
222
223 return ck_cmdline;
224}
225
226static int __init __parse_crashkernel(char *cmdline,
227 unsigned long long system_ram,
228 unsigned long long *crash_size,
229 unsigned long long *crash_base,
230 const char *name,
231 const char *suffix)
232{
233 char *first_colon, *first_space;
234 char *ck_cmdline;
235
236 BUG_ON(!crash_size || !crash_base);
237 *crash_size = 0;
238 *crash_base = 0;
239
240 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
241
242 if (!ck_cmdline)
243 return -EINVAL;
244
245 ck_cmdline += strlen(name);
246
247 if (suffix)
248 return parse_crashkernel_suffix(ck_cmdline, crash_size,
249 suffix);
250 /*
251 * if the commandline contains a ':', then that's the extended
252 * syntax -- if not, it must be the classic syntax
253 */
254 first_colon = strchr(ck_cmdline, ':');
255 first_space = strchr(ck_cmdline, ' ');
256 if (first_colon && (!first_space || first_colon < first_space))
257 return parse_crashkernel_mem(ck_cmdline, system_ram,
258 crash_size, crash_base);
259
260 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
261}
262
263/*
 264 * This function is the entry point for command line parsing and should be
265 * called from the arch-specific code.
266 */
267int __init parse_crashkernel(char *cmdline,
268 unsigned long long system_ram,
269 unsigned long long *crash_size,
270 unsigned long long *crash_base)
271{
272 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
273 "crashkernel=", NULL);
274}
275
276int __init parse_crashkernel_high(char *cmdline,
277 unsigned long long system_ram,
278 unsigned long long *crash_size,
279 unsigned long long *crash_base)
280{
281 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
282 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
283}
284
285int __init parse_crashkernel_low(char *cmdline,
286 unsigned long long system_ram,
287 unsigned long long *crash_size,
288 unsigned long long *crash_base)
289{
290 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
291 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
292}
293
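A few worked examples of the command lines these parsers accept (values are illustrative); note that get_last_crashkernel() means the last matching crashkernel= option on the command line wins:

        crashkernel=128M                  -> simple syntax: reserve 128M anywhere
        crashkernel=128M@16M              -> simple syntax with a fixed base address
        crashkernel=512M-2G:64M,2G-:128M  -> range syntax: a machine with 4G of RAM
                                             matches the "2G-" entry, reserving 128M
        crashkernel=256M,high             -> suffix syntax, handled when the arch
                                             calls parse_crashkernel_high()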
294Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
295 void *data, size_t data_len)
296{
297 struct elf_note *note = (struct elf_note *)buf;
298
299 note->n_namesz = strlen(name) + 1;
300 note->n_descsz = data_len;
301 note->n_type = type;
302 buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
303 memcpy(buf, name, note->n_namesz);
304 buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
305 memcpy(buf, data, data_len);
306 buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
307
308 return buf;
309}
310
311void final_note(Elf_Word *buf)
312{
313 memset(buf, 0, sizeof(struct elf_note));
314}
315
316static void update_vmcoreinfo_note(void)
317{
318 u32 *buf = vmcoreinfo_note;
319
320 if (!vmcoreinfo_size)
321 return;
322 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
323 vmcoreinfo_size);
324 final_note(buf);
325}
326
327void crash_save_vmcoreinfo(void)
328{
329 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
330 update_vmcoreinfo_note();
331}
332
333void vmcoreinfo_append_str(const char *fmt, ...)
334{
335 va_list args;
336 char buf[0x50];
337 size_t r;
338
339 va_start(args, fmt);
340 r = vscnprintf(buf, sizeof(buf), fmt, args);
341 va_end(args);
342
343 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
344
345 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
346
347 vmcoreinfo_size += r;
348}
349
350/*
351 * provide an empty default implementation here -- architecture
352 * code may override this
353 */
354void __weak arch_crash_save_vmcoreinfo(void)
355{}
356
357phys_addr_t __weak paddr_vmcoreinfo_note(void)
358{
359 return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
360}
361
362static int __init crash_save_vmcoreinfo_init(void)
363{
364 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
365 VMCOREINFO_PAGESIZE(PAGE_SIZE);
366
367 VMCOREINFO_SYMBOL(init_uts_ns);
368 VMCOREINFO_SYMBOL(node_online_map);
369#ifdef CONFIG_MMU
370 VMCOREINFO_SYMBOL(swapper_pg_dir);
371#endif
372 VMCOREINFO_SYMBOL(_stext);
373 VMCOREINFO_SYMBOL(vmap_area_list);
374
375#ifndef CONFIG_NEED_MULTIPLE_NODES
376 VMCOREINFO_SYMBOL(mem_map);
377 VMCOREINFO_SYMBOL(contig_page_data);
378#endif
379#ifdef CONFIG_SPARSEMEM
380 VMCOREINFO_SYMBOL(mem_section);
381 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
382 VMCOREINFO_STRUCT_SIZE(mem_section);
383 VMCOREINFO_OFFSET(mem_section, section_mem_map);
384#endif
385 VMCOREINFO_STRUCT_SIZE(page);
386 VMCOREINFO_STRUCT_SIZE(pglist_data);
387 VMCOREINFO_STRUCT_SIZE(zone);
388 VMCOREINFO_STRUCT_SIZE(free_area);
389 VMCOREINFO_STRUCT_SIZE(list_head);
390 VMCOREINFO_SIZE(nodemask_t);
391 VMCOREINFO_OFFSET(page, flags);
392 VMCOREINFO_OFFSET(page, _refcount);
393 VMCOREINFO_OFFSET(page, mapping);
394 VMCOREINFO_OFFSET(page, lru);
395 VMCOREINFO_OFFSET(page, _mapcount);
396 VMCOREINFO_OFFSET(page, private);
397 VMCOREINFO_OFFSET(page, compound_dtor);
398 VMCOREINFO_OFFSET(page, compound_order);
399 VMCOREINFO_OFFSET(page, compound_head);
400 VMCOREINFO_OFFSET(pglist_data, node_zones);
401 VMCOREINFO_OFFSET(pglist_data, nr_zones);
402#ifdef CONFIG_FLAT_NODE_MEM_MAP
403 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
404#endif
405 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
406 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
407 VMCOREINFO_OFFSET(pglist_data, node_id);
408 VMCOREINFO_OFFSET(zone, free_area);
409 VMCOREINFO_OFFSET(zone, vm_stat);
410 VMCOREINFO_OFFSET(zone, spanned_pages);
411 VMCOREINFO_OFFSET(free_area, free_list);
412 VMCOREINFO_OFFSET(list_head, next);
413 VMCOREINFO_OFFSET(list_head, prev);
414 VMCOREINFO_OFFSET(vmap_area, va_start);
415 VMCOREINFO_OFFSET(vmap_area, list);
416 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
417 log_buf_vmcoreinfo_setup();
418 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
419 VMCOREINFO_NUMBER(NR_FREE_PAGES);
420 VMCOREINFO_NUMBER(PG_lru);
421 VMCOREINFO_NUMBER(PG_private);
422 VMCOREINFO_NUMBER(PG_swapcache);
423 VMCOREINFO_NUMBER(PG_slab);
424#ifdef CONFIG_MEMORY_FAILURE
425 VMCOREINFO_NUMBER(PG_hwpoison);
426#endif
427 VMCOREINFO_NUMBER(PG_head_mask);
428 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
429#ifdef CONFIG_HUGETLB_PAGE
430 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
431#endif
432
433 arch_crash_save_vmcoreinfo();
434 update_vmcoreinfo_note();
435
436 return 0;
437}
438
439subsys_initcall(crash_save_vmcoreinfo_init);
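For reference, the buffer built by append_elf_note() and update_vmcoreinfo_note() above follows the standard ELF note layout; roughly (contents are illustrative, sizes depend on the running kernel):

        struct elf_note { n_namesz = 11, n_descsz = vmcoreinfo_size, n_type = 0 }
        "VMCOREINFO\0"      padded up to the next Elf_Word (4-byte) boundary
        vmcoreinfo_data[]   e.g. "OSRELEASE=4.12.0\nPAGESIZE=4096\n
                            SYMBOL(init_uts_ns)=ffffffff...\n" ...
        all-zero elf_note   written by final_note() to terminate the note list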
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c04917cad1bf..1b2be63c8528 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -229,12 +229,18 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
229 } 229 }
230 230
231 if (regs) { 231 if (regs) {
232 mm_segment_t fs;
233
232 if (crosstask) 234 if (crosstask)
233 goto exit_put; 235 goto exit_put;
234 236
235 if (add_mark) 237 if (add_mark)
236 perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); 238 perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
239
240 fs = get_fs();
241 set_fs(USER_DS);
237 perf_callchain_user(&ctx, regs); 242 perf_callchain_user(&ctx, regs);
243 set_fs(fs);
238 } 244 }
239 } 245 }
240 246
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ff01cba86f43..6e75a5c9412d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -48,6 +48,8 @@
48#include <linux/parser.h> 48#include <linux/parser.h>
49#include <linux/sched/clock.h> 49#include <linux/sched/clock.h>
50#include <linux/sched/mm.h> 50#include <linux/sched/mm.h>
51#include <linux/proc_ns.h>
52#include <linux/mount.h>
51 53
52#include "internal.h" 54#include "internal.h"
53 55
@@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
379 381
380static atomic_t nr_mmap_events __read_mostly; 382static atomic_t nr_mmap_events __read_mostly;
381static atomic_t nr_comm_events __read_mostly; 383static atomic_t nr_comm_events __read_mostly;
384static atomic_t nr_namespaces_events __read_mostly;
382static atomic_t nr_task_events __read_mostly; 385static atomic_t nr_task_events __read_mostly;
383static atomic_t nr_freq_events __read_mostly; 386static atomic_t nr_freq_events __read_mostly;
384static atomic_t nr_switch_events __read_mostly; 387static atomic_t nr_switch_events __read_mostly;
@@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event)
3991 atomic_dec(&nr_mmap_events); 3994 atomic_dec(&nr_mmap_events);
3992 if (event->attr.comm) 3995 if (event->attr.comm)
3993 atomic_dec(&nr_comm_events); 3996 atomic_dec(&nr_comm_events);
3997 if (event->attr.namespaces)
3998 atomic_dec(&nr_namespaces_events);
3994 if (event->attr.task) 3999 if (event->attr.task)
3995 atomic_dec(&nr_task_events); 4000 atomic_dec(&nr_task_events);
3996 if (event->attr.freq) 4001 if (event->attr.freq)
@@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task,
6491void perf_event_fork(struct task_struct *task) 6496void perf_event_fork(struct task_struct *task)
6492{ 6497{
6493 perf_event_task(task, NULL, 1); 6498 perf_event_task(task, NULL, 1);
6499 perf_event_namespaces(task);
6494} 6500}
6495 6501
6496/* 6502/*
@@ -6593,6 +6599,132 @@ void perf_event_comm(struct task_struct *task, bool exec)
6593} 6599}
6594 6600
6595/* 6601/*
6602 * namespaces tracking
6603 */
6604
6605struct perf_namespaces_event {
6606 struct task_struct *task;
6607
6608 struct {
6609 struct perf_event_header header;
6610
6611 u32 pid;
6612 u32 tid;
6613 u64 nr_namespaces;
6614 struct perf_ns_link_info link_info[NR_NAMESPACES];
6615 } event_id;
6616};
6617
6618static int perf_event_namespaces_match(struct perf_event *event)
6619{
6620 return event->attr.namespaces;
6621}
6622
6623static void perf_event_namespaces_output(struct perf_event *event,
6624 void *data)
6625{
6626 struct perf_namespaces_event *namespaces_event = data;
6627 struct perf_output_handle handle;
6628 struct perf_sample_data sample;
6629 int ret;
6630
6631 if (!perf_event_namespaces_match(event))
6632 return;
6633
6634 perf_event_header__init_id(&namespaces_event->event_id.header,
6635 &sample, event);
6636 ret = perf_output_begin(&handle, event,
6637 namespaces_event->event_id.header.size);
6638 if (ret)
6639 return;
6640
6641 namespaces_event->event_id.pid = perf_event_pid(event,
6642 namespaces_event->task);
6643 namespaces_event->event_id.tid = perf_event_tid(event,
6644 namespaces_event->task);
6645
6646 perf_output_put(&handle, namespaces_event->event_id);
6647
6648 perf_event__output_id_sample(event, &handle, &sample);
6649
6650 perf_output_end(&handle);
6651}
6652
6653static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
6654 struct task_struct *task,
6655 const struct proc_ns_operations *ns_ops)
6656{
6657 struct path ns_path;
6658 struct inode *ns_inode;
6659 void *error;
6660
6661 error = ns_get_path(&ns_path, task, ns_ops);
6662 if (!error) {
6663 ns_inode = ns_path.dentry->d_inode;
6664 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
6665 ns_link_info->ino = ns_inode->i_ino;
6666 }
6667}
6668
6669void perf_event_namespaces(struct task_struct *task)
6670{
6671 struct perf_namespaces_event namespaces_event;
6672 struct perf_ns_link_info *ns_link_info;
6673
6674 if (!atomic_read(&nr_namespaces_events))
6675 return;
6676
6677 namespaces_event = (struct perf_namespaces_event){
6678 .task = task,
6679 .event_id = {
6680 .header = {
6681 .type = PERF_RECORD_NAMESPACES,
6682 .misc = 0,
6683 .size = sizeof(namespaces_event.event_id),
6684 },
6685 /* .pid */
6686 /* .tid */
6687 .nr_namespaces = NR_NAMESPACES,
6688 /* .link_info[NR_NAMESPACES] */
6689 },
6690 };
6691
6692 ns_link_info = namespaces_event.event_id.link_info;
6693
6694 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
6695 task, &mntns_operations);
6696
6697#ifdef CONFIG_USER_NS
6698 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
6699 task, &userns_operations);
6700#endif
6701#ifdef CONFIG_NET_NS
6702 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
6703 task, &netns_operations);
6704#endif
6705#ifdef CONFIG_UTS_NS
6706 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
6707 task, &utsns_operations);
6708#endif
6709#ifdef CONFIG_IPC_NS
6710 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
6711 task, &ipcns_operations);
6712#endif
6713#ifdef CONFIG_PID_NS
6714 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
6715 task, &pidns_operations);
6716#endif
6717#ifdef CONFIG_CGROUPS
6718 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
6719 task, &cgroupns_operations);
6720#endif
6721
6722 perf_iterate_sb(perf_event_namespaces_output,
6723 &namespaces_event,
6724 NULL);
6725}
6726
6727/*
6596 * mmap tracking 6728 * mmap tracking
6597 */ 6729 */
6598 6730
@@ -9146,6 +9278,8 @@ static void account_event(struct perf_event *event)
9146 atomic_inc(&nr_mmap_events); 9278 atomic_inc(&nr_mmap_events);
9147 if (event->attr.comm) 9279 if (event->attr.comm)
9148 atomic_inc(&nr_comm_events); 9280 atomic_inc(&nr_comm_events);
9281 if (event->attr.namespaces)
9282 atomic_inc(&nr_namespaces_events);
9149 if (event->attr.task) 9283 if (event->attr.task)
9150 atomic_inc(&nr_task_events); 9284 atomic_inc(&nr_task_events);
9151 if (event->attr.freq) 9285 if (event->attr.freq)
@@ -9691,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open,
9691 return -EACCES; 9825 return -EACCES;
9692 } 9826 }
9693 9827
9828 if (attr.namespaces) {
9829 if (!capable(CAP_SYS_ADMIN))
9830 return -EACCES;
9831 }
9832
9694 if (attr.freq) { 9833 if (attr.freq) {
9695 if (attr.sample_freq > sysctl_perf_event_sample_rate) 9834 if (attr.sample_freq > sysctl_perf_event_sample_rate)
9696 return -EINVAL; 9835 return -EINVAL;
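The core.c hunks above add namespace tracking to perf: nr_namespaces_events accounting, PERF_RECORD_NAMESPACES emission from the fork/unshare paths, and a CAP_SYS_ADMIN check in perf_event_open(). As a rough illustration of how a consumer requests these records — a minimal userspace sketch, assuming uapi headers that already carry the new attr.namespaces bit; the helper name is made up:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Open a dummy software event that only delivers side-band records,
 * including PERF_RECORD_NAMESPACES. Needs CAP_SYS_ADMIN per the check
 * added to perf_event_open() above.
 */
static int open_namespaces_event(pid_t pid)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_DUMMY;
        attr.namespaces = 1;            /* request PERF_RECORD_NAMESPACES */
        attr.sample_id_all = 1;

        return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}

The records themselves are then read from the event's mmap'ed ring buffer like any other side-band record.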
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 257fa460b846..2831480c63a2 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
297 rb->paused = 1; 297 rb->paused = 1;
298} 298}
299 299
300void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
301{
302 /*
303 * OVERWRITE is determined by perf_aux_output_end() and can't
304 * be passed in directly.
305 */
306 if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
307 return;
308
309 handle->aux_flags |= flags;
310}
311EXPORT_SYMBOL_GPL(perf_aux_output_flag);
312
300/* 313/*
301 * This is called before hardware starts writing to the AUX area to 314 * This is called before hardware starts writing to the AUX area to
302 * obtain an output handle and make sure there's room in the buffer. 315 * obtain an output handle and make sure there's room in the buffer.
@@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
360 handle->event = event; 373 handle->event = event;
361 handle->head = aux_head; 374 handle->head = aux_head;
362 handle->size = 0; 375 handle->size = 0;
376 handle->aux_flags = 0;
363 377
364 /* 378 /*
365 * In overwrite mode, AUX data stores do not depend on aux_tail, 379 * In overwrite mode, AUX data stores do not depend on aux_tail,
@@ -408,34 +422,32 @@ err:
408 * of the AUX buffer management code is that after pmu::stop(), the AUX 422 * of the AUX buffer management code is that after pmu::stop(), the AUX
409 * transaction must be stopped and therefore drop the AUX reference count. 423 * transaction must be stopped and therefore drop the AUX reference count.
410 */ 424 */
411void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, 425void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
412 bool truncated)
413{ 426{
427 bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
414 struct ring_buffer *rb = handle->rb; 428 struct ring_buffer *rb = handle->rb;
415 bool wakeup = truncated;
416 unsigned long aux_head; 429 unsigned long aux_head;
417 u64 flags = 0;
418
419 if (truncated)
420 flags |= PERF_AUX_FLAG_TRUNCATED;
421 430
422 /* in overwrite mode, driver provides aux_head via handle */ 431 /* in overwrite mode, driver provides aux_head via handle */
423 if (rb->aux_overwrite) { 432 if (rb->aux_overwrite) {
424 flags |= PERF_AUX_FLAG_OVERWRITE; 433 handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
425 434
426 aux_head = handle->head; 435 aux_head = handle->head;
427 local_set(&rb->aux_head, aux_head); 436 local_set(&rb->aux_head, aux_head);
428 } else { 437 } else {
438 handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
439
429 aux_head = local_read(&rb->aux_head); 440 aux_head = local_read(&rb->aux_head);
430 local_add(size, &rb->aux_head); 441 local_add(size, &rb->aux_head);
431 } 442 }
432 443
433 if (size || flags) { 444 if (size || handle->aux_flags) {
434 /* 445 /*
435 * Only send RECORD_AUX if we have something useful to communicate 446 * Only send RECORD_AUX if we have something useful to communicate
436 */ 447 */
437 448
438 perf_event_aux_event(handle->event, aux_head, size, flags); 449 perf_event_aux_event(handle->event, aux_head, size,
450 handle->aux_flags);
439 } 451 }
440 452
441 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); 453 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
@@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
446 } 458 }
447 459
448 if (wakeup) { 460 if (wakeup) {
449 if (truncated) 461 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
450 handle->event->pending_disable = 1; 462 handle->event->pending_disable = 1;
451 perf_output_wakeup(handle); 463 perf_output_wakeup(handle);
452 } 464 }
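The ring_buffer.c hunks move AUX flags onto the output handle: drivers accumulate them with the new perf_aux_output_flag(), and perf_aux_output_end() loses its 'truncated' argument. A hedged sketch of a PMU driver on the reworked interface; my_pmu_read_hw() and the truncation condition are hypothetical:

static void my_pmu_update_aux(struct perf_event *event)
{
        struct perf_output_handle handle;
        unsigned long size;
        void *base;

        base = perf_aux_output_begin(&handle, event);
        if (!base)
                return;

        /* account the data the hardware produced (hypothetical helper) */
        size = my_pmu_read_hw(base, handle.size);

        /* hardware filled the whole buffer: flag the record as truncated */
        if (size == handle.size)
                perf_aux_output_flag(&handle, PERF_AUX_FLAG_TRUNCATED);

        perf_aux_output_end(&handle, size);
}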
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c463c80e93d..aa1076c5e4a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,7 @@
87#include <linux/compiler.h> 87#include <linux/compiler.h>
88#include <linux/sysctl.h> 88#include <linux/sysctl.h>
89#include <linux/kcov.h> 89#include <linux/kcov.h>
90#include <linux/livepatch.h>
90 91
91#include <asm/pgtable.h> 92#include <asm/pgtable.h>
92#include <asm/pgalloc.h> 93#include <asm/pgalloc.h>
@@ -178,6 +179,24 @@ void __weak arch_release_thread_stack(unsigned long *stack)
178 */ 179 */
179#define NR_CACHED_STACKS 2 180#define NR_CACHED_STACKS 2
180static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); 181static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
182
183static int free_vm_stack_cache(unsigned int cpu)
184{
185 struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
186 int i;
187
188 for (i = 0; i < NR_CACHED_STACKS; i++) {
189 struct vm_struct *vm_stack = cached_vm_stacks[i];
190
191 if (!vm_stack)
192 continue;
193
194 vfree(vm_stack->addr);
195 cached_vm_stacks[i] = NULL;
196 }
197
198 return 0;
199}
181#endif 200#endif
182 201
183static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) 202static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
@@ -202,7 +221,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
202 221
203 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, 222 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
204 VMALLOC_START, VMALLOC_END, 223 VMALLOC_START, VMALLOC_END,
205 THREADINFO_GFP | __GFP_HIGHMEM, 224 THREADINFO_GFP,
206 PAGE_KERNEL, 225 PAGE_KERNEL,
207 0, node, __builtin_return_address(0)); 226 0, node, __builtin_return_address(0));
208 227
@@ -466,6 +485,11 @@ void __init fork_init(void)
466 for (i = 0; i < UCOUNT_COUNTS; i++) { 485 for (i = 0; i < UCOUNT_COUNTS; i++) {
467 init_user_ns.ucount_max[i] = max_threads/2; 486 init_user_ns.ucount_max[i] = max_threads/2;
468 } 487 }
488
489#ifdef CONFIG_VMAP_STACK
490 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
491 NULL, free_vm_stack_cache);
492#endif
469} 493}
470 494
471int __weak arch_dup_task_struct(struct task_struct *dst, 495int __weak arch_dup_task_struct(struct task_struct *dst,
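fork_init() now registers a CPU hotplug teardown callback so the per-CPU cache of vmalloc'ed thread stacks is drained when a CPU goes offline. The same pattern in isolation — a sketch with hypothetical names, not part of the patch:

#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int my_drain_cpu_cache(unsigned int cpu)
{
        /* free whatever was cached for @cpu; teardown must not fail */
        return 0;
}

static int __init my_cache_init(void)
{
        int ret;

        /* dynamic prepare-stage state: no startup callback, teardown only */
        ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "mydrv:cache",
                                NULL, my_drain_cpu_cache);
        return ret < 0 ? ret : 0;
}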
@@ -536,7 +560,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
536 set_task_stack_end_magic(tsk); 560 set_task_stack_end_magic(tsk);
537 561
538#ifdef CONFIG_CC_STACKPROTECTOR 562#ifdef CONFIG_CC_STACKPROTECTOR
539 tsk->stack_canary = get_random_int(); 563 tsk->stack_canary = get_random_long();
540#endif 564#endif
541 565
542 /* 566 /*
@@ -1313,7 +1337,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
1313 if (atomic_dec_and_test(&sighand->count)) { 1337 if (atomic_dec_and_test(&sighand->count)) {
1314 signalfd_cleanup(sighand); 1338 signalfd_cleanup(sighand);
1315 /* 1339 /*
1316 * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it 1340 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
1317 * without an RCU grace period, see __lock_task_sighand(). 1341 * without an RCU grace period, see __lock_task_sighand().
1318 */ 1342 */
1319 kmem_cache_free(sighand_cachep, sighand); 1343 kmem_cache_free(sighand_cachep, sighand);
@@ -1438,6 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p)
1438#ifdef CONFIG_RT_MUTEXES 1462#ifdef CONFIG_RT_MUTEXES
1439 p->pi_waiters = RB_ROOT; 1463 p->pi_waiters = RB_ROOT;
1440 p->pi_waiters_leftmost = NULL; 1464 p->pi_waiters_leftmost = NULL;
1465 p->pi_top_task = NULL;
1441 p->pi_blocked_on = NULL; 1466 p->pi_blocked_on = NULL;
1442#endif 1467#endif
1443} 1468}
@@ -1679,9 +1704,12 @@ static __latent_entropy struct task_struct *copy_process(
1679 goto bad_fork_cleanup_perf; 1704 goto bad_fork_cleanup_perf;
1680 /* copy all the process information */ 1705 /* copy all the process information */
1681 shm_init_task(p); 1706 shm_init_task(p);
1682 retval = copy_semundo(clone_flags, p); 1707 retval = security_task_alloc(p, clone_flags);
1683 if (retval) 1708 if (retval)
1684 goto bad_fork_cleanup_audit; 1709 goto bad_fork_cleanup_audit;
1710 retval = copy_semundo(clone_flags, p);
1711 if (retval)
1712 goto bad_fork_cleanup_security;
1685 retval = copy_files(clone_flags, p); 1713 retval = copy_files(clone_flags, p);
1686 if (retval) 1714 if (retval)
1687 goto bad_fork_cleanup_semundo; 1715 goto bad_fork_cleanup_semundo;
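copy_process() now calls the revived security_task_alloc() hook before copy_semundo() and unwinds it through the new bad_fork_cleanup_security label. A hedged sketch of an LSM supplying that hook; the "example" names are hypothetical and registration via security_add_hooks() is left out:

#include <linux/lsm_hooks.h>

static int example_task_alloc(struct task_struct *task,
                              unsigned long clone_flags)
{
        /* allocate and attach per-task security state; 0 means success */
        return 0;
}

static void example_task_free(struct task_struct *task)
{
        /* release whatever example_task_alloc() attached */
}

static struct security_hook_list example_hooks[] = {
        LSM_HOOK_INIT(task_alloc, example_task_alloc),
        LSM_HOOK_INIT(task_free, example_task_free),
};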
@@ -1797,6 +1825,8 @@ static __latent_entropy struct task_struct *copy_process(
1797 p->parent_exec_id = current->self_exec_id; 1825 p->parent_exec_id = current->self_exec_id;
1798 } 1826 }
1799 1827
1828 klp_copy_process(p);
1829
1800 spin_lock(&current->sighand->siglock); 1830 spin_lock(&current->sighand->siglock);
1801 1831
1802 /* 1832 /*
@@ -1815,11 +1845,13 @@ static __latent_entropy struct task_struct *copy_process(
1815 */ 1845 */
1816 recalc_sigpending(); 1846 recalc_sigpending();
1817 if (signal_pending(current)) { 1847 if (signal_pending(current)) {
1818 spin_unlock(&current->sighand->siglock);
1819 write_unlock_irq(&tasklist_lock);
1820 retval = -ERESTARTNOINTR; 1848 retval = -ERESTARTNOINTR;
1821 goto bad_fork_cancel_cgroup; 1849 goto bad_fork_cancel_cgroup;
1822 } 1850 }
1851 if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
1852 retval = -ENOMEM;
1853 goto bad_fork_cancel_cgroup;
1854 }
1823 1855
1824 if (likely(p->pid)) { 1856 if (likely(p->pid)) {
1825 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1857 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1877,6 +1909,8 @@ static __latent_entropy struct task_struct *copy_process(
1877 return p; 1909 return p;
1878 1910
1879bad_fork_cancel_cgroup: 1911bad_fork_cancel_cgroup:
1912 spin_unlock(&current->sighand->siglock);
1913 write_unlock_irq(&tasklist_lock);
1880 cgroup_cancel_fork(p); 1914 cgroup_cancel_fork(p);
1881bad_fork_free_pid: 1915bad_fork_free_pid:
1882 cgroup_threadgroup_change_end(current); 1916 cgroup_threadgroup_change_end(current);
@@ -1903,6 +1937,8 @@ bad_fork_cleanup_files:
1903 exit_files(p); /* blocking */ 1937 exit_files(p); /* blocking */
1904bad_fork_cleanup_semundo: 1938bad_fork_cleanup_semundo:
1905 exit_sem(p); 1939 exit_sem(p);
1940bad_fork_cleanup_security:
1941 security_task_free(p);
1906bad_fork_cleanup_audit: 1942bad_fork_cleanup_audit:
1907 audit_free(p); 1943 audit_free(p);
1908bad_fork_cleanup_perf: 1944bad_fork_cleanup_perf:
@@ -2144,7 +2180,7 @@ void __init proc_caches_init(void)
2144{ 2180{
2145 sighand_cachep = kmem_cache_create("sighand_cache", 2181 sighand_cachep = kmem_cache_create("sighand_cache",
2146 sizeof(struct sighand_struct), 0, 2182 sizeof(struct sighand_struct), 0,
2147 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| 2183 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
2148 SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); 2184 SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
2149 signal_cachep = kmem_cache_create("signal_cache", 2185 signal_cachep = kmem_cache_create("signal_cache",
2150 sizeof(struct signal_struct), 0, 2186 sizeof(struct signal_struct), 0,
@@ -2352,6 +2388,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
2352 } 2388 }
2353 } 2389 }
2354 2390
2391 perf_event_namespaces(current);
2392
2355bad_unshare_cleanup_cred: 2393bad_unshare_cleanup_cred:
2356 if (new_cred) 2394 if (new_cred)
2357 put_cred(new_cred); 2395 put_cred(new_cred);
diff --git a/kernel/futex.c b/kernel/futex.c
index 45858ec73941..357348a6cf6b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void)
802 return 0; 802 return 0;
803} 803}
804 804
805static struct futex_pi_state * alloc_pi_state(void) 805static struct futex_pi_state *alloc_pi_state(void)
806{ 806{
807 struct futex_pi_state *pi_state = current->pi_state_cache; 807 struct futex_pi_state *pi_state = current->pi_state_cache;
808 808
@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void)
812 return pi_state; 812 return pi_state;
813} 813}
814 814
815static void get_pi_state(struct futex_pi_state *pi_state)
816{
817 WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
818}
819
815/* 820/*
816 * Drops a reference to the pi_state object and frees or caches it 821 * Drops a reference to the pi_state object and frees or caches it
817 * when the last reference is gone. 822 * when the last reference is gone.
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
856 * Look up the task based on what TID userspace gave us. 861 * Look up the task based on what TID userspace gave us.
857 * We dont trust it. 862 * We dont trust it.
858 */ 863 */
859static struct task_struct * futex_find_get_task(pid_t pid) 864static struct task_struct *futex_find_get_task(pid_t pid)
860{ 865{
861 struct task_struct *p; 866 struct task_struct *p;
862 867
@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
916 pi_state->owner = NULL; 921 pi_state->owner = NULL;
917 raw_spin_unlock_irq(&curr->pi_lock); 922 raw_spin_unlock_irq(&curr->pi_lock);
918 923
919 rt_mutex_unlock(&pi_state->pi_mutex); 924 get_pi_state(pi_state);
920
921 spin_unlock(&hb->lock); 925 spin_unlock(&hb->lock);
922 926
927 rt_mutex_futex_unlock(&pi_state->pi_mutex);
928 put_pi_state(pi_state);
929
923 raw_spin_lock_irq(&curr->pi_lock); 930 raw_spin_lock_irq(&curr->pi_lock);
924 } 931 }
925 raw_spin_unlock_irq(&curr->pi_lock); 932 raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
973 * 980 *
974 * [10] There is no transient state which leaves owner and user space 981 * [10] There is no transient state which leaves owner and user space
975 * TID out of sync. 982 * TID out of sync.
983 *
984 *
985 * Serialization and lifetime rules:
986 *
987 * hb->lock:
988 *
989 * hb -> futex_q, relation
990 * futex_q -> pi_state, relation
991 *
992 * (cannot be raw because hb can contain arbitrary amount
993 * of futex_q's)
994 *
995 * pi_mutex->wait_lock:
996 *
997 * {uval, pi_state}
998 *
999 * (and pi_mutex 'obviously')
1000 *
1001 * p->pi_lock:
1002 *
1003 * p->pi_state_list -> pi_state->list, relation
1004 *
1005 * pi_state->refcount:
1006 *
1007 * pi_state lifetime
1008 *
1009 *
1010 * Lock order:
1011 *
1012 * hb->lock
1013 * pi_mutex->wait_lock
1014 * p->pi_lock
1015 *
976 */ 1016 */
977 1017
978/* 1018/*
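The comment block added above documents what each lock protects and the hb->lock -> pi_mutex->wait_lock -> p->pi_lock ordering that the rest of the series enforces. Purely as an illustration of that ordering (a sketch, not code from the patch):

static void pi_lock_order_sketch(struct futex_hash_bucket *hb,
                                 struct futex_pi_state *pi_state,
                                 struct task_struct *p)
{
        spin_lock(&hb->lock);                             /* hb -> futex_q */
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* {uval, pi_state} */
        raw_spin_lock(&p->pi_lock);                       /* p->pi_state_list */

        /* ... manipulate pi_state / rt_mutex state here ... */

        raw_spin_unlock(&p->pi_lock);
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(&hb->lock);
}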
@@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr)
980 * the pi_state against the user space value. If correct, attach to 1020 * the pi_state against the user space value. If correct, attach to
981 * it. 1021 * it.
982 */ 1022 */
983static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, 1023static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
1024 struct futex_pi_state *pi_state,
984 struct futex_pi_state **ps) 1025 struct futex_pi_state **ps)
985{ 1026{
986 pid_t pid = uval & FUTEX_TID_MASK; 1027 pid_t pid = uval & FUTEX_TID_MASK;
1028 u32 uval2;
1029 int ret;
987 1030
988 /* 1031 /*
989 * Userspace might have messed up non-PI and PI futexes [3] 1032 * Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
991 if (unlikely(!pi_state)) 1034 if (unlikely(!pi_state))
992 return -EINVAL; 1035 return -EINVAL;
993 1036
1037 /*
1038 * We get here with hb->lock held, and having found a
1039 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
1040 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
1041 * which in turn means that futex_lock_pi() still has a reference on
1042 * our pi_state.
1043 *
1044 * The waiter holding a reference on @pi_state also protects against
1045 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
1046 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
1047 * free pi_state before we can take a reference ourselves.
1048 */
994 WARN_ON(!atomic_read(&pi_state->refcount)); 1049 WARN_ON(!atomic_read(&pi_state->refcount));
995 1050
996 /* 1051 /*
1052 * Now that we have a pi_state, we can acquire wait_lock
1053 * and do the state validation.
1054 */
1055 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1056
1057 /*
1058 * Since {uval, pi_state} is serialized by wait_lock, and our current
1059 * uval was read without holding it, it can have changed. Verify it
1060 * still is what we expect it to be, otherwise retry the entire
1061 * operation.
1062 */
1063 if (get_futex_value_locked(&uval2, uaddr))
1064 goto out_efault;
1065
1066 if (uval != uval2)
1067 goto out_eagain;
1068
1069 /*
997 * Handle the owner died case: 1070 * Handle the owner died case:
998 */ 1071 */
999 if (uval & FUTEX_OWNER_DIED) { 1072 if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
1008 * is not 0. Inconsistent state. [5] 1081 * is not 0. Inconsistent state. [5]
1009 */ 1082 */
1010 if (pid) 1083 if (pid)
1011 return -EINVAL; 1084 goto out_einval;
1012 /* 1085 /*
1013 * Take a ref on the state and return success. [4] 1086 * Take a ref on the state and return success. [4]
1014 */ 1087 */
1015 goto out_state; 1088 goto out_attach;
1016 } 1089 }
1017 1090
1018 /* 1091 /*
@@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
1024 * Take a ref on the state and return success. [6] 1097 * Take a ref on the state and return success. [6]
1025 */ 1098 */
1026 if (!pid) 1099 if (!pid)
1027 goto out_state; 1100 goto out_attach;
1028 } else { 1101 } else {
1029 /* 1102 /*
1030 * If the owner died bit is not set, then the pi_state 1103 * If the owner died bit is not set, then the pi_state
1031 * must have an owner. [7] 1104 * must have an owner. [7]
1032 */ 1105 */
1033 if (!pi_state->owner) 1106 if (!pi_state->owner)
1034 return -EINVAL; 1107 goto out_einval;
1035 } 1108 }
1036 1109
1037 /* 1110 /*
@@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
1040 * user space TID. [9/10] 1113 * user space TID. [9/10]
1041 */ 1114 */
1042 if (pid != task_pid_vnr(pi_state->owner)) 1115 if (pid != task_pid_vnr(pi_state->owner))
1043 return -EINVAL; 1116 goto out_einval;
1044out_state: 1117
1045 atomic_inc(&pi_state->refcount); 1118out_attach:
1119 get_pi_state(pi_state);
1120 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1046 *ps = pi_state; 1121 *ps = pi_state;
1047 return 0; 1122 return 0;
1123
1124out_einval:
1125 ret = -EINVAL;
1126 goto out_error;
1127
1128out_eagain:
1129 ret = -EAGAIN;
1130 goto out_error;
1131
1132out_efault:
1133 ret = -EFAULT;
1134 goto out_error;
1135
1136out_error:
1137 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1138 return ret;
1048} 1139}
1049 1140
1050/* 1141/*
@@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
1095 1186
1096 /* 1187 /*
1097 * No existing pi state. First waiter. [2] 1188 * No existing pi state. First waiter. [2]
1189 *
1190 * This creates pi_state, we have hb->lock held, this means nothing can
1191 * observe this state, wait_lock is irrelevant.
1098 */ 1192 */
1099 pi_state = alloc_pi_state(); 1193 pi_state = alloc_pi_state();
1100 1194
@@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
1119 return 0; 1213 return 0;
1120} 1214}
1121 1215
1122static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 1216static int lookup_pi_state(u32 __user *uaddr, u32 uval,
1217 struct futex_hash_bucket *hb,
1123 union futex_key *key, struct futex_pi_state **ps) 1218 union futex_key *key, struct futex_pi_state **ps)
1124{ 1219{
1125 struct futex_q *match = futex_top_waiter(hb, key); 1220 struct futex_q *top_waiter = futex_top_waiter(hb, key);
1126 1221
1127 /* 1222 /*
1128 * If there is a waiter on that futex, validate it and 1223 * If there is a waiter on that futex, validate it and
1129 * attach to the pi_state when the validation succeeds. 1224 * attach to the pi_state when the validation succeeds.
1130 */ 1225 */
1131 if (match) 1226 if (top_waiter)
1132 return attach_to_pi_state(uval, match->pi_state, ps); 1227 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1133 1228
1134 /* 1229 /*
1135 * We are the first waiter - try to look up the owner based on 1230 * We are the first waiter - try to look up the owner based on
@@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1148 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1243 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
1149 return -EFAULT; 1244 return -EFAULT;
1150 1245
1151 /*If user space value changed, let the caller retry */ 1246 /* If user space value changed, let the caller retry */
1152 return curval != uval ? -EAGAIN : 0; 1247 return curval != uval ? -EAGAIN : 0;
1153} 1248}
1154 1249
@@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1176 struct task_struct *task, int set_waiters) 1271 struct task_struct *task, int set_waiters)
1177{ 1272{
1178 u32 uval, newval, vpid = task_pid_vnr(task); 1273 u32 uval, newval, vpid = task_pid_vnr(task);
1179 struct futex_q *match; 1274 struct futex_q *top_waiter;
1180 int ret; 1275 int ret;
1181 1276
1182 /* 1277 /*
@@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1202 * Lookup existing state first. If it exists, try to attach to 1297 * Lookup existing state first. If it exists, try to attach to
1203 * its pi_state. 1298 * its pi_state.
1204 */ 1299 */
1205 match = futex_top_waiter(hb, key); 1300 top_waiter = futex_top_waiter(hb, key);
1206 if (match) 1301 if (top_waiter)
1207 return attach_to_pi_state(uval, match->pi_state, ps); 1302 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1208 1303
1209 /* 1304 /*
1210 * No waiter and user TID is 0. We are here because the 1305 * No waiter and user TID is 0. We are here because the
@@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1285 wake_q_add(wake_q, p); 1380 wake_q_add(wake_q, p);
1286 __unqueue_futex(q); 1381 __unqueue_futex(q);
1287 /* 1382 /*
1288 * The waiting task can free the futex_q as soon as 1383 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
1289 * q->lock_ptr = NULL is written, without taking any locks. A 1384 * is written, without taking any locks. This is possible in the event
1290 * memory barrier is required here to prevent the following 1385 * of a spurious wakeup, for example. A memory barrier is required here
1291 * store to lock_ptr from getting ahead of the plist_del. 1386 * to prevent the following store to lock_ptr from getting ahead of the
1387 * plist_del in __unqueue_futex().
1292 */ 1388 */
1293 smp_wmb(); 1389 smp_store_release(&q->lock_ptr, NULL);
1294 q->lock_ptr = NULL;
1295} 1390}
1296 1391
1297static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, 1392/*
1298 struct futex_hash_bucket *hb) 1393 * Caller must hold a reference on @pi_state.
1394 */
1395static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
1299{ 1396{
1300 struct task_struct *new_owner;
1301 struct futex_pi_state *pi_state = this->pi_state;
1302 u32 uninitialized_var(curval), newval; 1397 u32 uninitialized_var(curval), newval;
1398 struct task_struct *new_owner;
1399 bool postunlock = false;
1303 DEFINE_WAKE_Q(wake_q); 1400 DEFINE_WAKE_Q(wake_q);
1304 bool deboost;
1305 int ret = 0; 1401 int ret = 0;
1306 1402
1307 if (!pi_state)
1308 return -EINVAL;
1309
1310 /*
1311 * If current does not own the pi_state then the futex is
1312 * inconsistent and user space fiddled with the futex value.
1313 */
1314 if (pi_state->owner != current)
1315 return -EINVAL;
1316
1317 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1318 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 1403 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1404 if (WARN_ON_ONCE(!new_owner)) {
1405 /*
1406 * As per the comment in futex_unlock_pi() this should not happen.
1407 *
1408 * When this happens, give up our locks and try again, giving
1409 * the futex_lock_pi() instance time to complete, either by
1410 * waiting on the rtmutex or removing itself from the futex
1411 * queue.
1412 */
1413 ret = -EAGAIN;
1414 goto out_unlock;
1415 }
1319 1416
1320 /* 1417 /*
1321 * It is possible that the next waiter (the one that brought 1418 * We pass it to the next owner. The WAITERS bit is always kept
1322 * this owner to the kernel) timed out and is no longer 1419 * enabled while there is PI state around. We cleanup the owner
1323 * waiting on the lock. 1420 * died bit, because we are the owner.
1324 */
1325 if (!new_owner)
1326 new_owner = this->task;
1327
1328 /*
1329 * We pass it to the next owner. The WAITERS bit is always
1330 * kept enabled while there is PI state around. We cleanup the
1331 * owner died bit, because we are the owner.
1332 */ 1421 */
1333 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1422 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1334 1423
@@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
1337 1426
1338 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { 1427 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
1339 ret = -EFAULT; 1428 ret = -EFAULT;
1429
1340 } else if (curval != uval) { 1430 } else if (curval != uval) {
1341 /* 1431 /*
1342 * If a unconditional UNLOCK_PI operation (user space did not 1432 * If a unconditional UNLOCK_PI operation (user space did not
@@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
1349 else 1439 else
1350 ret = -EINVAL; 1440 ret = -EINVAL;
1351 } 1441 }
1352 if (ret) { 1442
1353 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1443 if (ret)
1354 return ret; 1444 goto out_unlock;
1355 } 1445
1446 /*
1447 * This is a point of no return; once we modify the uval there is no
1448 * going back and subsequent operations must not fail.
1449 */
1356 1450
1357 raw_spin_lock(&pi_state->owner->pi_lock); 1451 raw_spin_lock(&pi_state->owner->pi_lock);
1358 WARN_ON(list_empty(&pi_state->list)); 1452 WARN_ON(list_empty(&pi_state->list));
@@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
1365 pi_state->owner = new_owner; 1459 pi_state->owner = new_owner;
1366 raw_spin_unlock(&new_owner->pi_lock); 1460 raw_spin_unlock(&new_owner->pi_lock);
1367 1461
1368 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1462 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
1369 1463
1370 deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); 1464out_unlock:
1465 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1371 1466
1372 /* 1467 if (postunlock)
1373 * First unlock HB so the waiter does not spin on it once he got woken 1468 rt_mutex_postunlock(&wake_q);
1374 * up. Second wake up the waiter before the priority is adjusted. If we
1375 * deboost first (and lose our higher priority), then the task might get
1376 * scheduled away before the wake up can take place.
1377 */
1378 spin_unlock(&hb->lock);
1379 wake_up_q(&wake_q);
1380 if (deboost)
1381 rt_mutex_adjust_prio(current);
1382 1469
1383 return 0; 1470 return ret;
1384} 1471}
1385 1472
1386/* 1473/*
@@ -1826,7 +1913,7 @@ retry_private:
1826 * If that call succeeds then we have pi_state and an 1913 * If that call succeeds then we have pi_state and an
1827 * initial refcount on it. 1914 * initial refcount on it.
1828 */ 1915 */
1829 ret = lookup_pi_state(ret, hb2, &key2, &pi_state); 1916 ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
1830 } 1917 }
1831 1918
1832 switch (ret) { 1919 switch (ret) {
@@ -1909,7 +1996,7 @@ retry_private:
1909 * refcount on the pi_state and store the pointer in 1996 * refcount on the pi_state and store the pointer in
1910 * the futex_q object of the waiter. 1997 * the futex_q object of the waiter.
1911 */ 1998 */
1912 atomic_inc(&pi_state->refcount); 1999 get_pi_state(pi_state);
1913 this->pi_state = pi_state; 2000 this->pi_state = pi_state;
1914 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 2001 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1915 this->rt_waiter, 2002 this->rt_waiter,
@@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb)
2009 hb_waiters_dec(hb); 2096 hb_waiters_dec(hb);
2010} 2097}
2011 2098
2012/** 2099static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2013 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2014 * @q: The futex_q to enqueue
2015 * @hb: The destination hash bucket
2016 *
2017 * The hb->lock must be held by the caller, and is released here. A call to
2018 * queue_me() is typically paired with exactly one call to unqueue_me(). The
2019 * exceptions involve the PI related operations, which may use unqueue_me_pi()
2020 * or nothing if the unqueue is done as part of the wake process and the unqueue
2021 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
2022 * an example).
2023 */
2024static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2025 __releases(&hb->lock)
2026{ 2100{
2027 int prio; 2101 int prio;
2028 2102
@@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2039 plist_node_init(&q->list, prio); 2113 plist_node_init(&q->list, prio);
2040 plist_add(&q->list, &hb->chain); 2114 plist_add(&q->list, &hb->chain);
2041 q->task = current; 2115 q->task = current;
2116}
2117
2118/**
2119 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2120 * @q: The futex_q to enqueue
2121 * @hb: The destination hash bucket
2122 *
2123 * The hb->lock must be held by the caller, and is released here. A call to
2124 * queue_me() is typically paired with exactly one call to unqueue_me(). The
2125 * exceptions involve the PI related operations, which may use unqueue_me_pi()
2126 * or nothing if the unqueue is done as part of the wake process and the unqueue
2127 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
2128 * an example).
2129 */
2130static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2131 __releases(&hb->lock)
2132{
2133 __queue_me(q, hb);
2042 spin_unlock(&hb->lock); 2134 spin_unlock(&hb->lock);
2043} 2135}
2044 2136
@@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2125{ 2217{
2126 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 2218 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2127 struct futex_pi_state *pi_state = q->pi_state; 2219 struct futex_pi_state *pi_state = q->pi_state;
2128 struct task_struct *oldowner = pi_state->owner;
2129 u32 uval, uninitialized_var(curval), newval; 2220 u32 uval, uninitialized_var(curval), newval;
2221 struct task_struct *oldowner;
2130 int ret; 2222 int ret;
2131 2223
2224 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2225
2226 oldowner = pi_state->owner;
2132 /* Owner died? */ 2227 /* Owner died? */
2133 if (!pi_state->owner) 2228 if (!pi_state->owner)
2134 newtid |= FUTEX_OWNER_DIED; 2229 newtid |= FUTEX_OWNER_DIED;
@@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2136 /* 2231 /*
2137 * We are here either because we stole the rtmutex from the 2232 * We are here either because we stole the rtmutex from the
2138 * previous highest priority waiter or we are the highest priority 2233 * previous highest priority waiter or we are the highest priority
2139 * waiter but failed to get the rtmutex the first time. 2234 * waiter but have failed to get the rtmutex the first time.
2235 *
2140 * We have to replace the newowner TID in the user space variable. 2236 * We have to replace the newowner TID in the user space variable.
2141 * This must be atomic as we have to preserve the owner died bit here. 2237 * This must be atomic as we have to preserve the owner died bit here.
2142 * 2238 *
@@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2144 * because we can fault here. Imagine swapped out pages or a fork 2240 * because we can fault here. Imagine swapped out pages or a fork
2145 * that marked all the anonymous memory readonly for cow. 2241 * that marked all the anonymous memory readonly for cow.
2146 * 2242 *
2147 * Modifying pi_state _before_ the user space value would 2243 * Modifying pi_state _before_ the user space value would leave the
2148 * leave the pi_state in an inconsistent state when we fault 2244 * pi_state in an inconsistent state when we fault here, because we
2149 * here, because we need to drop the hash bucket lock to 2245 * need to drop the locks to handle the fault. This might be observed
2150 * handle the fault. This might be observed in the PID check 2246 * in the PID check in lookup_pi_state.
2151 * in lookup_pi_state.
2152 */ 2247 */
2153retry: 2248retry:
2154 if (get_futex_value_locked(&uval, uaddr)) 2249 if (get_futex_value_locked(&uval, uaddr))
2155 goto handle_fault; 2250 goto handle_fault;
2156 2251
2157 while (1) { 2252 for (;;) {
2158 newval = (uval & FUTEX_OWNER_DIED) | newtid; 2253 newval = (uval & FUTEX_OWNER_DIED) | newtid;
2159 2254
2160 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 2255 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2169,47 +2264,60 @@ retry:
2169 * itself. 2264 * itself.
2170 */ 2265 */
2171 if (pi_state->owner != NULL) { 2266 if (pi_state->owner != NULL) {
2172 raw_spin_lock_irq(&pi_state->owner->pi_lock); 2267 raw_spin_lock(&pi_state->owner->pi_lock);
2173 WARN_ON(list_empty(&pi_state->list)); 2268 WARN_ON(list_empty(&pi_state->list));
2174 list_del_init(&pi_state->list); 2269 list_del_init(&pi_state->list);
2175 raw_spin_unlock_irq(&pi_state->owner->pi_lock); 2270 raw_spin_unlock(&pi_state->owner->pi_lock);
2176 } 2271 }
2177 2272
2178 pi_state->owner = newowner; 2273 pi_state->owner = newowner;
2179 2274
2180 raw_spin_lock_irq(&newowner->pi_lock); 2275 raw_spin_lock(&newowner->pi_lock);
2181 WARN_ON(!list_empty(&pi_state->list)); 2276 WARN_ON(!list_empty(&pi_state->list));
2182 list_add(&pi_state->list, &newowner->pi_state_list); 2277 list_add(&pi_state->list, &newowner->pi_state_list);
2183 raw_spin_unlock_irq(&newowner->pi_lock); 2278 raw_spin_unlock(&newowner->pi_lock);
2279 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2280
2184 return 0; 2281 return 0;
2185 2282
2186 /* 2283 /*
2187 * To handle the page fault we need to drop the hash bucket 2284 * To handle the page fault we need to drop the locks here. That gives
2188 * lock here. That gives the other task (either the highest priority 2285 * the other task (either the highest priority waiter itself or the
2189 * waiter itself or the task which stole the rtmutex) the 2286 * task which stole the rtmutex) the chance to try the fixup of the
2190 * chance to try the fixup of the pi_state. So once we are 2287 * pi_state. So once we are back from handling the fault we need to
2191 * back from handling the fault we need to check the pi_state 2288 * check the pi_state after reacquiring the locks and before trying to
2192 * after reacquiring the hash bucket lock and before trying to 2289 * do another fixup. When the fixup has been done already we simply
2193 * do another fixup. When the fixup has been done already we 2290 * return.
2194 * simply return. 2291 *
2292 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2293 * drop hb->lock since the caller owns the hb -> futex_q relation.
2294 * Dropping the pi_mutex->wait_lock requires the state revalidate.
2195 */ 2295 */
2196handle_fault: 2296handle_fault:
2297 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2197 spin_unlock(q->lock_ptr); 2298 spin_unlock(q->lock_ptr);
2198 2299
2199 ret = fault_in_user_writeable(uaddr); 2300 ret = fault_in_user_writeable(uaddr);
2200 2301
2201 spin_lock(q->lock_ptr); 2302 spin_lock(q->lock_ptr);
2303 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2202 2304
2203 /* 2305 /*
2204 * Check if someone else fixed it for us: 2306 * Check if someone else fixed it for us:
2205 */ 2307 */
2206 if (pi_state->owner != oldowner) 2308 if (pi_state->owner != oldowner) {
2207 return 0; 2309 ret = 0;
2310 goto out_unlock;
2311 }
2208 2312
2209 if (ret) 2313 if (ret)
2210 return ret; 2314 goto out_unlock;
2211 2315
2212 goto retry; 2316 goto retry;
2317
2318out_unlock:
2319 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2320 return ret;
2213} 2321}
2214 2322
2215static long futex_wait_restart(struct restart_block *restart); 2323static long futex_wait_restart(struct restart_block *restart);
@@ -2231,13 +2339,16 @@ static long futex_wait_restart(struct restart_block *restart);
2231 */ 2339 */
2232static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 2340static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2233{ 2341{
2234 struct task_struct *owner;
2235 int ret = 0; 2342 int ret = 0;
2236 2343
2237 if (locked) { 2344 if (locked) {
2238 /* 2345 /*
2239 * Got the lock. We might not be the anticipated owner if we 2346 * Got the lock. We might not be the anticipated owner if we
2240 * did a lock-steal - fix up the PI-state in that case: 2347 * did a lock-steal - fix up the PI-state in that case:
2348 *
2349 * We can safely read pi_state->owner without holding wait_lock
2350 * because we now own the rt_mutex, only the owner will attempt
2351 * to change it.
2241 */ 2352 */
2242 if (q->pi_state->owner != current) 2353 if (q->pi_state->owner != current)
2243 ret = fixup_pi_state_owner(uaddr, q, current); 2354 ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2245,43 +2356,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2245 } 2356 }
2246 2357
2247 /* 2358 /*
2248 * Catch the rare case, where the lock was released when we were on the
2249 * way back before we locked the hash bucket.
2250 */
2251 if (q->pi_state->owner == current) {
2252 /*
2253 * Try to get the rt_mutex now. This might fail as some other
2254 * task acquired the rt_mutex after we removed ourself from the
2255 * rt_mutex waiters list.
2256 */
2257 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
2258 locked = 1;
2259 goto out;
2260 }
2261
2262 /*
2263 * pi_state is incorrect, some other task did a lock steal and
2264 * we returned due to timeout or signal without taking the
2265 * rt_mutex. Too late.
2266 */
2267 raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
2268 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
2269 if (!owner)
2270 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
2271 raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
2272 ret = fixup_pi_state_owner(uaddr, q, owner);
2273 goto out;
2274 }
2275
2276 /*
2277 * Paranoia check. If we did not take the lock, then we should not be 2359 * Paranoia check. If we did not take the lock, then we should not be
2278 * the owner of the rt_mutex. 2360 * the owner of the rt_mutex.
2279 */ 2361 */
2280 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 2362 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
2281 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 2363 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
2282 "pi-state %p\n", ret, 2364 "pi-state %p\n", ret,
2283 q->pi_state->pi_mutex.owner, 2365 q->pi_state->pi_mutex.owner,
2284 q->pi_state->owner); 2366 q->pi_state->owner);
2367 }
2285 2368
2286out: 2369out:
2287 return ret ? ret : locked; 2370 return ret ? ret : locked;
@@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2505 ktime_t *time, int trylock) 2588 ktime_t *time, int trylock)
2506{ 2589{
2507 struct hrtimer_sleeper timeout, *to = NULL; 2590 struct hrtimer_sleeper timeout, *to = NULL;
2591 struct futex_pi_state *pi_state = NULL;
2592 struct rt_mutex_waiter rt_waiter;
2508 struct futex_hash_bucket *hb; 2593 struct futex_hash_bucket *hb;
2509 struct futex_q q = futex_q_init; 2594 struct futex_q q = futex_q_init;
2510 int res, ret; 2595 int res, ret;
@@ -2557,25 +2642,68 @@ retry_private:
2557 } 2642 }
2558 } 2643 }
2559 2644
2645 WARN_ON(!q.pi_state);
2646
2560 /* 2647 /*
2561 * Only actually queue now that the atomic ops are done: 2648 * Only actually queue now that the atomic ops are done:
2562 */ 2649 */
2563 queue_me(&q, hb); 2650 __queue_me(&q, hb);
2564 2651
2565 WARN_ON(!q.pi_state); 2652 if (trylock) {
2566 /* 2653 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
2567 * Block on the PI mutex:
2568 */
2569 if (!trylock) {
2570 ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
2571 } else {
2572 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2573 /* Fixup the trylock return value: */ 2654 /* Fixup the trylock return value: */
2574 ret = ret ? 0 : -EWOULDBLOCK; 2655 ret = ret ? 0 : -EWOULDBLOCK;
2656 goto no_block;
2575 } 2657 }
2576 2658
2659 rt_mutex_init_waiter(&rt_waiter);
2660
2661 /*
2662 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
2663 * hold it while doing rt_mutex_start_proxy(), because then it will
 2664	 * include hb->lock in the blocking chain, even though we'll not in
2665 * fact hold it while blocking. This will lead it to report -EDEADLK
2666 * and BUG when futex_unlock_pi() interleaves with this.
2667 *
2668 * Therefore acquire wait_lock while holding hb->lock, but drop the
2669 * latter before calling rt_mutex_start_proxy_lock(). This still fully
2670 * serializes against futex_unlock_pi() as that does the exact same
2671 * lock handoff sequence.
2672 */
2673 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
2674 spin_unlock(q.lock_ptr);
2675 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
2676 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
2677
2678 if (ret) {
2679 if (ret == 1)
2680 ret = 0;
2681
2682 spin_lock(q.lock_ptr);
2683 goto no_block;
2684 }
2685
2686
2687 if (unlikely(to))
2688 hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
2689
2690 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
2691
2577 spin_lock(q.lock_ptr); 2692 spin_lock(q.lock_ptr);
2578 /* 2693 /*
2694 * If we failed to acquire the lock (signal/timeout), we must
2695 * first acquire the hb->lock before removing the lock from the
2696 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
2697 * wait lists consistent.
2698 *
2699 * In particular; it is important that futex_unlock_pi() can not
2700 * observe this inconsistency.
2701 */
2702 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
2703 ret = 0;
2704
2705no_block:
2706 /*
2579 * Fixup the pi_state owner and possibly acquire the lock if we 2707 * Fixup the pi_state owner and possibly acquire the lock if we
2580 * haven't already. 2708 * haven't already.
2581 */ 2709 */
@@ -2591,12 +2719,19 @@ retry_private:
2591 * If fixup_owner() faulted and was unable to handle the fault, unlock 2719 * If fixup_owner() faulted and was unable to handle the fault, unlock
2592 * it and return the fault to userspace. 2720 * it and return the fault to userspace.
2593 */ 2721 */
2594 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 2722 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
2595 rt_mutex_unlock(&q.pi_state->pi_mutex); 2723 pi_state = q.pi_state;
2724 get_pi_state(pi_state);
2725 }
2596 2726
2597 /* Unqueue and drop the lock */ 2727 /* Unqueue and drop the lock */
2598 unqueue_me_pi(&q); 2728 unqueue_me_pi(&q);
2599 2729
2730 if (pi_state) {
2731 rt_mutex_futex_unlock(&pi_state->pi_mutex);
2732 put_pi_state(pi_state);
2733 }
2734
2600 goto out_put_key; 2735 goto out_put_key;
2601 2736
2602out_unlock_put_key: 2737out_unlock_put_key:
@@ -2605,8 +2740,10 @@ out_unlock_put_key:
2605out_put_key: 2740out_put_key:
2606 put_futex_key(&q.key); 2741 put_futex_key(&q.key);
2607out: 2742out:
2608 if (to) 2743 if (to) {
2744 hrtimer_cancel(&to->timer);
2609 destroy_hrtimer_on_stack(&to->timer); 2745 destroy_hrtimer_on_stack(&to->timer);
2746 }
2610 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2747 return ret != -EINTR ? ret : -ERESTARTNOINTR;
2611 2748
2612uaddr_faulted: 2749uaddr_faulted:
@@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2633 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); 2770 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
2634 union futex_key key = FUTEX_KEY_INIT; 2771 union futex_key key = FUTEX_KEY_INIT;
2635 struct futex_hash_bucket *hb; 2772 struct futex_hash_bucket *hb;
2636 struct futex_q *match; 2773 struct futex_q *top_waiter;
2637 int ret; 2774 int ret;
2638 2775
2639retry: 2776retry:
@@ -2657,12 +2794,37 @@ retry:
2657 * all and we at least want to know if user space fiddled 2794 * all and we at least want to know if user space fiddled
2658 * with the futex value instead of blindly unlocking. 2795 * with the futex value instead of blindly unlocking.
2659 */ 2796 */
2660 match = futex_top_waiter(hb, &key); 2797 top_waiter = futex_top_waiter(hb, &key);
2661 if (match) { 2798 if (top_waiter) {
2662 ret = wake_futex_pi(uaddr, uval, match, hb); 2799 struct futex_pi_state *pi_state = top_waiter->pi_state;
2800
2801 ret = -EINVAL;
2802 if (!pi_state)
2803 goto out_unlock;
2804
2663 /* 2805 /*
2664 * In case of success wake_futex_pi dropped the hash 2806 * If current does not own the pi_state then the futex is
2665 * bucket lock. 2807 * inconsistent and user space fiddled with the futex value.
2808 */
2809 if (pi_state->owner != current)
2810 goto out_unlock;
2811
2812 get_pi_state(pi_state);
2813 /*
2814 * By taking wait_lock while still holding hb->lock, we ensure
2815 * there is no point where we hold neither; and therefore
2816 * wake_futex_pi() must observe a state consistent with what we
2817 * observed.
2818 */
2819 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2820 spin_unlock(&hb->lock);
2821
2822 ret = wake_futex_pi(uaddr, uval, pi_state);
2823
2824 put_pi_state(pi_state);
2825
2826 /*
2827 * Success, we're done! No tricky corner cases.
2666 */ 2828 */
2667 if (!ret) 2829 if (!ret)
2668 goto out_putkey; 2830 goto out_putkey;
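futex_unlock_pi() now pins the top waiter's pi_state, takes wait_lock before dropping hb->lock so there is no window in which neither lock is held, and leaves the deboost to rt_mutex_postunlock() inside wake_futex_pi(). Reduced to its locking skeleton — a sketch only, with error handling omitted:

static void unlock_pi_skeleton(struct futex_hash_bucket *hb,
                               struct futex_pi_state *pi_state,
                               u32 __user *uaddr, u32 uval)
{
        get_pi_state(pi_state);

        /* take wait_lock while still holding hb->lock: no gap without a lock */
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(&hb->lock);

        /* wake_futex_pi() updates uval/owner and drops wait_lock itself */
        wake_futex_pi(uaddr, uval, pi_state);

        put_pi_state(pi_state);
}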
@@ -2677,7 +2839,6 @@ retry:
2677 * setting the FUTEX_WAITERS bit. Try again. 2839 * setting the FUTEX_WAITERS bit. Try again.
2678 */ 2840 */
2679 if (ret == -EAGAIN) { 2841 if (ret == -EAGAIN) {
2680 spin_unlock(&hb->lock);
2681 put_futex_key(&key); 2842 put_futex_key(&key);
2682 goto retry; 2843 goto retry;
2683 } 2844 }
@@ -2685,7 +2846,7 @@ retry:
2685 * wake_futex_pi has detected invalid state. Tell user 2846 * wake_futex_pi has detected invalid state. Tell user
2686 * space. 2847 * space.
2687 */ 2848 */
2688 goto out_unlock; 2849 goto out_putkey;
2689 } 2850 }
2690 2851
2691 /* 2852 /*
@@ -2695,8 +2856,10 @@ retry:
2695 * preserve the WAITERS bit not the OWNER_DIED one. We are the 2856 * preserve the WAITERS bit not the OWNER_DIED one. We are the
2696 * owner. 2857 * owner.
2697 */ 2858 */
2698 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) 2859 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
2860 spin_unlock(&hb->lock);
2699 goto pi_faulted; 2861 goto pi_faulted;
2862 }
2700 2863
2701 /* 2864 /*
2702 * If uval has changed, let user space handle it. 2865 * If uval has changed, let user space handle it.
@@ -2710,7 +2873,6 @@ out_putkey:
2710 return ret; 2873 return ret;
2711 2874
2712pi_faulted: 2875pi_faulted:
2713 spin_unlock(&hb->lock);
2714 put_futex_key(&key); 2876 put_futex_key(&key);
2715 2877
2716 ret = fault_in_user_writeable(uaddr); 2878 ret = fault_in_user_writeable(uaddr);
@@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2814 u32 __user *uaddr2) 2976 u32 __user *uaddr2)
2815{ 2977{
2816 struct hrtimer_sleeper timeout, *to = NULL; 2978 struct hrtimer_sleeper timeout, *to = NULL;
2979 struct futex_pi_state *pi_state = NULL;
2817 struct rt_mutex_waiter rt_waiter; 2980 struct rt_mutex_waiter rt_waiter;
2818 struct futex_hash_bucket *hb; 2981 struct futex_hash_bucket *hb;
2819 union futex_key key2 = FUTEX_KEY_INIT; 2982 union futex_key key2 = FUTEX_KEY_INIT;
@@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2840 * The waiter is allocated on our stack, manipulated by the requeue 3003 * The waiter is allocated on our stack, manipulated by the requeue
2841 * code while we sleep on uaddr. 3004 * code while we sleep on uaddr.
2842 */ 3005 */
2843 debug_rt_mutex_init_waiter(&rt_waiter); 3006 rt_mutex_init_waiter(&rt_waiter);
2844 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2845 RB_CLEAR_NODE(&rt_waiter.tree_entry);
2846 rt_waiter.task = NULL;
2847 3007
2848 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 3008 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2849 if (unlikely(ret != 0)) 3009 if (unlikely(ret != 0))
@@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2898 if (q.pi_state && (q.pi_state->owner != current)) { 3058 if (q.pi_state && (q.pi_state->owner != current)) {
2899 spin_lock(q.lock_ptr); 3059 spin_lock(q.lock_ptr);
2900 ret = fixup_pi_state_owner(uaddr2, &q, current); 3060 ret = fixup_pi_state_owner(uaddr2, &q, current);
2901 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) 3061 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
2902 rt_mutex_unlock(&q.pi_state->pi_mutex); 3062 pi_state = q.pi_state;
3063 get_pi_state(pi_state);
3064 }
2903 /* 3065 /*
2904 * Drop the reference to the pi state which 3066 * Drop the reference to the pi state which
2905 * the requeue_pi() code acquired for us. 3067 * the requeue_pi() code acquired for us.
@@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2917 */ 3079 */
2918 WARN_ON(!q.pi_state); 3080 WARN_ON(!q.pi_state);
2919 pi_mutex = &q.pi_state->pi_mutex; 3081 pi_mutex = &q.pi_state->pi_mutex;
2920 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); 3082 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
2921 debug_rt_mutex_free_waiter(&rt_waiter);
2922 3083
2923 spin_lock(q.lock_ptr); 3084 spin_lock(q.lock_ptr);
3085 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
3086 ret = 0;
3087
3088 debug_rt_mutex_free_waiter(&rt_waiter);
2924 /* 3089 /*
2925 * Fixup the pi_state owner and possibly acquire the lock if we 3090 * Fixup the pi_state owner and possibly acquire the lock if we
2926 * haven't already. 3091 * haven't already.
@@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2938 * the fault, unlock the rt_mutex and return the fault to 3103 * the fault, unlock the rt_mutex and return the fault to
2939 * userspace. 3104 * userspace.
2940 */ 3105 */
2941 if (ret && rt_mutex_owner(pi_mutex) == current) 3106 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
2942 rt_mutex_unlock(pi_mutex); 3107 pi_state = q.pi_state;
3108 get_pi_state(pi_state);
3109 }
2943 3110
2944 /* Unqueue and drop the lock. */ 3111 /* Unqueue and drop the lock. */
2945 unqueue_me_pi(&q); 3112 unqueue_me_pi(&q);
2946 } 3113 }
2947 3114
3115 if (pi_state) {
3116 rt_mutex_futex_unlock(&pi_state->pi_mutex);
3117 put_pi_state(pi_state);
3118 }
3119
2948 if (ret == -EINTR) { 3120 if (ret == -EINTR) {
2949 /* 3121 /*
2950 * We've already been requeued, but cannot restart by calling 3122 * We've already been requeued, but cannot restart by calling
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 2f9df37940a0..c51a49c9be70 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
98} 98}
99EXPORT_SYMBOL(__gcov_merge_icall_topn); 99EXPORT_SYMBOL(__gcov_merge_icall_topn);
100 100
101void __gcov_exit(void)
102{
103 /* Unused. */
104}
105EXPORT_SYMBOL(__gcov_exit);
106
101/** 107/**
102 * gcov_enable_events - enable event reporting through gcov_event() 108 * gcov_enable_events - enable event reporting through gcov_event()
103 * 109 *
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 6a5c239c7669..46a18e72bce6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include "gcov.h" 19#include "gcov.h"
20 20
21#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) 21#if (__GNUC__ >= 7)
22#define GCOV_COUNTERS 9
23#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
22#define GCOV_COUNTERS 10 24#define GCOV_COUNTERS 10
23#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 25#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
24#define GCOV_COUNTERS 9 26#define GCOV_COUNTERS 9
diff --git a/kernel/groups.c b/kernel/groups.c
index 8dd7a61b7115..d09727692a2a 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -18,7 +18,7 @@ struct group_info *groups_alloc(int gidsetsize)
18 len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; 18 len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
19 gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); 19 gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
20 if (!gi) 20 if (!gi)
21 gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); 21 gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL);
22 if (!gi) 22 if (!gi)
23 return NULL; 23 return NULL;
24 24
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f0f8e2a8496f..751593ed7c0b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
43int __read_mostly sysctl_hung_task_warnings = 10; 43int __read_mostly sysctl_hung_task_warnings = 10;
44 44
45static int __read_mostly did_panic; 45static int __read_mostly did_panic;
46static bool hung_task_show_lock;
46 47
47static struct task_struct *watchdog_task; 48static struct task_struct *watchdog_task;
48 49
@@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
120 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 121 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
121 " disables this message.\n"); 122 " disables this message.\n");
122 sched_show_task(t); 123 sched_show_task(t);
123 debug_show_all_locks(); 124 hung_task_show_lock = true;
124 } 125 }
125 126
126 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
127 128
128 if (sysctl_hung_task_panic) { 129 if (sysctl_hung_task_panic) {
130 if (hung_task_show_lock)
131 debug_show_all_locks();
129 trigger_all_cpu_backtrace(); 132 trigger_all_cpu_backtrace();
130 panic("hung_task: blocked tasks"); 133 panic("hung_task: blocked tasks");
131 } 134 }
@@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
172 if (test_taint(TAINT_DIE) || did_panic) 175 if (test_taint(TAINT_DIE) || did_panic)
173 return; 176 return;
174 177
178 hung_task_show_lock = false;
175 rcu_read_lock(); 179 rcu_read_lock();
176 for_each_process_thread(g, t) { 180 for_each_process_thread(g, t) {
177 if (!max_count--) 181 if (!max_count--)
@@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
187 } 191 }
188 unlock: 192 unlock:
189 rcu_read_unlock(); 193 rcu_read_unlock();
194 if (hung_task_show_lock)
195 debug_show_all_locks();
190} 196}
191 197
192static long hung_timeout_jiffies(unsigned long last_checked, 198static long hung_timeout_jiffies(unsigned long last_checked,
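
The hung_task change above stops calling debug_show_all_locks() from inside the RCU-protected task scan for every detected hung task; check_hung_task() now only sets hung_task_show_lock, and a single lock dump is emitted after rcu_read_unlock() (or just before panicking). A generic sketch of that "note it now, report it later" pattern, with made-up names (widget, dump_diagnostics):

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct widget {                         /* hypothetical example type */
        struct list_head node;
        bool stuck;
};

static LIST_HEAD(widget_list);
static bool need_report;

static void dump_diagnostics(void)      /* stand-in for debug_show_all_locks() */
{
}

static void scan_widgets(void)
{
        struct widget *w;

        need_report = false;

        rcu_read_lock();
        list_for_each_entry_rcu(w, &widget_list, node)
                if (w->stuck)
                        need_report = true;     /* only take a note here */
        rcu_read_unlock();

        if (need_report)
                dump_diagnostics();     /* heavy reporting outside the RCU section */
}
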
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index be3c34e4f2ac..c94da688ee9b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq)
348 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); 348 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
349 raw_spin_unlock_irq(&desc->lock); 349 raw_spin_unlock_irq(&desc->lock);
350 350
351 action_ret = action->thread_fn(action->irq, action->dev_id); 351 action_ret = IRQ_NONE;
352 for_each_action_of_desc(desc, action)
353 action_ret |= action->thread_fn(action->irq, action->dev_id);
354
352 if (!noirqdebug) 355 if (!noirqdebug)
353 note_interrupt(desc, action_ret); 356 note_interrupt(desc, action_ret);
354 357
@@ -877,8 +880,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
877 if (!desc) 880 if (!desc)
878 return; 881 return;
879 882
880 __irq_do_set_handler(desc, handle, 1, NULL);
881 desc->irq_common_data.handler_data = data; 883 desc->irq_common_data.handler_data = data;
884 __irq_do_set_handler(desc, handle, 1, NULL);
882 885
883 irq_put_desc_busunlock(desc, flags); 886 irq_put_desc_busunlock(desc, flags);
884} 887}
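
Two small fixes in chip.c: handle_nested_irq() now runs every irqaction of a (possibly shared) nested interrupt instead of only the first one, OR-ing the return values so note_interrupt() still sees the line as handled if any handler claimed it, and irq_set_chained_handler_and_data() stores the handler data before installing the handler, so the data is already in place when the new handler can first run. The accumulation works because irqreturn_t values are flag bits with IRQ_NONE == 0; a hypothetical sketch (real code uses each action's own dev_id):

#include <linux/interrupt.h>

static irqreturn_t example_run_actions(irq_handler_t *handlers, int n,
                                       int irq, void *dev_id)
{
        irqreturn_t ret = IRQ_NONE;
        int i;

        /* one IRQ_HANDLED from any handler is enough to mark the line handled */
        for (i = 0; i < n; i++)
                ret |= handlers[i](irq, dev_id);

        return ret;
}
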
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4afe5cc5af1..070be980c37a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -852,7 +852,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
852 * This code is triggered unconditionally. Check the affinity 852 * This code is triggered unconditionally. Check the affinity
853 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 853 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
854 */ 854 */
855 if (desc->irq_common_data.affinity) 855 if (cpumask_available(desc->irq_common_data.affinity))
856 cpumask_copy(mask, desc->irq_common_data.affinity); 856 cpumask_copy(mask, desc->irq_common_data.affinity);
857 else 857 else
858 valid = false; 858 valid = false;
@@ -1212,8 +1212,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1212 * set the trigger type must match. Also all must 1212 * set the trigger type must match. Also all must
1213 * agree on ONESHOT. 1213 * agree on ONESHOT.
1214 */ 1214 */
1215 unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
1216
1215 if (!((old->flags & new->flags) & IRQF_SHARED) || 1217 if (!((old->flags & new->flags) & IRQF_SHARED) ||
1216 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || 1218 (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
1217 ((old->flags ^ new->flags) & IRQF_ONESHOT)) 1219 ((old->flags ^ new->flags) & IRQF_ONESHOT))
1218 goto mismatch; 1220 goto mismatch;
1219 1221
@@ -1557,7 +1559,7 @@ void remove_irq(unsigned int irq, struct irqaction *act)
1557 struct irq_desc *desc = irq_to_desc(irq); 1559 struct irq_desc *desc = irq_to_desc(irq);
1558 1560
1559 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) 1561 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1560 __free_irq(irq, act->dev_id); 1562 __free_irq(irq, act->dev_id);
1561} 1563}
1562EXPORT_SYMBOL_GPL(remove_irq); 1564EXPORT_SYMBOL_GPL(remove_irq);
1563 1565
@@ -1574,20 +1576,27 @@ EXPORT_SYMBOL_GPL(remove_irq);
1574 * have completed. 1576 * have completed.
1575 * 1577 *
1576 * This function must not be called from interrupt context. 1578 * This function must not be called from interrupt context.
1579 *
1580 * Returns the devname argument passed to request_irq.
1577 */ 1581 */
1578void free_irq(unsigned int irq, void *dev_id) 1582const void *free_irq(unsigned int irq, void *dev_id)
1579{ 1583{
1580 struct irq_desc *desc = irq_to_desc(irq); 1584 struct irq_desc *desc = irq_to_desc(irq);
1585 struct irqaction *action;
1586 const char *devname;
1581 1587
1582 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) 1588 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1583 return; 1589 return NULL;
1584 1590
1585#ifdef CONFIG_SMP 1591#ifdef CONFIG_SMP
1586 if (WARN_ON(desc->affinity_notify)) 1592 if (WARN_ON(desc->affinity_notify))
1587 desc->affinity_notify = NULL; 1593 desc->affinity_notify = NULL;
1588#endif 1594#endif
1589 1595
1590 kfree(__free_irq(irq, dev_id)); 1596 action = __free_irq(irq, dev_id);
1597 devname = action->name;
1598 kfree(action);
1599 return devname;
1591} 1600}
1592EXPORT_SYMBOL(free_irq); 1601EXPORT_SYMBOL(free_irq);
1593 1602
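
The manage.c hunks do three things: the affinity test uses cpumask_available(), which does the right thing whether or not CPUMASK_OFFSTACK is enabled; shared-interrupt setup compares the new request's trigger flags against the trigger type actually programmed in irq_data (oldtype) rather than against the first requester's flag bits; and free_irq() now returns the devname string originally passed to request_irq(). A hypothetical caller of the new return value:

#include <linux/interrupt.h>
#include <linux/printk.h>

/* Hypothetical driver teardown making use of the new free_irq() return. */
static void example_shutdown(unsigned int irq, void *dev_id)
{
        const void *devname = free_irq(irq, dev_id);

        /*
         * devname is the string given to request_irq(), or NULL if the
         * descriptor was invalid; useful for logging or for re-requesting
         * the interrupt under the same name.
         */
        pr_info("released irq %u (%s)\n", irq,
                devname ? (const char *)devname : "?");
}
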
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 85e5546cd791..cd771993f96f 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void)
60 /* 60 /*
61 * We are interested in code coverage as a function of a syscall inputs, 61 * We are interested in code coverage as a function of a syscall inputs,
62 * so we ignore code executed in interrupts. 62 * so we ignore code executed in interrupts.
63 * The checks for whether we are in an interrupt are open-coded, because
64 * 1. We can't use in_interrupt() here, since it also returns true
65 * when we are inside local_bh_disable() section.
66 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
67 * since that leads to slower generated code (three separate tests,
68 * one for each of the flags).
69 */ 63 */
70 if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET 64 if (!t || !in_task())
71 | NMI_MASK)))
72 return; 65 return;
73 mode = READ_ONCE(t->kcov_mode); 66 mode = READ_ONCE(t->kcov_mode);
74 if (mode == KCOV_MODE_TRACE) { 67 if (mode == KCOV_MODE_TRACE) {
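
The kcov change swaps the open-coded preempt_count() test for in_task(). The removed comment explained the constraints: in_interrupt() would be too strict because it is also true inside local_bh_disable() sections, and testing in_irq(), in_serving_softirq() and in_nmi() separately generates slower code. in_task() tests the same HARDIRQ_MASK | SOFTIRQ_OFFSET | NMI_MASK bits in one mask, so the behaviour should be unchanged while the helper name documents the intent. A hypothetical hook applying the same filter:

#include <linux/preempt.h>
#include <linux/sched.h>

static void example_trace_hook(void)
{
        struct task_struct *t = current;

        if (!t || !in_task())
                return;         /* skip hard IRQ, softirq and NMI context */

        /* ... record the event against t here ... */
}
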
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bfe62d5b3872..ae1a3ba24df5 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex);
51/* Per cpu memory for storing cpu states in case of system crash. */ 51/* Per cpu memory for storing cpu states in case of system crash. */
52note_buf_t __percpu *crash_notes; 52note_buf_t __percpu *crash_notes;
53 53
54/* vmcoreinfo stuff */
55static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
56u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
57size_t vmcoreinfo_size;
58size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
59
60/* Flag to indicate we are going to kexec a new kernel */ 54/* Flag to indicate we are going to kexec a new kernel */
61bool kexec_in_progress = false; 55bool kexec_in_progress = false;
62 56
@@ -996,34 +990,6 @@ unlock:
996 return ret; 990 return ret;
997} 991}
998 992
999static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1000 size_t data_len)
1001{
1002 struct elf_note note;
1003
1004 note.n_namesz = strlen(name) + 1;
1005 note.n_descsz = data_len;
1006 note.n_type = type;
1007 memcpy(buf, &note, sizeof(note));
1008 buf += (sizeof(note) + 3)/4;
1009 memcpy(buf, name, note.n_namesz);
1010 buf += (note.n_namesz + 3)/4;
1011 memcpy(buf, data, note.n_descsz);
1012 buf += (note.n_descsz + 3)/4;
1013
1014 return buf;
1015}
1016
1017static void final_note(u32 *buf)
1018{
1019 struct elf_note note;
1020
1021 note.n_namesz = 0;
1022 note.n_descsz = 0;
1023 note.n_type = 0;
1024 memcpy(buf, &note, sizeof(note));
1025}
1026
1027void crash_save_cpu(struct pt_regs *regs, int cpu) 993void crash_save_cpu(struct pt_regs *regs, int cpu)
1028{ 994{
1029 struct elf_prstatus prstatus; 995 struct elf_prstatus prstatus;
@@ -1085,403 +1051,6 @@ subsys_initcall(crash_notes_memory_init);
1085 1051
1086 1052
1087/* 1053/*
1088 * parsing the "crashkernel" commandline
1089 *
1090 * this code is intended to be called from architecture specific code
1091 */
1092
1093
1094/*
1095 * This function parses command lines in the format
1096 *
1097 * crashkernel=ramsize-range:size[,...][@offset]
1098 *
1099 * The function returns 0 on success and -EINVAL on failure.
1100 */
1101static int __init parse_crashkernel_mem(char *cmdline,
1102 unsigned long long system_ram,
1103 unsigned long long *crash_size,
1104 unsigned long long *crash_base)
1105{
1106 char *cur = cmdline, *tmp;
1107
1108 /* for each entry of the comma-separated list */
1109 do {
1110 unsigned long long start, end = ULLONG_MAX, size;
1111
1112 /* get the start of the range */
1113 start = memparse(cur, &tmp);
1114 if (cur == tmp) {
1115 pr_warn("crashkernel: Memory value expected\n");
1116 return -EINVAL;
1117 }
1118 cur = tmp;
1119 if (*cur != '-') {
1120 pr_warn("crashkernel: '-' expected\n");
1121 return -EINVAL;
1122 }
1123 cur++;
1124
1125 /* if no ':' is here, than we read the end */
1126 if (*cur != ':') {
1127 end = memparse(cur, &tmp);
1128 if (cur == tmp) {
1129 pr_warn("crashkernel: Memory value expected\n");
1130 return -EINVAL;
1131 }
1132 cur = tmp;
1133 if (end <= start) {
1134 pr_warn("crashkernel: end <= start\n");
1135 return -EINVAL;
1136 }
1137 }
1138
1139 if (*cur != ':') {
1140 pr_warn("crashkernel: ':' expected\n");
1141 return -EINVAL;
1142 }
1143 cur++;
1144
1145 size = memparse(cur, &tmp);
1146 if (cur == tmp) {
1147 pr_warn("Memory value expected\n");
1148 return -EINVAL;
1149 }
1150 cur = tmp;
1151 if (size >= system_ram) {
1152 pr_warn("crashkernel: invalid size\n");
1153 return -EINVAL;
1154 }
1155
1156 /* match ? */
1157 if (system_ram >= start && system_ram < end) {
1158 *crash_size = size;
1159 break;
1160 }
1161 } while (*cur++ == ',');
1162
1163 if (*crash_size > 0) {
1164 while (*cur && *cur != ' ' && *cur != '@')
1165 cur++;
1166 if (*cur == '@') {
1167 cur++;
1168 *crash_base = memparse(cur, &tmp);
1169 if (cur == tmp) {
1170 pr_warn("Memory value expected after '@'\n");
1171 return -EINVAL;
1172 }
1173 }
1174 }
1175
1176 return 0;
1177}
1178
1179/*
1180 * That function parses "simple" (old) crashkernel command lines like
1181 *
1182 * crashkernel=size[@offset]
1183 *
1184 * It returns 0 on success and -EINVAL on failure.
1185 */
1186static int __init parse_crashkernel_simple(char *cmdline,
1187 unsigned long long *crash_size,
1188 unsigned long long *crash_base)
1189{
1190 char *cur = cmdline;
1191
1192 *crash_size = memparse(cmdline, &cur);
1193 if (cmdline == cur) {
1194 pr_warn("crashkernel: memory value expected\n");
1195 return -EINVAL;
1196 }
1197
1198 if (*cur == '@')
1199 *crash_base = memparse(cur+1, &cur);
1200 else if (*cur != ' ' && *cur != '\0') {
1201 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1202 return -EINVAL;
1203 }
1204
1205 return 0;
1206}
1207
1208#define SUFFIX_HIGH 0
1209#define SUFFIX_LOW 1
1210#define SUFFIX_NULL 2
1211static __initdata char *suffix_tbl[] = {
1212 [SUFFIX_HIGH] = ",high",
1213 [SUFFIX_LOW] = ",low",
1214 [SUFFIX_NULL] = NULL,
1215};
1216
1217/*
1218 * That function parses "suffix" crashkernel command lines like
1219 *
1220 * crashkernel=size,[high|low]
1221 *
1222 * It returns 0 on success and -EINVAL on failure.
1223 */
1224static int __init parse_crashkernel_suffix(char *cmdline,
1225 unsigned long long *crash_size,
1226 const char *suffix)
1227{
1228 char *cur = cmdline;
1229
1230 *crash_size = memparse(cmdline, &cur);
1231 if (cmdline == cur) {
1232 pr_warn("crashkernel: memory value expected\n");
1233 return -EINVAL;
1234 }
1235
1236 /* check with suffix */
1237 if (strncmp(cur, suffix, strlen(suffix))) {
1238 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1239 return -EINVAL;
1240 }
1241 cur += strlen(suffix);
1242 if (*cur != ' ' && *cur != '\0') {
1243 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1244 return -EINVAL;
1245 }
1246
1247 return 0;
1248}
1249
1250static __init char *get_last_crashkernel(char *cmdline,
1251 const char *name,
1252 const char *suffix)
1253{
1254 char *p = cmdline, *ck_cmdline = NULL;
1255
1256 /* find crashkernel and use the last one if there are more */
1257 p = strstr(p, name);
1258 while (p) {
1259 char *end_p = strchr(p, ' ');
1260 char *q;
1261
1262 if (!end_p)
1263 end_p = p + strlen(p);
1264
1265 if (!suffix) {
1266 int i;
1267
1268 /* skip the one with any known suffix */
1269 for (i = 0; suffix_tbl[i]; i++) {
1270 q = end_p - strlen(suffix_tbl[i]);
1271 if (!strncmp(q, suffix_tbl[i],
1272 strlen(suffix_tbl[i])))
1273 goto next;
1274 }
1275 ck_cmdline = p;
1276 } else {
1277 q = end_p - strlen(suffix);
1278 if (!strncmp(q, suffix, strlen(suffix)))
1279 ck_cmdline = p;
1280 }
1281next:
1282 p = strstr(p+1, name);
1283 }
1284
1285 if (!ck_cmdline)
1286 return NULL;
1287
1288 return ck_cmdline;
1289}
1290
1291static int __init __parse_crashkernel(char *cmdline,
1292 unsigned long long system_ram,
1293 unsigned long long *crash_size,
1294 unsigned long long *crash_base,
1295 const char *name,
1296 const char *suffix)
1297{
1298 char *first_colon, *first_space;
1299 char *ck_cmdline;
1300
1301 BUG_ON(!crash_size || !crash_base);
1302 *crash_size = 0;
1303 *crash_base = 0;
1304
1305 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1306
1307 if (!ck_cmdline)
1308 return -EINVAL;
1309
1310 ck_cmdline += strlen(name);
1311
1312 if (suffix)
1313 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1314 suffix);
1315 /*
1316 * if the commandline contains a ':', then that's the extended
1317 * syntax -- if not, it must be the classic syntax
1318 */
1319 first_colon = strchr(ck_cmdline, ':');
1320 first_space = strchr(ck_cmdline, ' ');
1321 if (first_colon && (!first_space || first_colon < first_space))
1322 return parse_crashkernel_mem(ck_cmdline, system_ram,
1323 crash_size, crash_base);
1324
1325 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1326}
1327
1328/*
1329 * That function is the entry point for command line parsing and should be
1330 * called from the arch-specific code.
1331 */
1332int __init parse_crashkernel(char *cmdline,
1333 unsigned long long system_ram,
1334 unsigned long long *crash_size,
1335 unsigned long long *crash_base)
1336{
1337 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1338 "crashkernel=", NULL);
1339}
1340
1341int __init parse_crashkernel_high(char *cmdline,
1342 unsigned long long system_ram,
1343 unsigned long long *crash_size,
1344 unsigned long long *crash_base)
1345{
1346 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1347 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1348}
1349
1350int __init parse_crashkernel_low(char *cmdline,
1351 unsigned long long system_ram,
1352 unsigned long long *crash_size,
1353 unsigned long long *crash_base)
1354{
1355 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1356 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1357}
1358
1359static void update_vmcoreinfo_note(void)
1360{
1361 u32 *buf = vmcoreinfo_note;
1362
1363 if (!vmcoreinfo_size)
1364 return;
1365 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1366 vmcoreinfo_size);
1367 final_note(buf);
1368}
1369
1370void crash_save_vmcoreinfo(void)
1371{
1372 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1373 update_vmcoreinfo_note();
1374}
1375
1376void vmcoreinfo_append_str(const char *fmt, ...)
1377{
1378 va_list args;
1379 char buf[0x50];
1380 size_t r;
1381
1382 va_start(args, fmt);
1383 r = vscnprintf(buf, sizeof(buf), fmt, args);
1384 va_end(args);
1385
1386 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1387
1388 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1389
1390 vmcoreinfo_size += r;
1391}
1392
1393/*
1394 * provide an empty default implementation here -- architecture
1395 * code may override this
1396 */
1397void __weak arch_crash_save_vmcoreinfo(void)
1398{}
1399
1400phys_addr_t __weak paddr_vmcoreinfo_note(void)
1401{
1402 return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
1403}
1404
1405static int __init crash_save_vmcoreinfo_init(void)
1406{
1407 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1408 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1409
1410 VMCOREINFO_SYMBOL(init_uts_ns);
1411 VMCOREINFO_SYMBOL(node_online_map);
1412#ifdef CONFIG_MMU
1413 VMCOREINFO_SYMBOL(swapper_pg_dir);
1414#endif
1415 VMCOREINFO_SYMBOL(_stext);
1416 VMCOREINFO_SYMBOL(vmap_area_list);
1417
1418#ifndef CONFIG_NEED_MULTIPLE_NODES
1419 VMCOREINFO_SYMBOL(mem_map);
1420 VMCOREINFO_SYMBOL(contig_page_data);
1421#endif
1422#ifdef CONFIG_SPARSEMEM
1423 VMCOREINFO_SYMBOL(mem_section);
1424 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1425 VMCOREINFO_STRUCT_SIZE(mem_section);
1426 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1427#endif
1428 VMCOREINFO_STRUCT_SIZE(page);
1429 VMCOREINFO_STRUCT_SIZE(pglist_data);
1430 VMCOREINFO_STRUCT_SIZE(zone);
1431 VMCOREINFO_STRUCT_SIZE(free_area);
1432 VMCOREINFO_STRUCT_SIZE(list_head);
1433 VMCOREINFO_SIZE(nodemask_t);
1434 VMCOREINFO_OFFSET(page, flags);
1435 VMCOREINFO_OFFSET(page, _refcount);
1436 VMCOREINFO_OFFSET(page, mapping);
1437 VMCOREINFO_OFFSET(page, lru);
1438 VMCOREINFO_OFFSET(page, _mapcount);
1439 VMCOREINFO_OFFSET(page, private);
1440 VMCOREINFO_OFFSET(page, compound_dtor);
1441 VMCOREINFO_OFFSET(page, compound_order);
1442 VMCOREINFO_OFFSET(page, compound_head);
1443 VMCOREINFO_OFFSET(pglist_data, node_zones);
1444 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1445#ifdef CONFIG_FLAT_NODE_MEM_MAP
1446 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1447#endif
1448 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1449 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1450 VMCOREINFO_OFFSET(pglist_data, node_id);
1451 VMCOREINFO_OFFSET(zone, free_area);
1452 VMCOREINFO_OFFSET(zone, vm_stat);
1453 VMCOREINFO_OFFSET(zone, spanned_pages);
1454 VMCOREINFO_OFFSET(free_area, free_list);
1455 VMCOREINFO_OFFSET(list_head, next);
1456 VMCOREINFO_OFFSET(list_head, prev);
1457 VMCOREINFO_OFFSET(vmap_area, va_start);
1458 VMCOREINFO_OFFSET(vmap_area, list);
1459 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1460 log_buf_kexec_setup();
1461 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1462 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1463 VMCOREINFO_NUMBER(PG_lru);
1464 VMCOREINFO_NUMBER(PG_private);
1465 VMCOREINFO_NUMBER(PG_swapcache);
1466 VMCOREINFO_NUMBER(PG_slab);
1467#ifdef CONFIG_MEMORY_FAILURE
1468 VMCOREINFO_NUMBER(PG_hwpoison);
1469#endif
1470 VMCOREINFO_NUMBER(PG_head_mask);
1471 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1472#ifdef CONFIG_HUGETLB_PAGE
1473 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
1474#endif
1475
1476 arch_crash_save_vmcoreinfo();
1477 update_vmcoreinfo_note();
1478
1479 return 0;
1480}
1481
1482subsys_initcall(crash_save_vmcoreinfo_init);
1483
1484/*
1485 * Move into place and start executing a preloaded standalone 1054 * Move into place and start executing a preloaded standalone
1486 * executable. If nothing was preloaded return an error. 1055 * executable. If nothing was preloaded return an error.
1487 */ 1056 */
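
This hunk removes the ELF-note helpers, the crashkernel= command-line parsers and the vmcoreinfo machinery from kexec_core.c; the ksysfs.c hunk below correspondingly gates the vmcoreinfo attribute on CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE. The grammar the removed parsers implement stays the same and is easiest to see by example:

        crashkernel=256M                        simple form: just a size
                                                (parse_crashkernel_simple)
        crashkernel=256M@16M                    simple form with a fixed base offset
        crashkernel=512M-2G:64M,2G-:128M@16M    ranged form: the range containing the
                                                system RAM size selects the size
                                                (parse_crashkernel_mem)
        crashkernel=256M,high                   suffix forms: only a size is parsed,
        crashkernel=128M,low                    no @offset (parse_crashkernel_suffix)
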
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 699c5bc51a92..2d2d3a568e4e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -58,15 +58,6 @@
58#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 58#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
59 59
60 60
61/*
62 * Some oddball architectures like 64bit powerpc have function descriptors
63 * so this must be overridable.
64 */
65#ifndef kprobe_lookup_name
66#define kprobe_lookup_name(name, addr) \
67 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
68#endif
69
70static int kprobes_initialized; 61static int kprobes_initialized;
71static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 62static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
72static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 63static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -81,6 +72,12 @@ static struct {
81 raw_spinlock_t lock ____cacheline_aligned_in_smp; 72 raw_spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 73} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 74
75kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
76 unsigned int __unused)
77{
78 return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
79}
80
84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 81static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 82{
86 return &(kretprobe_table_locks[hash].lock); 83 return &(kretprobe_table_locks[hash].lock);
@@ -598,7 +595,7 @@ static void kprobe_optimizer(struct work_struct *work)
598} 595}
599 596
600/* Wait for completing optimization and unoptimization */ 597/* Wait for completing optimization and unoptimization */
601static void wait_for_kprobe_optimizer(void) 598void wait_for_kprobe_optimizer(void)
602{ 599{
603 mutex_lock(&kprobe_mutex); 600 mutex_lock(&kprobe_mutex);
604 601
@@ -746,13 +743,20 @@ static void kill_optimized_kprobe(struct kprobe *p)
746 arch_remove_optimized_kprobe(op); 743 arch_remove_optimized_kprobe(op);
747} 744}
748 745
746static inline
747void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
748{
749 if (!kprobe_ftrace(p))
750 arch_prepare_optimized_kprobe(op, p);
751}
752
749/* Try to prepare optimized instructions */ 753/* Try to prepare optimized instructions */
750static void prepare_optimized_kprobe(struct kprobe *p) 754static void prepare_optimized_kprobe(struct kprobe *p)
751{ 755{
752 struct optimized_kprobe *op; 756 struct optimized_kprobe *op;
753 757
754 op = container_of(p, struct optimized_kprobe, kp); 758 op = container_of(p, struct optimized_kprobe, kp);
755 arch_prepare_optimized_kprobe(op, p); 759 __prepare_optimized_kprobe(op, p);
756} 760}
757 761
758/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 762/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -766,7 +770,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
766 770
767 INIT_LIST_HEAD(&op->list); 771 INIT_LIST_HEAD(&op->list);
768 op->kp.addr = p->addr; 772 op->kp.addr = p->addr;
769 arch_prepare_optimized_kprobe(op, p); 773 __prepare_optimized_kprobe(op, p);
770 774
771 return &op->kp; 775 return &op->kp;
772} 776}
@@ -1391,21 +1395,19 @@ bool within_kprobe_blacklist(unsigned long addr)
1391 * This returns encoded errors if it fails to look up symbol or invalid 1395 * This returns encoded errors if it fails to look up symbol or invalid
1392 * combination of parameters. 1396 * combination of parameters.
1393 */ 1397 */
1394static kprobe_opcode_t *kprobe_addr(struct kprobe *p) 1398static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
1399 const char *symbol_name, unsigned int offset)
1395{ 1400{
1396 kprobe_opcode_t *addr = p->addr; 1401 if ((symbol_name && addr) || (!symbol_name && !addr))
1397
1398 if ((p->symbol_name && p->addr) ||
1399 (!p->symbol_name && !p->addr))
1400 goto invalid; 1402 goto invalid;
1401 1403
1402 if (p->symbol_name) { 1404 if (symbol_name) {
1403 kprobe_lookup_name(p->symbol_name, addr); 1405 addr = kprobe_lookup_name(symbol_name, offset);
1404 if (!addr) 1406 if (!addr)
1405 return ERR_PTR(-ENOENT); 1407 return ERR_PTR(-ENOENT);
1406 } 1408 }
1407 1409
1408 addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); 1410 addr = (kprobe_opcode_t *)(((char *)addr) + offset);
1409 if (addr) 1411 if (addr)
1410 return addr; 1412 return addr;
1411 1413
@@ -1413,6 +1415,11 @@ invalid:
1413 return ERR_PTR(-EINVAL); 1415 return ERR_PTR(-EINVAL);
1414} 1416}
1415 1417
1418static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
1419{
1420 return _kprobe_addr(p->addr, p->symbol_name, p->offset);
1421}
1422
1416/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1423/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1417static struct kprobe *__get_valid_kprobe(struct kprobe *p) 1424static struct kprobe *__get_valid_kprobe(struct kprobe *p)
1418{ 1425{
@@ -1740,11 +1747,12 @@ void unregister_kprobes(struct kprobe **kps, int num)
1740} 1747}
1741EXPORT_SYMBOL_GPL(unregister_kprobes); 1748EXPORT_SYMBOL_GPL(unregister_kprobes);
1742 1749
1743int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self, 1750int __weak kprobe_exceptions_notify(struct notifier_block *self,
1744 unsigned long val, void *data) 1751 unsigned long val, void *data)
1745{ 1752{
1746 return NOTIFY_DONE; 1753 return NOTIFY_DONE;
1747} 1754}
1755NOKPROBE_SYMBOL(kprobe_exceptions_notify);
1748 1756
1749static struct notifier_block kprobe_exceptions_nb = { 1757static struct notifier_block kprobe_exceptions_nb = {
1750 .notifier_call = kprobe_exceptions_notify, 1758 .notifier_call = kprobe_exceptions_notify,
@@ -1875,6 +1883,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1875} 1883}
1876NOKPROBE_SYMBOL(pre_handler_kretprobe); 1884NOKPROBE_SYMBOL(pre_handler_kretprobe);
1877 1885
1886bool __weak arch_function_offset_within_entry(unsigned long offset)
1887{
1888 return !offset;
1889}
1890
1891bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
1892{
1893 kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
1894
1895 if (IS_ERR(kp_addr))
1896 return false;
1897
1898 if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
1899 !arch_function_offset_within_entry(offset))
1900 return false;
1901
1902 return true;
1903}
1904
1878int register_kretprobe(struct kretprobe *rp) 1905int register_kretprobe(struct kretprobe *rp)
1879{ 1906{
1880 int ret = 0; 1907 int ret = 0;
@@ -1882,6 +1909,9 @@ int register_kretprobe(struct kretprobe *rp)
1882 int i; 1909 int i;
1883 void *addr; 1910 void *addr;
1884 1911
1912 if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
1913 return -EINVAL;
1914
1885 if (kretprobe_blacklist_size) { 1915 if (kretprobe_blacklist_size) {
1886 addr = kprobe_addr(&rp->kp); 1916 addr = kprobe_addr(&rp->kp);
1887 if (IS_ERR(addr)) 1917 if (IS_ERR(addr))
@@ -2153,6 +2183,12 @@ static int kprobes_module_callback(struct notifier_block *nb,
2153 * The vaddr this probe is installed will soon 2183 * The vaddr this probe is installed will soon
2154 * be vfreed buy not synced to disk. Hence, 2184 * be vfreed buy not synced to disk. Hence,
2155 * disarming the breakpoint isn't needed. 2185 * disarming the breakpoint isn't needed.
2186 *
2187 * Note, this will also move any optimized probes
2188 * that are pending to be removed from their
2189 * corresponding lists to the freeing_list and
2190 * will not be touched by the delayed
2191 * kprobe_optimizer work handler.
2156 */ 2192 */
2157 kill_kprobe(p); 2193 kill_kprobe(p);
2158 } 2194 }
@@ -2192,8 +2228,8 @@ static int __init init_kprobes(void)
2192 if (kretprobe_blacklist_size) { 2228 if (kretprobe_blacklist_size) {
2193 /* lookup the function address from its name */ 2229 /* lookup the function address from its name */
2194 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 2230 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
2195 kprobe_lookup_name(kretprobe_blacklist[i].name, 2231 kretprobe_blacklist[i].addr =
2196 kretprobe_blacklist[i].addr); 2232 kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
2197 if (!kretprobe_blacklist[i].addr) 2233 if (!kretprobe_blacklist[i].addr)
2198 printk("kretprobe: lookup failed: %s\n", 2234 printk("kretprobe: lookup failed: %s\n",
2199 kretprobe_blacklist[i].name); 2235 kretprobe_blacklist[i].name);
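
The kprobes changes turn kprobe_lookup_name() into an ordinary weak function (the removed comment notes that 64-bit powerpc needs to override it because of function descriptors), factor kprobe_addr() into _kprobe_addr(), make wait_for_kprobe_optimizer() non-static, skip arch_prepare_optimized_kprobe() for ftrace-based probes, and make register_kretprobe() return -EINVAL when the requested symbol+offset does not land on a function entry, as checked by function_offset_within_entry(). A conventional kretprobe module is unaffected because a bare symbol name resolves to offset 0; a hypothetical example (the probed symbol _do_fork is just an illustration):

#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/ptrace.h>

static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
        pr_info("_do_fork returned %ld\n", (long)regs_return_value(regs));
        return 0;
}

static struct kretprobe my_kretprobe = {
        .kp.symbol_name = "_do_fork",   /* function entry, offset 0 */
        .handler        = ret_handler,
        .maxactive      = 20,
};

static int __init example_init(void)
{
        /* now fails with -EINVAL if this does not point at a function entry */
        return register_kretprobe(&my_kretprobe);
}

static void __exit example_exit(void)
{
        unregister_kretprobe(&my_kretprobe);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
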
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0999679d6f26..23cd70651238 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
125} 125}
126KERNEL_ATTR_RW(kexec_crash_size); 126KERNEL_ATTR_RW(kexec_crash_size);
127 127
128#endif /* CONFIG_KEXEC_CORE */
129
130#ifdef CONFIG_CRASH_CORE
131
128static ssize_t vmcoreinfo_show(struct kobject *kobj, 132static ssize_t vmcoreinfo_show(struct kobject *kobj,
129 struct kobj_attribute *attr, char *buf) 133 struct kobj_attribute *attr, char *buf)
130{ 134{
@@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
134} 138}
135KERNEL_ATTR_RO(vmcoreinfo); 139KERNEL_ATTR_RO(vmcoreinfo);
136 140
137#endif /* CONFIG_KEXEC_CORE */ 141#endif /* CONFIG_CRASH_CORE */
138 142
139/* whether file capabilities are enabled */ 143/* whether file capabilities are enabled */
140static ssize_t fscaps_show(struct kobject *kobj, 144static ssize_t fscaps_show(struct kobject *kobj,
@@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = {
219 &kexec_loaded_attr.attr, 223 &kexec_loaded_attr.attr,
220 &kexec_crash_loaded_attr.attr, 224 &kexec_crash_loaded_attr.attr,
221 &kexec_crash_size_attr.attr, 225 &kexec_crash_size_attr.attr,
226#endif
227#ifdef CONFIG_CRASH_CORE
222 &vmcoreinfo_attr.attr, 228 &vmcoreinfo_attr.attr,
223#endif 229#endif
224#ifndef CONFIG_TINY_RCU 230#ifndef CONFIG_TINY_RCU
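
For ksysfs.c the functional point is simply that /sys/kernel/vmcoreinfo is now built when CONFIG_CRASH_CORE is set rather than CONFIG_KEXEC_CORE; the attribute itself is unchanged. A trivial userspace check (plain C, purely illustrative) that the file is still present and readable:

#include <stdio.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/sys/kernel/vmcoreinfo", "r");

        if (!f) {
                perror("/sys/kernel/vmcoreinfo");
                return 1;
        }
        if (fgets(line, sizeof(line), f))
                printf("vmcoreinfo note: %s", line);
        fclose(f);
        return 0;
}
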
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index e8780c0901d9..2b8bdb1925da 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
1obj-$(CONFIG_LIVEPATCH) += livepatch.o 1obj-$(CONFIG_LIVEPATCH) += livepatch.o
2 2
3livepatch-objs := core.o 3livepatch-objs := core.o patch.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index af4643873e71..b9628e43c78f 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -24,61 +24,31 @@
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/ftrace.h>
28#include <linux/list.h> 27#include <linux/list.h>
29#include <linux/kallsyms.h> 28#include <linux/kallsyms.h>
30#include <linux/livepatch.h> 29#include <linux/livepatch.h>
31#include <linux/elf.h> 30#include <linux/elf.h>
32#include <linux/moduleloader.h> 31#include <linux/moduleloader.h>
32#include <linux/completion.h>
33#include <asm/cacheflush.h> 33#include <asm/cacheflush.h>
34 34#include "core.h"
35/** 35#include "patch.h"
36 * struct klp_ops - structure for tracking registered ftrace ops structs 36#include "transition.h"
37 *
38 * A single ftrace_ops is shared between all enabled replacement functions
39 * (klp_func structs) which have the same old_addr. This allows the switch
40 * between function versions to happen instantaneously by updating the klp_ops
41 * struct's func_stack list. The winner is the klp_func at the top of the
42 * func_stack (front of the list).
43 *
44 * @node: node for the global klp_ops list
45 * @func_stack: list head for the stack of klp_func's (active func is on top)
46 * @fops: registered ftrace ops struct
47 */
48struct klp_ops {
49 struct list_head node;
50 struct list_head func_stack;
51 struct ftrace_ops fops;
52};
53 37
54/* 38/*
55 * The klp_mutex protects the global lists and state transitions of any 39 * klp_mutex is a coarse lock which serializes access to klp data. All
56 * structure reachable from them. References to any structure must be obtained 40 * accesses to klp-related variables and structures must have mutex protection,
57 * under mutex protection (except in klp_ftrace_handler(), which uses RCU to 41 * except within the following functions which carefully avoid the need for it:
58 * ensure it gets consistent data). 42 *
43 * - klp_ftrace_handler()
44 * - klp_update_patch_state()
59 */ 45 */
60static DEFINE_MUTEX(klp_mutex); 46DEFINE_MUTEX(klp_mutex);
61 47
62static LIST_HEAD(klp_patches); 48static LIST_HEAD(klp_patches);
63static LIST_HEAD(klp_ops);
64 49
65static struct kobject *klp_root_kobj; 50static struct kobject *klp_root_kobj;
66 51
67static struct klp_ops *klp_find_ops(unsigned long old_addr)
68{
69 struct klp_ops *ops;
70 struct klp_func *func;
71
72 list_for_each_entry(ops, &klp_ops, node) {
73 func = list_first_entry(&ops->func_stack, struct klp_func,
74 stack_node);
75 if (func->old_addr == old_addr)
76 return ops;
77 }
78
79 return NULL;
80}
81
82static bool klp_is_module(struct klp_object *obj) 52static bool klp_is_module(struct klp_object *obj)
83{ 53{
84 return obj->name; 54 return obj->name;
@@ -117,7 +87,6 @@ static void klp_find_object_module(struct klp_object *obj)
117 mutex_unlock(&module_mutex); 87 mutex_unlock(&module_mutex);
118} 88}
119 89
120/* klp_mutex must be held by caller */
121static bool klp_is_patch_registered(struct klp_patch *patch) 90static bool klp_is_patch_registered(struct klp_patch *patch)
122{ 91{
123 struct klp_patch *mypatch; 92 struct klp_patch *mypatch;
@@ -182,7 +151,10 @@ static int klp_find_object_symbol(const char *objname, const char *name,
182 }; 151 };
183 152
184 mutex_lock(&module_mutex); 153 mutex_lock(&module_mutex);
185 kallsyms_on_each_symbol(klp_find_callback, &args); 154 if (objname)
155 module_kallsyms_on_each_symbol(klp_find_callback, &args);
156 else
157 kallsyms_on_each_symbol(klp_find_callback, &args);
186 mutex_unlock(&module_mutex); 158 mutex_unlock(&module_mutex);
187 159
188 /* 160 /*
@@ -233,7 +205,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
233 for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { 205 for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
234 sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); 206 sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info);
235 if (sym->st_shndx != SHN_LIVEPATCH) { 207 if (sym->st_shndx != SHN_LIVEPATCH) {
236 pr_err("symbol %s is not marked as a livepatch symbol", 208 pr_err("symbol %s is not marked as a livepatch symbol\n",
237 strtab + sym->st_name); 209 strtab + sym->st_name);
238 return -EINVAL; 210 return -EINVAL;
239 } 211 }
@@ -243,7 +215,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
243 ".klp.sym.%55[^.].%127[^,],%lu", 215 ".klp.sym.%55[^.].%127[^,],%lu",
244 objname, symname, &sympos); 216 objname, symname, &sympos);
245 if (cnt != 3) { 217 if (cnt != 3) {
246 pr_err("symbol %s has an incorrectly formatted name", 218 pr_err("symbol %s has an incorrectly formatted name\n",
247 strtab + sym->st_name); 219 strtab + sym->st_name);
248 return -EINVAL; 220 return -EINVAL;
249 } 221 }
@@ -288,7 +260,7 @@ static int klp_write_object_relocations(struct module *pmod,
288 */ 260 */
289 cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); 261 cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname);
290 if (cnt != 1) { 262 if (cnt != 1) {
291 pr_err("section %s has an incorrectly formatted name", 263 pr_err("section %s has an incorrectly formatted name\n",
292 secname); 264 secname);
293 ret = -EINVAL; 265 ret = -EINVAL;
294 break; 266 break;
@@ -311,191 +283,30 @@ static int klp_write_object_relocations(struct module *pmod,
311 return ret; 283 return ret;
312} 284}
313 285
314static void notrace klp_ftrace_handler(unsigned long ip,
315 unsigned long parent_ip,
316 struct ftrace_ops *fops,
317 struct pt_regs *regs)
318{
319 struct klp_ops *ops;
320 struct klp_func *func;
321
322 ops = container_of(fops, struct klp_ops, fops);
323
324 rcu_read_lock();
325 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
326 stack_node);
327 if (WARN_ON_ONCE(!func))
328 goto unlock;
329
330 klp_arch_set_pc(regs, (unsigned long)func->new_func);
331unlock:
332 rcu_read_unlock();
333}
334
335/*
336 * Convert a function address into the appropriate ftrace location.
337 *
338 * Usually this is just the address of the function, but on some architectures
339 * it's more complicated so allow them to provide a custom behaviour.
340 */
341#ifndef klp_get_ftrace_location
342static unsigned long klp_get_ftrace_location(unsigned long faddr)
343{
344 return faddr;
345}
346#endif
347
348static void klp_disable_func(struct klp_func *func)
349{
350 struct klp_ops *ops;
351
352 if (WARN_ON(func->state != KLP_ENABLED))
353 return;
354 if (WARN_ON(!func->old_addr))
355 return;
356
357 ops = klp_find_ops(func->old_addr);
358 if (WARN_ON(!ops))
359 return;
360
361 if (list_is_singular(&ops->func_stack)) {
362 unsigned long ftrace_loc;
363
364 ftrace_loc = klp_get_ftrace_location(func->old_addr);
365 if (WARN_ON(!ftrace_loc))
366 return;
367
368 WARN_ON(unregister_ftrace_function(&ops->fops));
369 WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0));
370
371 list_del_rcu(&func->stack_node);
372 list_del(&ops->node);
373 kfree(ops);
374 } else {
375 list_del_rcu(&func->stack_node);
376 }
377
378 func->state = KLP_DISABLED;
379}
380
381static int klp_enable_func(struct klp_func *func)
382{
383 struct klp_ops *ops;
384 int ret;
385
386 if (WARN_ON(!func->old_addr))
387 return -EINVAL;
388
389 if (WARN_ON(func->state != KLP_DISABLED))
390 return -EINVAL;
391
392 ops = klp_find_ops(func->old_addr);
393 if (!ops) {
394 unsigned long ftrace_loc;
395
396 ftrace_loc = klp_get_ftrace_location(func->old_addr);
397 if (!ftrace_loc) {
398 pr_err("failed to find location for function '%s'\n",
399 func->old_name);
400 return -EINVAL;
401 }
402
403 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
404 if (!ops)
405 return -ENOMEM;
406
407 ops->fops.func = klp_ftrace_handler;
408 ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
409 FTRACE_OPS_FL_DYNAMIC |
410 FTRACE_OPS_FL_IPMODIFY;
411
412 list_add(&ops->node, &klp_ops);
413
414 INIT_LIST_HEAD(&ops->func_stack);
415 list_add_rcu(&func->stack_node, &ops->func_stack);
416
417 ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
418 if (ret) {
419 pr_err("failed to set ftrace filter for function '%s' (%d)\n",
420 func->old_name, ret);
421 goto err;
422 }
423
424 ret = register_ftrace_function(&ops->fops);
425 if (ret) {
426 pr_err("failed to register ftrace handler for function '%s' (%d)\n",
427 func->old_name, ret);
428 ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
429 goto err;
430 }
431
432
433 } else {
434 list_add_rcu(&func->stack_node, &ops->func_stack);
435 }
436
437 func->state = KLP_ENABLED;
438
439 return 0;
440
441err:
442 list_del_rcu(&func->stack_node);
443 list_del(&ops->node);
444 kfree(ops);
445 return ret;
446}
447
448static void klp_disable_object(struct klp_object *obj)
449{
450 struct klp_func *func;
451
452 klp_for_each_func(obj, func)
453 if (func->state == KLP_ENABLED)
454 klp_disable_func(func);
455
456 obj->state = KLP_DISABLED;
457}
458
459static int klp_enable_object(struct klp_object *obj)
460{
461 struct klp_func *func;
462 int ret;
463
464 if (WARN_ON(obj->state != KLP_DISABLED))
465 return -EINVAL;
466
467 if (WARN_ON(!klp_is_object_loaded(obj)))
468 return -EINVAL;
469
470 klp_for_each_func(obj, func) {
471 ret = klp_enable_func(func);
472 if (ret) {
473 klp_disable_object(obj);
474 return ret;
475 }
476 }
477 obj->state = KLP_ENABLED;
478
479 return 0;
480}
481
482static int __klp_disable_patch(struct klp_patch *patch) 286static int __klp_disable_patch(struct klp_patch *patch)
483{ 287{
484 struct klp_object *obj; 288 if (klp_transition_patch)
289 return -EBUSY;
485 290
486 /* enforce stacking: only the last enabled patch can be disabled */ 291 /* enforce stacking: only the last enabled patch can be disabled */
487 if (!list_is_last(&patch->list, &klp_patches) && 292 if (!list_is_last(&patch->list, &klp_patches) &&
488 list_next_entry(patch, list)->state == KLP_ENABLED) 293 list_next_entry(patch, list)->enabled)
489 return -EBUSY; 294 return -EBUSY;
490 295
491 pr_notice("disabling patch '%s'\n", patch->mod->name); 296 klp_init_transition(patch, KLP_UNPATCHED);
492 297
493 klp_for_each_object(patch, obj) { 298 /*
494 if (obj->state == KLP_ENABLED) 299 * Enforce the order of the func->transition writes in
495 klp_disable_object(obj); 300 * klp_init_transition() and the TIF_PATCH_PENDING writes in
496 } 301 * klp_start_transition(). In the rare case where klp_ftrace_handler()
302 * is called shortly after klp_update_patch_state() switches the task,
303 * this ensures the handler sees that func->transition is set.
304 */
305 smp_wmb();
497 306
498 patch->state = KLP_DISABLED; 307 klp_start_transition();
308 klp_try_complete_transition();
309 patch->enabled = false;
499 310
500 return 0; 311 return 0;
501} 312}
@@ -519,7 +330,7 @@ int klp_disable_patch(struct klp_patch *patch)
519 goto err; 330 goto err;
520 } 331 }
521 332
522 if (patch->state == KLP_DISABLED) { 333 if (!patch->enabled) {
523 ret = -EINVAL; 334 ret = -EINVAL;
524 goto err; 335 goto err;
525 } 336 }
@@ -537,32 +348,61 @@ static int __klp_enable_patch(struct klp_patch *patch)
537 struct klp_object *obj; 348 struct klp_object *obj;
538 int ret; 349 int ret;
539 350
540 if (WARN_ON(patch->state != KLP_DISABLED)) 351 if (klp_transition_patch)
352 return -EBUSY;
353
354 if (WARN_ON(patch->enabled))
541 return -EINVAL; 355 return -EINVAL;
542 356
543 /* enforce stacking: only the first disabled patch can be enabled */ 357 /* enforce stacking: only the first disabled patch can be enabled */
544 if (patch->list.prev != &klp_patches && 358 if (patch->list.prev != &klp_patches &&
545 list_prev_entry(patch, list)->state == KLP_DISABLED) 359 !list_prev_entry(patch, list)->enabled)
546 return -EBUSY; 360 return -EBUSY;
547 361
362 /*
363 * A reference is taken on the patch module to prevent it from being
364 * unloaded.
365 *
366 * Note: For immediate (no consistency model) patches we don't allow
367 * patch modules to unload since there is no safe/sane method to
368 * determine if a thread is still running in the patched code contained
369 * in the patch module once the ftrace registration is successful.
370 */
371 if (!try_module_get(patch->mod))
372 return -ENODEV;
373
548 pr_notice("enabling patch '%s'\n", patch->mod->name); 374 pr_notice("enabling patch '%s'\n", patch->mod->name);
549 375
376 klp_init_transition(patch, KLP_PATCHED);
377
378 /*
379 * Enforce the order of the func->transition writes in
380 * klp_init_transition() and the ops->func_stack writes in
381 * klp_patch_object(), so that klp_ftrace_handler() will see the
382 * func->transition updates before the handler is registered and the
383 * new funcs become visible to the handler.
384 */
385 smp_wmb();
386
550 klp_for_each_object(patch, obj) { 387 klp_for_each_object(patch, obj) {
551 if (!klp_is_object_loaded(obj)) 388 if (!klp_is_object_loaded(obj))
552 continue; 389 continue;
553 390
554 ret = klp_enable_object(obj); 391 ret = klp_patch_object(obj);
555 if (ret) 392 if (ret) {
556 goto unregister; 393 pr_warn("failed to enable patch '%s'\n",
394 patch->mod->name);
395
396 klp_cancel_transition();
397 return ret;
398 }
557 } 399 }
558 400
559 patch->state = KLP_ENABLED; 401 klp_start_transition();
402 klp_try_complete_transition();
403 patch->enabled = true;
560 404
561 return 0; 405 return 0;
562
563unregister:
564 WARN_ON(__klp_disable_patch(patch));
565 return ret;
566} 406}
567 407
568/** 408/**
@@ -599,6 +439,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
599 * /sys/kernel/livepatch 439 * /sys/kernel/livepatch
600 * /sys/kernel/livepatch/<patch> 440 * /sys/kernel/livepatch/<patch>
601 * /sys/kernel/livepatch/<patch>/enabled 441 * /sys/kernel/livepatch/<patch>/enabled
442 * /sys/kernel/livepatch/<patch>/transition
602 * /sys/kernel/livepatch/<patch>/<object> 443 * /sys/kernel/livepatch/<patch>/<object>
603 * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> 444 * /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
604 */ 445 */
@@ -608,26 +449,34 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
608{ 449{
609 struct klp_patch *patch; 450 struct klp_patch *patch;
610 int ret; 451 int ret;
611 unsigned long val; 452 bool enabled;
612 453
613 ret = kstrtoul(buf, 10, &val); 454 ret = kstrtobool(buf, &enabled);
614 if (ret) 455 if (ret)
615 return -EINVAL; 456 return ret;
616
617 if (val != KLP_DISABLED && val != KLP_ENABLED)
618 return -EINVAL;
619 457
620 patch = container_of(kobj, struct klp_patch, kobj); 458 patch = container_of(kobj, struct klp_patch, kobj);
621 459
622 mutex_lock(&klp_mutex); 460 mutex_lock(&klp_mutex);
623 461
624 if (val == patch->state) { 462 if (!klp_is_patch_registered(patch)) {
463 /*
464 * Module with the patch could either disappear meanwhile or is
465 * not properly initialized yet.
466 */
467 ret = -EINVAL;
468 goto err;
469 }
470
471 if (patch->enabled == enabled) {
625 /* already in requested state */ 472 /* already in requested state */
626 ret = -EINVAL; 473 ret = -EINVAL;
627 goto err; 474 goto err;
628 } 475 }
629 476
630 if (val == KLP_ENABLED) { 477 if (patch == klp_transition_patch) {
478 klp_reverse_transition();
479 } else if (enabled) {
631 ret = __klp_enable_patch(patch); 480 ret = __klp_enable_patch(patch);
632 if (ret) 481 if (ret)
633 goto err; 482 goto err;
@@ -652,21 +501,33 @@ static ssize_t enabled_show(struct kobject *kobj,
652 struct klp_patch *patch; 501 struct klp_patch *patch;
653 502
654 patch = container_of(kobj, struct klp_patch, kobj); 503 patch = container_of(kobj, struct klp_patch, kobj);
655 return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); 504 return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled);
505}
506
507static ssize_t transition_show(struct kobject *kobj,
508 struct kobj_attribute *attr, char *buf)
509{
510 struct klp_patch *patch;
511
512 patch = container_of(kobj, struct klp_patch, kobj);
513 return snprintf(buf, PAGE_SIZE-1, "%d\n",
514 patch == klp_transition_patch);
656} 515}
657 516
658static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); 517static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
518static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
659static struct attribute *klp_patch_attrs[] = { 519static struct attribute *klp_patch_attrs[] = {
660 &enabled_kobj_attr.attr, 520 &enabled_kobj_attr.attr,
521 &transition_kobj_attr.attr,
661 NULL 522 NULL
662}; 523};
663 524
664static void klp_kobj_release_patch(struct kobject *kobj) 525static void klp_kobj_release_patch(struct kobject *kobj)
665{ 526{
666 /* 527 struct klp_patch *patch;
667 * Once we have a consistency model we'll need to module_put() the 528
668 * patch module here. See klp_register_patch() for more details. 529 patch = container_of(kobj, struct klp_patch, kobj);
669 */ 530 complete(&patch->finish);
670} 531}
671 532
672static struct kobj_type klp_ktype_patch = { 533static struct kobj_type klp_ktype_patch = {
@@ -737,7 +598,6 @@ static void klp_free_patch(struct klp_patch *patch)
737 klp_free_objects_limited(patch, NULL); 598 klp_free_objects_limited(patch, NULL);
738 if (!list_empty(&patch->list)) 599 if (!list_empty(&patch->list))
739 list_del(&patch->list); 600 list_del(&patch->list);
740 kobject_put(&patch->kobj);
741} 601}
742 602
743static int klp_init_func(struct klp_object *obj, struct klp_func *func) 603static int klp_init_func(struct klp_object *obj, struct klp_func *func)
@@ -746,7 +606,8 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
746 return -EINVAL; 606 return -EINVAL;
747 607
748 INIT_LIST_HEAD(&func->stack_node); 608 INIT_LIST_HEAD(&func->stack_node);
749 func->state = KLP_DISABLED; 609 func->patched = false;
610 func->transition = false;
750 611
751 /* The format for the sysfs directory is <function,sympos> where sympos 612 /* The format for the sysfs directory is <function,sympos> where sympos
752 * is the nth occurrence of this symbol in kallsyms for the patched 613 * is the nth occurrence of this symbol in kallsyms for the patched
@@ -787,6 +648,22 @@ static int klp_init_object_loaded(struct klp_patch *patch,
787 &func->old_addr); 648 &func->old_addr);
788 if (ret) 649 if (ret)
789 return ret; 650 return ret;
651
652 ret = kallsyms_lookup_size_offset(func->old_addr,
653 &func->old_size, NULL);
654 if (!ret) {
655 pr_err("kallsyms size lookup failed for '%s'\n",
656 func->old_name);
657 return -ENOENT;
658 }
659
660 ret = kallsyms_lookup_size_offset((unsigned long)func->new_func,
661 &func->new_size, NULL);
662 if (!ret) {
663 pr_err("kallsyms size lookup failed for '%s' replacement\n",
664 func->old_name);
665 return -ENOENT;
666 }
790 } 667 }
791 668
792 return 0; 669 return 0;
@@ -801,7 +678,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
801 if (!obj->funcs) 678 if (!obj->funcs)
802 return -EINVAL; 679 return -EINVAL;
803 680
804 obj->state = KLP_DISABLED; 681 obj->patched = false;
805 obj->mod = NULL; 682 obj->mod = NULL;
806 683
807 klp_find_object_module(obj); 684 klp_find_object_module(obj);
@@ -842,12 +719,15 @@ static int klp_init_patch(struct klp_patch *patch)
842 719
843 mutex_lock(&klp_mutex); 720 mutex_lock(&klp_mutex);
844 721
845 patch->state = KLP_DISABLED; 722 patch->enabled = false;
723 init_completion(&patch->finish);
846 724
847 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, 725 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
848 klp_root_kobj, "%s", patch->mod->name); 726 klp_root_kobj, "%s", patch->mod->name);
849 if (ret) 727 if (ret) {
850 goto unlock; 728 mutex_unlock(&klp_mutex);
729 return ret;
730 }
851 731
852 klp_for_each_object(patch, obj) { 732 klp_for_each_object(patch, obj) {
853 ret = klp_init_object(patch, obj); 733 ret = klp_init_object(patch, obj);
@@ -863,9 +743,12 @@ static int klp_init_patch(struct klp_patch *patch)
863 743
864free: 744free:
865 klp_free_objects_limited(patch, obj); 745 klp_free_objects_limited(patch, obj);
866 kobject_put(&patch->kobj); 746
867unlock:
868 mutex_unlock(&klp_mutex); 747 mutex_unlock(&klp_mutex);
748
749 kobject_put(&patch->kobj);
750 wait_for_completion(&patch->finish);
751
869 return ret; 752 return ret;
870} 753}
871 754
@@ -879,23 +762,29 @@ unlock:
879 */ 762 */
880int klp_unregister_patch(struct klp_patch *patch) 763int klp_unregister_patch(struct klp_patch *patch)
881{ 764{
882 int ret = 0; 765 int ret;
883 766
884 mutex_lock(&klp_mutex); 767 mutex_lock(&klp_mutex);
885 768
886 if (!klp_is_patch_registered(patch)) { 769 if (!klp_is_patch_registered(patch)) {
887 ret = -EINVAL; 770 ret = -EINVAL;
888 goto out; 771 goto err;
889 } 772 }
890 773
891 if (patch->state == KLP_ENABLED) { 774 if (patch->enabled) {
892 ret = -EBUSY; 775 ret = -EBUSY;
893 goto out; 776 goto err;
894 } 777 }
895 778
896 klp_free_patch(patch); 779 klp_free_patch(patch);
897 780
898out: 781 mutex_unlock(&klp_mutex);
782
783 kobject_put(&patch->kobj);
784 wait_for_completion(&patch->finish);
785
786 return 0;
787err:
899 mutex_unlock(&klp_mutex); 788 mutex_unlock(&klp_mutex);
900 return ret; 789 return ret;
901} 790}
@@ -908,17 +797,18 @@ EXPORT_SYMBOL_GPL(klp_unregister_patch);
908 * Initializes the data structure associated with the patch and 797 * Initializes the data structure associated with the patch and
909 * creates the sysfs interface. 798 * creates the sysfs interface.
910 * 799 *
800 * There is no need to take the reference on the patch module here. It is done
801 * later when the patch is enabled.
802 *
911 * Return: 0 on success, otherwise error 803 * Return: 0 on success, otherwise error
912 */ 804 */
913int klp_register_patch(struct klp_patch *patch) 805int klp_register_patch(struct klp_patch *patch)
914{ 806{
915 int ret;
916
917 if (!patch || !patch->mod) 807 if (!patch || !patch->mod)
918 return -EINVAL; 808 return -EINVAL;
919 809
920 if (!is_livepatch_module(patch->mod)) { 810 if (!is_livepatch_module(patch->mod)) {
921 pr_err("module %s is not marked as a livepatch module", 811 pr_err("module %s is not marked as a livepatch module\n",
922 patch->mod->name); 812 patch->mod->name);
923 return -EINVAL; 813 return -EINVAL;
924 } 814 }
@@ -927,20 +817,16 @@ int klp_register_patch(struct klp_patch *patch)
927 return -ENODEV; 817 return -ENODEV;
928 818
929 /* 819 /*
930 * A reference is taken on the patch module to prevent it from being 820 * Architectures without reliable stack traces have to set
931 * unloaded. Right now, we don't allow patch modules to unload since 821 * patch->immediate because there's currently no way to patch kthreads
932 * there is currently no method to determine if a thread is still 822 * with the consistency model.
933 * running in the patched code contained in the patch module once
934 * the ftrace registration is successful.
935 */ 823 */
936 if (!try_module_get(patch->mod)) 824 if (!klp_have_reliable_stack() && !patch->immediate) {
937 return -ENODEV; 825 pr_err("This architecture doesn't have support for the livepatch consistency model.\n");
938 826 return -ENOSYS;
939 ret = klp_init_patch(patch); 827 }
940 if (ret)
941 module_put(patch->mod);
942 828
943 return ret; 829 return klp_init_patch(patch);
944} 830}
945EXPORT_SYMBOL_GPL(klp_register_patch); 831EXPORT_SYMBOL_GPL(klp_register_patch);
946 832
@@ -975,13 +861,17 @@ int klp_module_coming(struct module *mod)
975 goto err; 861 goto err;
976 } 862 }
977 863
978 if (patch->state == KLP_DISABLED) 864 /*
865 * Only patch the module if the patch is enabled or is
866 * in transition.
867 */
868 if (!patch->enabled && patch != klp_transition_patch)
979 break; 869 break;
980 870
981 pr_notice("applying patch '%s' to loading module '%s'\n", 871 pr_notice("applying patch '%s' to loading module '%s'\n",
982 patch->mod->name, obj->mod->name); 872 patch->mod->name, obj->mod->name);
983 873
984 ret = klp_enable_object(obj); 874 ret = klp_patch_object(obj);
985 if (ret) { 875 if (ret) {
986 pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", 876 pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
987 patch->mod->name, obj->mod->name, ret); 877 patch->mod->name, obj->mod->name, ret);
@@ -1032,10 +922,14 @@ void klp_module_going(struct module *mod)
1032 if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) 922 if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
1033 continue; 923 continue;
1034 924
1035 if (patch->state != KLP_DISABLED) { 925 /*
926 * Only unpatch the module if the patch is enabled or
927 * is in transition.
928 */
929 if (patch->enabled || patch == klp_transition_patch) {
1036 pr_notice("reverting patch '%s' on unloading module '%s'\n", 930 pr_notice("reverting patch '%s' on unloading module '%s'\n",
1037 patch->mod->name, obj->mod->name); 931 patch->mod->name, obj->mod->name);
1038 klp_disable_object(obj); 932 klp_unpatch_object(obj);
1039 } 933 }
1040 934
1041 klp_free_object_loaded(obj); 935 klp_free_object_loaded(obj);
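
Taken together, the core.c rework replaces the old KLP_ENABLED/KLP_DISABLED state fields with patched/transition flags, moves the ftrace plumbing into patch.c, takes the module reference at enable time rather than at registration, ties the patch lifetime to its kobject via a completion, and adds a read-only transition attribute next to enabled in sysfs. From a patch author's point of view the API is unchanged; a minimal patch module, condensed from the in-tree livepatch sample, still looks like this:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/livepatch.h>
#include <linux/seq_file.h>

/* Replacement function; the patched symbol cmdline_proc_show is the one
 * used by the upstream sample and is only an example target here. */
static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%s\n", "this has been live patched");
        return 0;
}

static struct klp_func funcs[] = {
        {
                .old_name = "cmdline_proc_show",
                .new_func = livepatch_cmdline_proc_show,
        }, { }
};

static struct klp_object objs[] = {
        {
                /* NULL name means the object is vmlinux itself */
                .funcs = funcs,
        }, { }
};

static struct klp_patch patch = {
        .mod  = THIS_MODULE,
        .objs = objs,
        /* on architectures without reliable stack traces, .immediate = true
         * is required by klp_register_patch() above */
};

static int livepatch_init(void)
{
        int ret;

        ret = klp_register_patch(&patch);
        if (ret)
                return ret;
        ret = klp_enable_patch(&patch);
        if (ret) {
                WARN_ON(klp_unregister_patch(&patch));
                return ret;
        }
        return 0;
}

static void livepatch_exit(void)
{
        WARN_ON(klp_unregister_patch(&patch));
}

module_init(livepatch_init);
module_exit(livepatch_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");
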
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
new file mode 100644
index 000000000000..c74f24c47837
--- /dev/null
+++ b/kernel/livepatch/core.h
@@ -0,0 +1,6 @@
1#ifndef _LIVEPATCH_CORE_H
2#define _LIVEPATCH_CORE_H
3
4extern struct mutex klp_mutex;
5
6#endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
new file mode 100644
index 000000000000..f8269036bf0b
--- /dev/null
+++ b/kernel/livepatch/patch.c
@@ -0,0 +1,272 @@
1/*
2 * patch.c - livepatch patching functions
3 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE
6 * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 2
11 * of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
23
24#include <linux/livepatch.h>
25#include <linux/list.h>
26#include <linux/ftrace.h>
27#include <linux/rculist.h>
28#include <linux/slab.h>
29#include <linux/bug.h>
30#include <linux/printk.h>
31#include "patch.h"
32#include "transition.h"
33
34static LIST_HEAD(klp_ops);
35
36struct klp_ops *klp_find_ops(unsigned long old_addr)
37{
38 struct klp_ops *ops;
39 struct klp_func *func;
40
41 list_for_each_entry(ops, &klp_ops, node) {
42 func = list_first_entry(&ops->func_stack, struct klp_func,
43 stack_node);
44 if (func->old_addr == old_addr)
45 return ops;
46 }
47
48 return NULL;
49}
50
51static void notrace klp_ftrace_handler(unsigned long ip,
52 unsigned long parent_ip,
53 struct ftrace_ops *fops,
54 struct pt_regs *regs)
55{
56 struct klp_ops *ops;
57 struct klp_func *func;
58 int patch_state;
59
60 ops = container_of(fops, struct klp_ops, fops);
61
62 rcu_read_lock();
63
64 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
65 stack_node);
66
67 /*
68 * func should never be NULL because preemption should be disabled here
69 * and unregister_ftrace_function() does the equivalent of a
70 * synchronize_sched() before the func_stack removal.
71 */
72 if (WARN_ON_ONCE(!func))
73 goto unlock;
74
75 /*
76 * In the enable path, enforce the order of the ops->func_stack and
77 * func->transition reads. The corresponding write barrier is in
78 * __klp_enable_patch().
79 *
80 * (Note that this barrier technically isn't needed in the disable
81 * path. In the rare case where klp_update_patch_state() runs before
82 * this handler, its TIF_PATCH_PENDING read and this func->transition
83 * read need to be ordered. But klp_update_patch_state() already
84 * enforces that.)
85 */
86 smp_rmb();
87
88 if (unlikely(func->transition)) {
89
90 /*
91 * Enforce the order of the func->transition and
92 * current->patch_state reads. Otherwise we could read an
93 * out-of-date task state and pick the wrong function. The
94 * corresponding write barrier is in klp_init_transition().
95 */
96 smp_rmb();
97
98 patch_state = current->patch_state;
99
100 WARN_ON_ONCE(patch_state == KLP_UNDEFINED);
101
102 if (patch_state == KLP_UNPATCHED) {
103 /*
104 * Use the previously patched version of the function.
105 * If no previous patches exist, continue with the
106 * original function.
107 */
108 func = list_entry_rcu(func->stack_node.next,
109 struct klp_func, stack_node);
110
111 if (&func->stack_node == &ops->func_stack)
112 goto unlock;
113 }
114 }
115
116 klp_arch_set_pc(regs, (unsigned long)func->new_func);
117unlock:
118 rcu_read_unlock();
119}
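
The handler above always runs the klp_func at the head of ops->func_stack, unless that func is still in transition and the task is (or should be) unpatched, in which case it falls back one entry, or to the original code. A minimal user-space sketch of that selection, with hypothetical names and the RCU list, TIF flag and memory barriers left out:

    #include <stdio.h>

    enum { KLP_UNPATCHED = 0, KLP_PATCHED = 1 };

    struct func {
            const char *name;       /* replacement implementation */
            int transition;         /* still transitioning? */
            struct func *next;      /* older patch, or NULL for "run the original" */
    };

    /* Model of the handler: the newest func wins unless the task is still
     * KLP_UNPATCHED during a transition, in which case fall back one level. */
    static const char *select_func(struct func *stack_top, int task_patch_state)
    {
            struct func *f = stack_top;

            if (!f)
                    return "original";

            if (f->transition && task_patch_state == KLP_UNPATCHED) {
                    f = f->next;
                    if (!f)
                            return "original";
            }
            return f->name;
    }

    int main(void)
    {
            struct func v1 = { "v1", 0, NULL };
            struct func v2 = { "v2", 1, &v1 };      /* v2 is mid-transition */

            printf("%s\n", select_func(&v2, KLP_PATCHED));   /* v2 */
            printf("%s\n", select_func(&v2, KLP_UNPATCHED)); /* v1 */
            return 0;
    }
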
120
121/*
122 * Convert a function address into the appropriate ftrace location.
123 *
124 * Usually this is just the address of the function, but on some architectures
125 * it's more complicated so allow them to provide a custom behaviour.
126 */
127#ifndef klp_get_ftrace_location
128static unsigned long klp_get_ftrace_location(unsigned long faddr)
129{
130 return faddr;
131}
132#endif
133
134static void klp_unpatch_func(struct klp_func *func)
135{
136 struct klp_ops *ops;
137
138 if (WARN_ON(!func->patched))
139 return;
140 if (WARN_ON(!func->old_addr))
141 return;
142
143 ops = klp_find_ops(func->old_addr);
144 if (WARN_ON(!ops))
145 return;
146
147 if (list_is_singular(&ops->func_stack)) {
148 unsigned long ftrace_loc;
149
150 ftrace_loc = klp_get_ftrace_location(func->old_addr);
151 if (WARN_ON(!ftrace_loc))
152 return;
153
154 WARN_ON(unregister_ftrace_function(&ops->fops));
155 WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0));
156
157 list_del_rcu(&func->stack_node);
158 list_del(&ops->node);
159 kfree(ops);
160 } else {
161 list_del_rcu(&func->stack_node);
162 }
163
164 func->patched = false;
165}
166
167static int klp_patch_func(struct klp_func *func)
168{
169 struct klp_ops *ops;
170 int ret;
171
172 if (WARN_ON(!func->old_addr))
173 return -EINVAL;
174
175 if (WARN_ON(func->patched))
176 return -EINVAL;
177
178 ops = klp_find_ops(func->old_addr);
179 if (!ops) {
180 unsigned long ftrace_loc;
181
182 ftrace_loc = klp_get_ftrace_location(func->old_addr);
183 if (!ftrace_loc) {
184 pr_err("failed to find location for function '%s'\n",
185 func->old_name);
186 return -EINVAL;
187 }
188
189 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
190 if (!ops)
191 return -ENOMEM;
192
193 ops->fops.func = klp_ftrace_handler;
194 ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
195 FTRACE_OPS_FL_DYNAMIC |
196 FTRACE_OPS_FL_IPMODIFY;
197
198 list_add(&ops->node, &klp_ops);
199
200 INIT_LIST_HEAD(&ops->func_stack);
201 list_add_rcu(&func->stack_node, &ops->func_stack);
202
203 ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
204 if (ret) {
205 pr_err("failed to set ftrace filter for function '%s' (%d)\n",
206 func->old_name, ret);
207 goto err;
208 }
209
210 ret = register_ftrace_function(&ops->fops);
211 if (ret) {
212 pr_err("failed to register ftrace handler for function '%s' (%d)\n",
213 func->old_name, ret);
214 ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
215 goto err;
216 }
217
218
219 } else {
220 list_add_rcu(&func->stack_node, &ops->func_stack);
221 }
222
223 func->patched = true;
224
225 return 0;
226
227err:
228 list_del_rcu(&func->stack_node);
229 list_del(&ops->node);
230 kfree(ops);
231 return ret;
232}
233
234void klp_unpatch_object(struct klp_object *obj)
235{
236 struct klp_func *func;
237
238 klp_for_each_func(obj, func)
239 if (func->patched)
240 klp_unpatch_func(func);
241
242 obj->patched = false;
243}
244
245int klp_patch_object(struct klp_object *obj)
246{
247 struct klp_func *func;
248 int ret;
249
250 if (WARN_ON(obj->patched))
251 return -EINVAL;
252
253 klp_for_each_func(obj, func) {
254 ret = klp_patch_func(func);
255 if (ret) {
256 klp_unpatch_object(obj);
257 return ret;
258 }
259 }
260 obj->patched = true;
261
262 return 0;
263}
264
265void klp_unpatch_objects(struct klp_patch *patch)
266{
267 struct klp_object *obj;
268
269 klp_for_each_object(patch, obj)
270 if (obj->patched)
271 klp_unpatch_object(obj);
272}
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
new file mode 100644
index 000000000000..0db227170c36
--- /dev/null
+++ b/kernel/livepatch/patch.h
@@ -0,0 +1,33 @@
1#ifndef _LIVEPATCH_PATCH_H
2#define _LIVEPATCH_PATCH_H
3
4#include <linux/livepatch.h>
5#include <linux/list.h>
6#include <linux/ftrace.h>
7
8/**
9 * struct klp_ops - structure for tracking registered ftrace ops structs
10 *
11 * A single ftrace_ops is shared between all enabled replacement functions
12 * (klp_func structs) which have the same old_addr. This allows the switch
13 * between function versions to happen instantaneously by updating the klp_ops
14 * struct's func_stack list. The winner is the klp_func at the top of the
15 * func_stack (front of the list).
16 *
17 * @node: node for the global klp_ops list
18 * @func_stack: list head for the stack of klp_func's (active func is on top)
19 * @fops: registered ftrace ops struct
20 */
21struct klp_ops {
22 struct list_head node;
23 struct list_head func_stack;
24 struct ftrace_ops fops;
25};
26
27struct klp_ops *klp_find_ops(unsigned long old_addr);
28
29int klp_patch_object(struct klp_object *obj);
30void klp_unpatch_object(struct klp_object *obj);
31void klp_unpatch_objects(struct klp_patch *patch);
32
33#endif /* _LIVEPATCH_PATCH_H */
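
As the klp_ops comment above describes, all replacement functions for the same old_addr share a single klp_ops, and only the newest entry on its func_stack is live. A toy model of that lifecycle, with made-up names, a plain list in place of the RCU-protected ones, and printf standing in for the ftrace (un)registration calls:

    #include <stdio.h>
    #include <stdlib.h>

    struct ops {
            unsigned long old_addr;
            int nr_funcs;           /* depth of the func_stack */
            struct ops *next;
    };

    static struct ops *klp_ops;

    static struct ops *find_ops(unsigned long old_addr)
    {
            struct ops *o;

            for (o = klp_ops; o; o = o->next)
                    if (o->old_addr == old_addr)
                            return o;
            return NULL;
    }

    static void patch_func(unsigned long old_addr)
    {
            struct ops *o = find_ops(old_addr);

            if (!o) {                       /* first patch of this address */
                    o = calloc(1, sizeof(*o));
                    o->old_addr = old_addr;
                    o->next = klp_ops;
                    klp_ops = o;
                    printf("register handler for %#lx\n", old_addr);
            }
            o->nr_funcs++;                  /* another func stacked on this ops */
    }

    static void unpatch_func(unsigned long old_addr)
    {
            struct ops **p = &klp_ops, *o = find_ops(old_addr);

            if (!o || !o->nr_funcs)
                    return;
            if (--o->nr_funcs == 0) {       /* last func: tear the ops down */
                    printf("unregister handler for %#lx\n", old_addr);
                    while (*p != o)
                            p = &(*p)->next;
                    *p = o->next;
                    free(o);
            }
    }

    int main(void)
    {
            patch_func(0x1000);     /* registers */
            patch_func(0x1000);     /* just stacks */
            unpatch_func(0x1000);   /* one func still live */
            unpatch_func(0x1000);   /* unregisters */
            return 0;
    }
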
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
new file mode 100644
index 000000000000..adc0cc64aa4b
--- /dev/null
+++ b/kernel/livepatch/transition.c
@@ -0,0 +1,553 @@
1/*
2 * transition.c - Kernel Live Patching transition functions
3 *
4 * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
21
22#include <linux/cpu.h>
23#include <linux/stacktrace.h>
24#include "core.h"
25#include "patch.h"
26#include "transition.h"
27#include "../sched/sched.h"
28
29#define MAX_STACK_ENTRIES 100
30#define STACK_ERR_BUF_SIZE 128
31
32struct klp_patch *klp_transition_patch;
33
34static int klp_target_state = KLP_UNDEFINED;
35
36/*
37 * This work can be performed periodically to finish patching or unpatching any
38 * "straggler" tasks which failed to transition in the first attempt.
39 */
40static void klp_transition_work_fn(struct work_struct *work)
41{
42 mutex_lock(&klp_mutex);
43
44 if (klp_transition_patch)
45 klp_try_complete_transition();
46
47 mutex_unlock(&klp_mutex);
48}
49static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);
50
51/*
52 * The transition to the target patch state is complete. Clean up the data
53 * structures.
54 */
55static void klp_complete_transition(void)
56{
57 struct klp_object *obj;
58 struct klp_func *func;
59 struct task_struct *g, *task;
60 unsigned int cpu;
61 bool immediate_func = false;
62
63 if (klp_target_state == KLP_UNPATCHED) {
64 /*
65 * All tasks have transitioned to KLP_UNPATCHED so we can now
66 * remove the new functions from the func_stack.
67 */
68 klp_unpatch_objects(klp_transition_patch);
69
70 /*
71 * Make sure klp_ftrace_handler() can no longer see functions
72 * from this patch on the ops->func_stack. Otherwise, after
73 * func->transition gets cleared, the handler may choose a
74 * removed function.
75 */
76 synchronize_rcu();
77 }
78
79 if (klp_transition_patch->immediate)
80 goto done;
81
82 klp_for_each_object(klp_transition_patch, obj) {
83 klp_for_each_func(obj, func) {
84 func->transition = false;
85 if (func->immediate)
86 immediate_func = true;
87 }
88 }
89
90 if (klp_target_state == KLP_UNPATCHED && !immediate_func)
91 module_put(klp_transition_patch->mod);
92
93 /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
94 if (klp_target_state == KLP_PATCHED)
95 synchronize_rcu();
96
97 read_lock(&tasklist_lock);
98 for_each_process_thread(g, task) {
99 WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
100 task->patch_state = KLP_UNDEFINED;
101 }
102 read_unlock(&tasklist_lock);
103
104 for_each_possible_cpu(cpu) {
105 task = idle_task(cpu);
106 WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
107 task->patch_state = KLP_UNDEFINED;
108 }
109
110done:
111 klp_target_state = KLP_UNDEFINED;
112 klp_transition_patch = NULL;
113}
114
115/*
116 * This is called in the error path, to cancel a transition before it has
117 * started, i.e. klp_init_transition() has been called but
118 * klp_start_transition() hasn't. If the transition *has* been started,
119 * klp_reverse_transition() should be used instead.
120 */
121void klp_cancel_transition(void)
122{
123 if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
124 return;
125
126 klp_target_state = KLP_UNPATCHED;
127 klp_complete_transition();
128}
129
130/*
131 * Switch the patched state of the task to the set of functions in the target
132 * patch state.
133 *
134 * NOTE: If task is not 'current', the caller must ensure the task is inactive.
135 * Otherwise klp_ftrace_handler() might read the wrong 'patch_state' value.
136 */
137void klp_update_patch_state(struct task_struct *task)
138{
139 rcu_read_lock();
140
141 /*
142 * This test_and_clear_tsk_thread_flag() call also serves as a read
143 * barrier (smp_rmb) for two cases:
144 *
145 * 1) Enforce the order of the TIF_PATCH_PENDING read and the
146 * klp_target_state read. The corresponding write barrier is in
147 * klp_init_transition().
148 *
149 * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
150 * of func->transition, if klp_ftrace_handler() is called later on
151 * the same CPU. See __klp_disable_patch().
152 */
153 if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING))
154 task->patch_state = READ_ONCE(klp_target_state);
155
156 rcu_read_unlock();
157}
158
159/*
160 * Determine whether the given stack trace includes any references to a
161 * to-be-patched or to-be-unpatched function.
162 */
163static int klp_check_stack_func(struct klp_func *func,
164 struct stack_trace *trace)
165{
166 unsigned long func_addr, func_size, address;
167 struct klp_ops *ops;
168 int i;
169
170 if (func->immediate)
171 return 0;
172
173 for (i = 0; i < trace->nr_entries; i++) {
174 address = trace->entries[i];
175
176 if (klp_target_state == KLP_UNPATCHED) {
177 /*
178 * Check for the to-be-unpatched function
179 * (the func itself).
180 */
181 func_addr = (unsigned long)func->new_func;
182 func_size = func->new_size;
183 } else {
184 /*
185 * Check for the to-be-patched function
186 * (the previous func).
187 */
188 ops = klp_find_ops(func->old_addr);
189
190 if (list_is_singular(&ops->func_stack)) {
191 /* original function */
192 func_addr = func->old_addr;
193 func_size = func->old_size;
194 } else {
195 /* previously patched function */
196 struct klp_func *prev;
197
198 prev = list_next_entry(func, stack_node);
199 func_addr = (unsigned long)prev->new_func;
200 func_size = prev->new_size;
201 }
202 }
203
204 if (address >= func_addr && address < func_addr + func_size)
205 return -EAGAIN;
206 }
207
208 return 0;
209}
210
211/*
212 * Determine whether it's safe to transition the task to the target patch state
213 * by looking for any to-be-patched or to-be-unpatched functions on its stack.
214 */
215static int klp_check_stack(struct task_struct *task, char *err_buf)
216{
217 static unsigned long entries[MAX_STACK_ENTRIES];
218 struct stack_trace trace;
219 struct klp_object *obj;
220 struct klp_func *func;
221 int ret;
222
223 trace.skip = 0;
224 trace.nr_entries = 0;
225 trace.max_entries = MAX_STACK_ENTRIES;
226 trace.entries = entries;
227 ret = save_stack_trace_tsk_reliable(task, &trace);
228 WARN_ON_ONCE(ret == -ENOSYS);
229 if (ret) {
230 snprintf(err_buf, STACK_ERR_BUF_SIZE,
231 "%s: %s:%d has an unreliable stack\n",
232 __func__, task->comm, task->pid);
233 return ret;
234 }
235
236 klp_for_each_object(klp_transition_patch, obj) {
237 if (!obj->patched)
238 continue;
239 klp_for_each_func(obj, func) {
240 ret = klp_check_stack_func(func, &trace);
241 if (ret) {
242 snprintf(err_buf, STACK_ERR_BUF_SIZE,
243 "%s: %s:%d is sleeping on function %s\n",
244 __func__, task->comm, task->pid,
245 func->old_name);
246 return ret;
247 }
248 }
249 }
250
251 return 0;
252}
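
In essence, klp_check_stack() rejects a task whenever any saved return address lands inside a function that is about to be patched or unpatched. A small self-contained sketch of that range check (the reliable stack saving and the per-object/per-func iteration are omitted, and the names are illustrative):

    #include <stdio.h>

    struct range { unsigned long addr, size; };

    /* Return -1 (think -EAGAIN) if any stack entry falls inside one of the
     * ranges being patched or unpatched; 0 if the task is safe to switch. */
    static int check_stack(const unsigned long *entries, int nr,
                           const struct range *funcs, int nr_funcs)
    {
            int i, j;

            for (i = 0; i < nr; i++)
                    for (j = 0; j < nr_funcs; j++)
                            if (entries[i] >= funcs[j].addr &&
                                entries[i] <  funcs[j].addr + funcs[j].size)
                                    return -1;
            return 0;
    }

    int main(void)
    {
            unsigned long stack[] = { 0x400100, 0x400850, 0x401000 };
            struct range patched[] = { { 0x400800, 0x100 } };

            printf("%d\n", check_stack(stack, 3, patched, 1)); /* -1: 0x400850 hits */
            return 0;
    }
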
253
254/*
255 * Try to safely switch a task to the target patch state. If it's currently
256 * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
257 * if the stack is unreliable, return false.
258 */
259static bool klp_try_switch_task(struct task_struct *task)
260{
261 struct rq *rq;
262 struct rq_flags flags;
263 int ret;
264 bool success = false;
265 char err_buf[STACK_ERR_BUF_SIZE];
266
267 err_buf[0] = '\0';
268
269 /* check if this task has already switched over */
270 if (task->patch_state == klp_target_state)
271 return true;
272
273 /*
274 * For arches which don't have reliable stack traces, we have to rely
275 * on other methods (e.g., switching tasks at kernel exit).
276 */
277 if (!klp_have_reliable_stack())
278 return false;
279
280 /*
281 * Now try to check the stack for any to-be-patched or to-be-unpatched
282 * functions. If all goes well, switch the task to the target patch
283 * state.
284 */
285 rq = task_rq_lock(task, &flags);
286
287 if (task_running(rq, task) && task != current) {
288 snprintf(err_buf, STACK_ERR_BUF_SIZE,
289 "%s: %s:%d is running\n", __func__, task->comm,
290 task->pid);
291 goto done;
292 }
293
294 ret = klp_check_stack(task, err_buf);
295 if (ret)
296 goto done;
297
298 success = true;
299
300 clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
301 task->patch_state = klp_target_state;
302
303done:
304 task_rq_unlock(rq, task, &flags);
305
306 /*
307 * Due to console deadlock issues, pr_debug() can't be used while
308 * holding the task rq lock. Instead we have to use a temporary buffer
309 * and print the debug message after releasing the lock.
310 */
311 if (err_buf[0] != '\0')
312 pr_debug("%s", err_buf);
313
314 return success;
315
316}
317
318/*
319 * Try to switch all remaining tasks to the target patch state by walking the
320 * stacks of sleeping tasks and looking for any to-be-patched or
321 * to-be-unpatched functions. If such functions are found, the task can't be
322 * switched yet.
323 *
324 * If any tasks are still stuck in the initial patch state, schedule a retry.
325 */
326void klp_try_complete_transition(void)
327{
328 unsigned int cpu;
329 struct task_struct *g, *task;
330 bool complete = true;
331
332 WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
333
334 /*
335 * If the patch can be applied or reverted immediately, skip the
336 * per-task transitions.
337 */
338 if (klp_transition_patch->immediate)
339 goto success;
340
341 /*
342 * Try to switch the tasks to the target patch state by walking their
343 * stacks and looking for any to-be-patched or to-be-unpatched
344 * functions. If such functions are found on a stack, or if the stack
345 * is deemed unreliable, the task can't be switched yet.
346 *
347 * Usually this will transition most (or all) of the tasks on a system
348 * unless the patch includes changes to a very common function.
349 */
350 read_lock(&tasklist_lock);
351 for_each_process_thread(g, task)
352 if (!klp_try_switch_task(task))
353 complete = false;
354 read_unlock(&tasklist_lock);
355
356 /*
357 * Ditto for the idle "swapper" tasks.
358 */
359 get_online_cpus();
360 for_each_possible_cpu(cpu) {
361 task = idle_task(cpu);
362 if (cpu_online(cpu)) {
363 if (!klp_try_switch_task(task))
364 complete = false;
365 } else if (task->patch_state != klp_target_state) {
366 /* offline idle tasks can be switched immediately */
367 clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
368 task->patch_state = klp_target_state;
369 }
370 }
371 put_online_cpus();
372
373 if (!complete) {
374 /*
375 * Some tasks weren't able to be switched over. Try again
376 * later and/or wait for other methods like kernel exit
377 * switching.
378 */
379 schedule_delayed_work(&klp_transition_work,
380 round_jiffies_relative(HZ));
381 return;
382 }
383
384success:
385 pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
386 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
387
388 /* we're done, now cleanup the data structures */
389 klp_complete_transition();
390}
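
The overall flow is: switch every task whose stack is clean, and if anything is left over, come back later via the delayed work. A compilable toy version of that retry loop, with invented names, no rq locks or idle tasks, and the stack check reduced to a single flag:

    #include <stdio.h>
    #include <stdbool.h>

    enum { KLP_UNPATCHED, KLP_PATCHED };

    struct task { bool on_patched_func; int patch_state; };

    /* A task can be switched only if none of the affected functions are on
     * its stack (here reduced to one flag). */
    static bool try_switch(struct task *t, int target)
    {
            if (t->patch_state == target)
                    return true;
            if (t->on_patched_func)
                    return false;
            t->patch_state = target;
            return true;
    }

    int main(void)
    {
            struct task tasks[] = {
                    { false, KLP_UNPATCHED },
                    { true,  KLP_UNPATCHED },   /* sleeping in a patched func */
            };
            int pass, i;

            for (pass = 1; pass <= 3; pass++) {
                    bool complete = true;

                    for (i = 0; i < 2; i++)
                            if (!try_switch(&tasks[i], KLP_PATCHED))
                                    complete = false;

                    printf("pass %d: %s\n", pass, complete ? "done" : "retry later");
                    if (complete)
                            break;
                    tasks[1].on_patched_func = false;   /* task eventually moves on */
            }
            return 0;
    }
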
391
392/*
393 * Start the transition to the specified target patch state so tasks can begin
394 * switching to it.
395 */
396void klp_start_transition(void)
397{
398 struct task_struct *g, *task;
399 unsigned int cpu;
400
401 WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
402
403 pr_notice("'%s': %s...\n", klp_transition_patch->mod->name,
404 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
405
406 /*
407 * If the patch can be applied or reverted immediately, skip the
408 * per-task transitions.
409 */
410 if (klp_transition_patch->immediate)
411 return;
412
413 /*
414 * Mark all normal tasks as needing a patch state update. They'll
415 * switch either in klp_try_complete_transition() or as they exit the
416 * kernel.
417 */
418 read_lock(&tasklist_lock);
419 for_each_process_thread(g, task)
420 if (task->patch_state != klp_target_state)
421 set_tsk_thread_flag(task, TIF_PATCH_PENDING);
422 read_unlock(&tasklist_lock);
423
424 /*
425 * Mark all idle tasks as needing a patch state update. They'll switch
426 * either in klp_try_complete_transition() or at the idle loop switch
427 * point.
428 */
429 for_each_possible_cpu(cpu) {
430 task = idle_task(cpu);
431 if (task->patch_state != klp_target_state)
432 set_tsk_thread_flag(task, TIF_PATCH_PENDING);
433 }
434}
435
436/*
437 * Initialize the global target patch state and all tasks to the initial patch
438 * state, and initialize all function transition states to true in preparation
439 * for patching or unpatching.
440 */
441void klp_init_transition(struct klp_patch *patch, int state)
442{
443 struct task_struct *g, *task;
444 unsigned int cpu;
445 struct klp_object *obj;
446 struct klp_func *func;
447 int initial_state = !state;
448
449 WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED);
450
451 klp_transition_patch = patch;
452
453 /*
454 * Set the global target patch state which tasks will switch to. This
455 * has no effect until the TIF_PATCH_PENDING flags get set later.
456 */
457 klp_target_state = state;
458
459 /*
460 * If the patch can be applied or reverted immediately, skip the
461 * per-task transitions.
462 */
463 if (patch->immediate)
464 return;
465
466 /*
467 * Initialize all tasks to the initial patch state to prepare them for
468 * switching to the target state.
469 */
470 read_lock(&tasklist_lock);
471 for_each_process_thread(g, task) {
472 WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
473 task->patch_state = initial_state;
474 }
475 read_unlock(&tasklist_lock);
476
477 /*
478 * Ditto for the idle "swapper" tasks.
479 */
480 for_each_possible_cpu(cpu) {
481 task = idle_task(cpu);
482 WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
483 task->patch_state = initial_state;
484 }
485
486 /*
487 * Enforce the order of the task->patch_state initializations and the
488 * func->transition updates to ensure that klp_ftrace_handler() doesn't
489 * see a func in transition with a task->patch_state of KLP_UNDEFINED.
490 *
491 * Also enforce the order of the klp_target_state write and future
492 * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
493 * set a task->patch_state to KLP_UNDEFINED.
494 */
495 smp_wmb();
496
497 /*
498 * Set the func transition states so klp_ftrace_handler() will know to
499 * switch to the transition logic.
500 *
501 * When patching, the funcs aren't yet in the func_stack and will be
502 * made visible to the ftrace handler shortly by the calls to
503 * klp_patch_object().
504 *
505 * When unpatching, the funcs are already in the func_stack and so are
506 * already visible to the ftrace handler.
507 */
508 klp_for_each_object(patch, obj)
509 klp_for_each_func(obj, func)
510 func->transition = true;
511}
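
The smp_wmb() above pairs with the smp_rmb() calls in klp_ftrace_handler() and with the implicit barrier in klp_update_patch_state(): once a reader observes func->transition (or TIF_PATCH_PENDING), it must also observe the initialized patch state. A C11 model of that publish/observe pairing, using fences in place of the kernel barriers; the names and the single-threaded demo are illustrative only:

    #include <stdatomic.h>
    #include <stdio.h>

    static int patch_state;             /* plain data: per-task state */
    static atomic_int transition;       /* flag read by the "handler" side */

    /* Writer (init side): publish patch_state before the transition flag. */
    static void init_transition(void)
    {
            patch_state = 1;                                /* KLP_UNPATCHED, say */
            atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
            atomic_store_explicit(&transition, 1, memory_order_relaxed);
    }

    /* Reader (handler side): if transition is seen, the state must already
     * be initialized, thanks to the pairing acquire fence (~ smp_rmb()). */
    static void handler(void)
    {
            if (atomic_load_explicit(&transition, memory_order_relaxed)) {
                    atomic_thread_fence(memory_order_acquire);
                    printf("patch_state=%d (never uninitialized here)\n", patch_state);
            }
    }

    int main(void)
    {
            init_transition();
            handler();
            return 0;
    }
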
512
513/*
514 * This function can be called in the middle of an existing transition to
515 * reverse the direction of the target patch state. This can be done to
516 * effectively cancel an existing enable or disable operation if there are any
517 * tasks which are stuck in the initial patch state.
518 */
519void klp_reverse_transition(void)
520{
521 unsigned int cpu;
522 struct task_struct *g, *task;
523
524 klp_transition_patch->enabled = !klp_transition_patch->enabled;
525
526 klp_target_state = !klp_target_state;
527
528 /*
529 * Clear all TIF_PATCH_PENDING flags to prevent races caused by
530 * klp_update_patch_state() running in parallel with
531 * klp_start_transition().
532 */
533 read_lock(&tasklist_lock);
534 for_each_process_thread(g, task)
535 clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
536 read_unlock(&tasklist_lock);
537
538 for_each_possible_cpu(cpu)
539 clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
540
541 /* Let any remaining calls to klp_update_patch_state() complete */
542 synchronize_rcu();
543
544 klp_start_transition();
545}
546
547/* Called from copy_process() during fork */
548void klp_copy_process(struct task_struct *child)
549{
550 child->patch_state = current->patch_state;
551
552 /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
553}
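
Taken together, klp_target_state is a small tri-state machine: KLP_UNDEFINED outside a transition, KLP_PATCHED or KLP_UNPATCHED during one, and a reversal simply flips the target and restarts the transition. A tiny sketch of just that bookkeeping, with the task flags and synchronization omitted:

    #include <stdio.h>

    enum klp_state { KLP_UNDEFINED = -1, KLP_UNPATCHED = 0, KLP_PATCHED = 1 };

    static enum klp_state target = KLP_UNDEFINED;

    static void start(enum klp_state s) { target = s; }

    /* Only meaningful mid-transition: PATCHED <-> UNPATCHED. */
    static void reverse(void) { target = !target; }

    static void complete(void) { target = KLP_UNDEFINED; }

    int main(void)
    {
            start(KLP_PATCHED);             /* enable begins */
            reverse();                      /* stuck tasks: turn it into a disable */
            printf("target=%d\n", target);  /* 0 == KLP_UNPATCHED */
            complete();
            return 0;
    }
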
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
new file mode 100644
index 000000000000..ce09b326546c
--- /dev/null
+++ b/kernel/livepatch/transition.h
@@ -0,0 +1,14 @@
1#ifndef _LIVEPATCH_TRANSITION_H
2#define _LIVEPATCH_TRANSITION_H
3
4#include <linux/livepatch.h>
5
6extern struct klp_patch *klp_transition_patch;
7
8void klp_init_transition(struct klp_patch *patch, int state);
9void klp_cancel_transition(void);
10void klp_start_transition(void);
11void klp_try_complete_transition(void);
12void klp_reverse_transition(void);
13
14#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a95e5d1f4a9c..c0e31bfee25c 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -30,6 +30,7 @@
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/sched/clock.h> 31#include <linux/sched/clock.h>
32#include <linux/sched/task.h> 32#include <linux/sched/task.h>
33#include <linux/sched/mm.h>
33#include <linux/delay.h> 34#include <linux/delay.h>
34#include <linux/module.h> 35#include <linux/module.h>
35#include <linux/proc_fs.h> 36#include <linux/proc_fs.h>
@@ -660,6 +661,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
660 struct lockdep_subclass_key *key; 661 struct lockdep_subclass_key *key;
661 struct hlist_head *hash_head; 662 struct hlist_head *hash_head;
662 struct lock_class *class; 663 struct lock_class *class;
664 bool is_static = false;
663 665
664 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { 666 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
665 debug_locks_off(); 667 debug_locks_off();
@@ -673,10 +675,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
673 675
674 /* 676 /*
675 * Static locks do not have their class-keys yet - for them the key 677 * Static locks do not have their class-keys yet - for them the key
676 * is the lock object itself: 678 * is the lock object itself. If the lock is in the per cpu area,
679 * the canonical address of the lock (per cpu offset removed) is
680 * used.
677 */ 681 */
678 if (unlikely(!lock->key)) 682 if (unlikely(!lock->key)) {
679 lock->key = (void *)lock; 683 unsigned long can_addr, addr = (unsigned long)lock;
684
685 if (__is_kernel_percpu_address(addr, &can_addr))
686 lock->key = (void *)can_addr;
687 else if (__is_module_percpu_address(addr, &can_addr))
688 lock->key = (void *)can_addr;
689 else if (static_obj(lock))
690 lock->key = (void *)lock;
691 else
692 return ERR_PTR(-EINVAL);
693 is_static = true;
694 }
680 695
681 /* 696 /*
682 * NOTE: the class-key must be unique. For dynamic locks, a static 697 * NOTE: the class-key must be unique. For dynamic locks, a static
@@ -708,7 +723,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
708 } 723 }
709 } 724 }
710 725
711 return NULL; 726 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
712} 727}
713 728
714/* 729/*
@@ -726,19 +741,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
726 DEBUG_LOCKS_WARN_ON(!irqs_disabled()); 741 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
727 742
728 class = look_up_lock_class(lock, subclass); 743 class = look_up_lock_class(lock, subclass);
729 if (likely(class)) 744 if (likely(!IS_ERR_OR_NULL(class)))
730 goto out_set_class_cache; 745 goto out_set_class_cache;
731 746
732 /* 747 /*
733 * Debug-check: all keys must be persistent! 748 * Debug-check: all keys must be persistent!
734 */ 749 */
735 if (!static_obj(lock->key)) { 750 if (IS_ERR(class)) {
736 debug_locks_off(); 751 debug_locks_off();
737 printk("INFO: trying to register non-static key.\n"); 752 printk("INFO: trying to register non-static key.\n");
738 printk("the code is fine but needs lockdep annotation.\n"); 753 printk("the code is fine but needs lockdep annotation.\n");
739 printk("turning off the locking correctness validator.\n"); 754 printk("turning off the locking correctness validator.\n");
740 dump_stack(); 755 dump_stack();
741
742 return NULL; 756 return NULL;
743 } 757 }
744 758
@@ -1144,10 +1158,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1144 return 0; 1158 return 0;
1145 1159
1146 printk("\n"); 1160 printk("\n");
1147 printk("======================================================\n"); 1161 pr_warn("======================================================\n");
1148 printk("[ INFO: possible circular locking dependency detected ]\n"); 1162 pr_warn("WARNING: possible circular locking dependency detected\n");
1149 print_kernel_ident(); 1163 print_kernel_ident();
1150 printk("-------------------------------------------------------\n"); 1164 pr_warn("------------------------------------------------------\n");
1151 printk("%s/%d is trying to acquire lock:\n", 1165 printk("%s/%d is trying to acquire lock:\n",
1152 curr->comm, task_pid_nr(curr)); 1166 curr->comm, task_pid_nr(curr));
1153 print_lock(check_src); 1167 print_lock(check_src);
@@ -1482,11 +1496,11 @@ print_bad_irq_dependency(struct task_struct *curr,
1482 return 0; 1496 return 0;
1483 1497
1484 printk("\n"); 1498 printk("\n");
1485 printk("======================================================\n"); 1499 pr_warn("=====================================================\n");
1486 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1500 pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
1487 irqclass, irqclass); 1501 irqclass, irqclass);
1488 print_kernel_ident(); 1502 print_kernel_ident();
1489 printk("------------------------------------------------------\n"); 1503 pr_warn("-----------------------------------------------------\n");
1490 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1504 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1491 curr->comm, task_pid_nr(curr), 1505 curr->comm, task_pid_nr(curr),
1492 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1506 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1711,10 +1725,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1711 return 0; 1725 return 0;
1712 1726
1713 printk("\n"); 1727 printk("\n");
1714 printk("=============================================\n"); 1728 pr_warn("============================================\n");
1715 printk("[ INFO: possible recursive locking detected ]\n"); 1729 pr_warn("WARNING: possible recursive locking detected\n");
1716 print_kernel_ident(); 1730 print_kernel_ident();
1717 printk("---------------------------------------------\n"); 1731 pr_warn("--------------------------------------------\n");
1718 printk("%s/%d is trying to acquire lock:\n", 1732 printk("%s/%d is trying to acquire lock:\n",
1719 curr->comm, task_pid_nr(curr)); 1733 curr->comm, task_pid_nr(curr));
1720 print_lock(next); 1734 print_lock(next);
@@ -2061,10 +2075,10 @@ static void print_collision(struct task_struct *curr,
2061 struct lock_chain *chain) 2075 struct lock_chain *chain)
2062{ 2076{
2063 printk("\n"); 2077 printk("\n");
2064 printk("======================\n"); 2078 pr_warn("============================\n");
2065 printk("[chain_key collision ]\n"); 2079 pr_warn("WARNING: chain_key collision\n");
2066 print_kernel_ident(); 2080 print_kernel_ident();
2067 printk("----------------------\n"); 2081 pr_warn("----------------------------\n");
2068 printk("%s/%d: ", current->comm, task_pid_nr(current)); 2082 printk("%s/%d: ", current->comm, task_pid_nr(current));
2069 printk("Hash chain already cached but the contents don't match!\n"); 2083 printk("Hash chain already cached but the contents don't match!\n");
2070 2084
@@ -2360,10 +2374,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2360 return 0; 2374 return 0;
2361 2375
2362 printk("\n"); 2376 printk("\n");
2363 printk("=================================\n"); 2377 pr_warn("================================\n");
2364 printk("[ INFO: inconsistent lock state ]\n"); 2378 pr_warn("WARNING: inconsistent lock state\n");
2365 print_kernel_ident(); 2379 print_kernel_ident();
2366 printk("---------------------------------\n"); 2380 pr_warn("--------------------------------\n");
2367 2381
2368 printk("inconsistent {%s} -> {%s} usage.\n", 2382 printk("inconsistent {%s} -> {%s} usage.\n",
2369 usage_str[prev_bit], usage_str[new_bit]); 2383 usage_str[prev_bit], usage_str[new_bit]);
@@ -2425,10 +2439,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2425 return 0; 2439 return 0;
2426 2440
2427 printk("\n"); 2441 printk("\n");
2428 printk("=========================================================\n"); 2442 pr_warn("========================================================\n");
2429 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2443 pr_warn("WARNING: possible irq lock inversion dependency detected\n");
2430 print_kernel_ident(); 2444 print_kernel_ident();
2431 printk("---------------------------------------------------------\n"); 2445 pr_warn("--------------------------------------------------------\n");
2432 printk("%s/%d just changed the state of lock:\n", 2446 printk("%s/%d just changed the state of lock:\n",
2433 curr->comm, task_pid_nr(curr)); 2447 curr->comm, task_pid_nr(curr));
2434 print_lock(this); 2448 print_lock(this);
@@ -2863,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2863 if (unlikely(!debug_locks)) 2877 if (unlikely(!debug_locks))
2864 return; 2878 return;
2865 2879
2880 gfp_mask = current_gfp_context(gfp_mask);
2881
2866 /* no reclaim without waiting on it */ 2882 /* no reclaim without waiting on it */
2867 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 2883 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
2868 return; 2884 return;
@@ -2872,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2872 return; 2888 return;
2873 2889
2874 /* We're only interested __GFP_FS allocations for now */ 2890 /* We're only interested __GFP_FS allocations for now */
2875 if (!(gfp_mask & __GFP_FS)) 2891 if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
2876 return; 2892 return;
2877 2893
2878 /* 2894 /*
@@ -2881,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2881 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) 2897 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2882 return; 2898 return;
2883 2899
2900 /* Disable lockdep if explicitly requested */
2901 if (gfp_mask & __GFP_NOLOCKDEP)
2902 return;
2903
2884 mark_held_locks(curr, RECLAIM_FS); 2904 mark_held_locks(curr, RECLAIM_FS);
2885} 2905}
2886 2906
@@ -3170,10 +3190,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
3170 return 0; 3190 return 0;
3171 3191
3172 printk("\n"); 3192 printk("\n");
3173 printk("==================================\n"); 3193 pr_warn("==================================\n");
3174 printk("[ BUG: Nested lock was not taken ]\n"); 3194 pr_warn("WARNING: Nested lock was not taken\n");
3175 print_kernel_ident(); 3195 print_kernel_ident();
3176 printk("----------------------------------\n"); 3196 pr_warn("----------------------------------\n");
3177 3197
3178 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); 3198 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
3179 print_lock(hlock); 3199 print_lock(hlock);
@@ -3383,10 +3403,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3383 return 0; 3403 return 0;
3384 3404
3385 printk("\n"); 3405 printk("\n");
3386 printk("=====================================\n"); 3406 pr_warn("=====================================\n");
3387 printk("[ BUG: bad unlock balance detected! ]\n"); 3407 pr_warn("WARNING: bad unlock balance detected!\n");
3388 print_kernel_ident(); 3408 print_kernel_ident();
3389 printk("-------------------------------------\n"); 3409 pr_warn("-------------------------------------\n");
3390 printk("%s/%d is trying to release lock (", 3410 printk("%s/%d is trying to release lock (",
3391 curr->comm, task_pid_nr(curr)); 3411 curr->comm, task_pid_nr(curr));
3392 print_lockdep_cache(lock); 3412 print_lockdep_cache(lock);
@@ -3419,7 +3439,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3419 * Clearly if the lock hasn't been acquired _ever_, we're not 3439 * Clearly if the lock hasn't been acquired _ever_, we're not
3420 * holding it either, so report failure. 3440 * holding it either, so report failure.
3421 */ 3441 */
3422 if (!class) 3442 if (IS_ERR_OR_NULL(class))
3423 return 0; 3443 return 0;
3424 3444
3425 /* 3445 /*
@@ -3437,13 +3457,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3437 return 0; 3457 return 0;
3438} 3458}
3439 3459
3460/* @depth must not be zero */
3461static struct held_lock *find_held_lock(struct task_struct *curr,
3462 struct lockdep_map *lock,
3463 unsigned int depth, int *idx)
3464{
3465 struct held_lock *ret, *hlock, *prev_hlock;
3466 int i;
3467
3468 i = depth - 1;
3469 hlock = curr->held_locks + i;
3470 ret = hlock;
3471 if (match_held_lock(hlock, lock))
3472 goto out;
3473
3474 ret = NULL;
3475 for (i--, prev_hlock = hlock--;
3476 i >= 0;
3477 i--, prev_hlock = hlock--) {
3478 /*
3479 * We must not cross into another context:
3480 */
3481 if (prev_hlock->irq_context != hlock->irq_context) {
3482 ret = NULL;
3483 break;
3484 }
3485 if (match_held_lock(hlock, lock)) {
3486 ret = hlock;
3487 break;
3488 }
3489 }
3490
3491out:
3492 *idx = i;
3493 return ret;
3494}
3495
3496static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
3497 int idx)
3498{
3499 struct held_lock *hlock;
3500
3501 for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
3502 if (!__lock_acquire(hlock->instance,
3503 hlock_class(hlock)->subclass,
3504 hlock->trylock,
3505 hlock->read, hlock->check,
3506 hlock->hardirqs_off,
3507 hlock->nest_lock, hlock->acquire_ip,
3508 hlock->references, hlock->pin_count))
3509 return 1;
3510 }
3511 return 0;
3512}
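
find_held_lock() and reacquire_held_locks() factor out the "search the held-lock stack from the top without crossing an irq-context boundary, then re-acquire what was popped" pattern that __lock_set_class(), __lock_release(), __lock_contended() and __lock_acquired() used to open-code. A user-space sketch of the search half, assuming a plain array of held locks:

    #include <stdio.h>

    struct held { const void *instance; int irq_context; };

    /* Search from the top of the stack down, never crossing into a
     * different irq context; return the index or -1. */
    static int find_held_lock(const struct held *stack, int depth, const void *lock)
    {
            int i = depth - 1;

            if (stack[i].instance == lock)
                    return i;

            for (i--; i >= 0; i--) {
                    if (stack[i].irq_context != stack[i + 1].irq_context)
                            return -1;      /* crossed a context boundary */
                    if (stack[i].instance == lock)
                            return i;
            }
            return -1;
    }

    int main(void)
    {
            int a, b, c;
            struct held stack[] = { { &a, 0 }, { &b, 0 }, { &c, 1 } };

            printf("%d\n", find_held_lock(stack, 3, &b));   /* -1: behind an irq boundary */
            printf("%d\n", find_held_lock(stack, 3, &c));   /* 2 */
            return 0;
    }
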
3513
3440static int 3514static int
3441__lock_set_class(struct lockdep_map *lock, const char *name, 3515__lock_set_class(struct lockdep_map *lock, const char *name,
3442 struct lock_class_key *key, unsigned int subclass, 3516 struct lock_class_key *key, unsigned int subclass,
3443 unsigned long ip) 3517 unsigned long ip)
3444{ 3518{
3445 struct task_struct *curr = current; 3519 struct task_struct *curr = current;
3446 struct held_lock *hlock, *prev_hlock; 3520 struct held_lock *hlock;
3447 struct lock_class *class; 3521 struct lock_class *class;
3448 unsigned int depth; 3522 unsigned int depth;
3449 int i; 3523 int i;
@@ -3456,21 +3530,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3456 if (DEBUG_LOCKS_WARN_ON(!depth)) 3530 if (DEBUG_LOCKS_WARN_ON(!depth))
3457 return 0; 3531 return 0;
3458 3532
3459 prev_hlock = NULL; 3533 hlock = find_held_lock(curr, lock, depth, &i);
3460 for (i = depth-1; i >= 0; i--) { 3534 if (!hlock)
3461 hlock = curr->held_locks + i; 3535 return print_unlock_imbalance_bug(curr, lock, ip);
3462 /*
3463 * We must not cross into another context:
3464 */
3465 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3466 break;
3467 if (match_held_lock(hlock, lock))
3468 goto found_it;
3469 prev_hlock = hlock;
3470 }
3471 return print_unlock_imbalance_bug(curr, lock, ip);
3472 3536
3473found_it:
3474 lockdep_init_map(lock, name, key, 0); 3537 lockdep_init_map(lock, name, key, 0);
3475 class = register_lock_class(lock, subclass, 0); 3538 class = register_lock_class(lock, subclass, 0);
3476 hlock->class_idx = class - lock_classes + 1; 3539 hlock->class_idx = class - lock_classes + 1;
@@ -3478,15 +3541,46 @@ found_it:
3478 curr->lockdep_depth = i; 3541 curr->lockdep_depth = i;
3479 curr->curr_chain_key = hlock->prev_chain_key; 3542 curr->curr_chain_key = hlock->prev_chain_key;
3480 3543
3481 for (; i < depth; i++) { 3544 if (reacquire_held_locks(curr, depth, i))
3482 hlock = curr->held_locks + i; 3545 return 0;
3483 if (!__lock_acquire(hlock->instance, 3546
3484 hlock_class(hlock)->subclass, hlock->trylock, 3547 /*
3485 hlock->read, hlock->check, hlock->hardirqs_off, 3548 * I took it apart and put it back together again, except now I have
3486 hlock->nest_lock, hlock->acquire_ip, 3549 * these 'spare' parts.. where shall I put them.
3487 hlock->references, hlock->pin_count)) 3550 */
3488 return 0; 3551 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
3489 } 3552 return 0;
3553 return 1;
3554}
3555
3556static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
3557{
3558 struct task_struct *curr = current;
3559 struct held_lock *hlock;
3560 unsigned int depth;
3561 int i;
3562
3563 depth = curr->lockdep_depth;
3564 /*
3565 * This function is about (re)setting the class of a held lock,
3566 * yet we're not actually holding any locks. Naughty user!
3567 */
3568 if (DEBUG_LOCKS_WARN_ON(!depth))
3569 return 0;
3570
3571 hlock = find_held_lock(curr, lock, depth, &i);
3572 if (!hlock)
3573 return print_unlock_imbalance_bug(curr, lock, ip);
3574
3575 curr->lockdep_depth = i;
3576 curr->curr_chain_key = hlock->prev_chain_key;
3577
3578 WARN(hlock->read, "downgrading a read lock");
3579 hlock->read = 1;
3580 hlock->acquire_ip = ip;
3581
3582 if (reacquire_held_locks(curr, depth, i))
3583 return 0;
3490 3584
3491 /* 3585 /*
3492 * I took it apart and put it back together again, except now I have 3586 * I took it apart and put it back together again, except now I have
@@ -3508,7 +3602,7 @@ static int
3508__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) 3602__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
3509{ 3603{
3510 struct task_struct *curr = current; 3604 struct task_struct *curr = current;
3511 struct held_lock *hlock, *prev_hlock; 3605 struct held_lock *hlock;
3512 unsigned int depth; 3606 unsigned int depth;
3513 int i; 3607 int i;
3514 3608
@@ -3527,21 +3621,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
3527 * Check whether the lock exists in the current stack 3621 * Check whether the lock exists in the current stack
3528 * of held locks: 3622 * of held locks:
3529 */ 3623 */
3530 prev_hlock = NULL; 3624 hlock = find_held_lock(curr, lock, depth, &i);
3531 for (i = depth-1; i >= 0; i--) { 3625 if (!hlock)
3532 hlock = curr->held_locks + i; 3626 return print_unlock_imbalance_bug(curr, lock, ip);
3533 /*
3534 * We must not cross into another context:
3535 */
3536 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3537 break;
3538 if (match_held_lock(hlock, lock))
3539 goto found_it;
3540 prev_hlock = hlock;
3541 }
3542 return print_unlock_imbalance_bug(curr, lock, ip);
3543 3627
3544found_it:
3545 if (hlock->instance == lock) 3628 if (hlock->instance == lock)
3546 lock_release_holdtime(hlock); 3629 lock_release_holdtime(hlock);
3547 3630
@@ -3568,15 +3651,8 @@ found_it:
3568 curr->lockdep_depth = i; 3651 curr->lockdep_depth = i;
3569 curr->curr_chain_key = hlock->prev_chain_key; 3652 curr->curr_chain_key = hlock->prev_chain_key;
3570 3653
3571 for (i++; i < depth; i++) { 3654 if (reacquire_held_locks(curr, depth, i + 1))
3572 hlock = curr->held_locks + i; 3655 return 0;
3573 if (!__lock_acquire(hlock->instance,
3574 hlock_class(hlock)->subclass, hlock->trylock,
3575 hlock->read, hlock->check, hlock->hardirqs_off,
3576 hlock->nest_lock, hlock->acquire_ip,
3577 hlock->references, hlock->pin_count))
3578 return 0;
3579 }
3580 3656
3581 /* 3657 /*
3582 * We had N bottles of beer on the wall, we drank one, but now 3658 * We had N bottles of beer on the wall, we drank one, but now
@@ -3741,6 +3817,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
3741} 3817}
3742EXPORT_SYMBOL_GPL(lock_set_class); 3818EXPORT_SYMBOL_GPL(lock_set_class);
3743 3819
3820void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
3821{
3822 unsigned long flags;
3823
3824 if (unlikely(current->lockdep_recursion))
3825 return;
3826
3827 raw_local_irq_save(flags);
3828 current->lockdep_recursion = 1;
3829 check_flags(flags);
3830 if (__lock_downgrade(lock, ip))
3831 check_chain_key(current);
3832 current->lockdep_recursion = 0;
3833 raw_local_irq_restore(flags);
3834}
3835EXPORT_SYMBOL_GPL(lock_downgrade);
3836
3744/* 3837/*
3745 * We are not always called with irqs disabled - do that here, 3838 * We are not always called with irqs disabled - do that here,
3746 * and also avoid lockdep recursion: 3839 * and also avoid lockdep recursion:
@@ -3861,13 +3954,15 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock);
3861 3954
3862void lockdep_set_current_reclaim_state(gfp_t gfp_mask) 3955void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
3863{ 3956{
3864 current->lockdep_reclaim_gfp = gfp_mask; 3957 current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
3865} 3958}
3959EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
3866 3960
3867void lockdep_clear_current_reclaim_state(void) 3961void lockdep_clear_current_reclaim_state(void)
3868{ 3962{
3869 current->lockdep_reclaim_gfp = 0; 3963 current->lockdep_reclaim_gfp = 0;
3870} 3964}
3965EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
3871 3966
3872#ifdef CONFIG_LOCK_STAT 3967#ifdef CONFIG_LOCK_STAT
3873static int 3968static int
@@ -3880,10 +3975,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3880 return 0; 3975 return 0;
3881 3976
3882 printk("\n"); 3977 printk("\n");
3883 printk("=================================\n"); 3978 pr_warn("=================================\n");
3884 printk("[ BUG: bad contention detected! ]\n"); 3979 pr_warn("WARNING: bad contention detected!\n");
3885 print_kernel_ident(); 3980 print_kernel_ident();
3886 printk("---------------------------------\n"); 3981 pr_warn("---------------------------------\n");
3887 printk("%s/%d is trying to contend lock (", 3982 printk("%s/%d is trying to contend lock (",
3888 curr->comm, task_pid_nr(curr)); 3983 curr->comm, task_pid_nr(curr));
3889 print_lockdep_cache(lock); 3984 print_lockdep_cache(lock);
@@ -3903,7 +3998,7 @@ static void
3903__lock_contended(struct lockdep_map *lock, unsigned long ip) 3998__lock_contended(struct lockdep_map *lock, unsigned long ip)
3904{ 3999{
3905 struct task_struct *curr = current; 4000 struct task_struct *curr = current;
3906 struct held_lock *hlock, *prev_hlock; 4001 struct held_lock *hlock;
3907 struct lock_class_stats *stats; 4002 struct lock_class_stats *stats;
3908 unsigned int depth; 4003 unsigned int depth;
3909 int i, contention_point, contending_point; 4004 int i, contention_point, contending_point;
@@ -3916,22 +4011,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3916 if (DEBUG_LOCKS_WARN_ON(!depth)) 4011 if (DEBUG_LOCKS_WARN_ON(!depth))
3917 return; 4012 return;
3918 4013
3919 prev_hlock = NULL; 4014 hlock = find_held_lock(curr, lock, depth, &i);
3920 for (i = depth-1; i >= 0; i--) { 4015 if (!hlock) {
3921 hlock = curr->held_locks + i; 4016 print_lock_contention_bug(curr, lock, ip);
3922 /* 4017 return;
3923 * We must not cross into another context:
3924 */
3925 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3926 break;
3927 if (match_held_lock(hlock, lock))
3928 goto found_it;
3929 prev_hlock = hlock;
3930 } 4018 }
3931 print_lock_contention_bug(curr, lock, ip);
3932 return;
3933 4019
3934found_it:
3935 if (hlock->instance != lock) 4020 if (hlock->instance != lock)
3936 return; 4021 return;
3937 4022
@@ -3955,7 +4040,7 @@ static void
3955__lock_acquired(struct lockdep_map *lock, unsigned long ip) 4040__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3956{ 4041{
3957 struct task_struct *curr = current; 4042 struct task_struct *curr = current;
3958 struct held_lock *hlock, *prev_hlock; 4043 struct held_lock *hlock;
3959 struct lock_class_stats *stats; 4044 struct lock_class_stats *stats;
3960 unsigned int depth; 4045 unsigned int depth;
3961 u64 now, waittime = 0; 4046 u64 now, waittime = 0;
@@ -3969,22 +4054,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3969 if (DEBUG_LOCKS_WARN_ON(!depth)) 4054 if (DEBUG_LOCKS_WARN_ON(!depth))
3970 return; 4055 return;
3971 4056
3972 prev_hlock = NULL; 4057 hlock = find_held_lock(curr, lock, depth, &i);
3973 for (i = depth-1; i >= 0; i--) { 4058 if (!hlock) {
3974 hlock = curr->held_locks + i; 4059 print_lock_contention_bug(curr, lock, _RET_IP_);
3975 /* 4060 return;
3976 * We must not cross into another context:
3977 */
3978 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3979 break;
3980 if (match_held_lock(hlock, lock))
3981 goto found_it;
3982 prev_hlock = hlock;
3983 } 4061 }
3984 print_lock_contention_bug(curr, lock, _RET_IP_);
3985 return;
3986 4062
3987found_it:
3988 if (hlock->instance != lock) 4063 if (hlock->instance != lock)
3989 return; 4064 return;
3990 4065
@@ -4172,7 +4247,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
4172 * If the class exists we look it up and zap it: 4247 * If the class exists we look it up and zap it:
4173 */ 4248 */
4174 class = look_up_lock_class(lock, j); 4249 class = look_up_lock_class(lock, j);
4175 if (class) 4250 if (!IS_ERR_OR_NULL(class))
4176 zap_class(class); 4251 zap_class(class);
4177 } 4252 }
4178 /* 4253 /*
@@ -4244,10 +4319,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
4244 return; 4319 return;
4245 4320
4246 printk("\n"); 4321 printk("\n");
4247 printk("=========================\n"); 4322 pr_warn("=========================\n");
4248 printk("[ BUG: held lock freed! ]\n"); 4323 pr_warn("WARNING: held lock freed!\n");
4249 print_kernel_ident(); 4324 print_kernel_ident();
4250 printk("-------------------------\n"); 4325 pr_warn("-------------------------\n");
4251 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4326 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
4252 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4327 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
4253 print_lock(hlock); 4328 print_lock(hlock);
@@ -4302,11 +4377,11 @@ static void print_held_locks_bug(void)
4302 return; 4377 return;
4303 4378
4304 printk("\n"); 4379 printk("\n");
4305 printk("=====================================\n"); 4380 pr_warn("====================================\n");
4306 printk("[ BUG: %s/%d still has locks held! ]\n", 4381 pr_warn("WARNING: %s/%d still has locks held!\n",
4307 current->comm, task_pid_nr(current)); 4382 current->comm, task_pid_nr(current));
4308 print_kernel_ident(); 4383 print_kernel_ident();
4309 printk("-------------------------------------\n"); 4384 pr_warn("------------------------------------\n");
4310 lockdep_print_held_locks(current); 4385 lockdep_print_held_locks(current);
4311 printk("\nstack backtrace:\n"); 4386 printk("\nstack backtrace:\n");
4312 dump_stack(); 4387 dump_stack();
@@ -4371,7 +4446,7 @@ retry:
4371 } while_each_thread(g, p); 4446 } while_each_thread(g, p);
4372 4447
4373 printk("\n"); 4448 printk("\n");
4374 printk("=============================================\n\n"); 4449 pr_warn("=============================================\n\n");
4375 4450
4376 if (unlock) 4451 if (unlock)
4377 read_unlock(&tasklist_lock); 4452 read_unlock(&tasklist_lock);
@@ -4401,10 +4476,10 @@ asmlinkage __visible void lockdep_sys_exit(void)
4401 if (!debug_locks_off()) 4476 if (!debug_locks_off())
4402 return; 4477 return;
4403 printk("\n"); 4478 printk("\n");
4404 printk("================================================\n"); 4479 pr_warn("================================================\n");
4405 printk("[ BUG: lock held when returning to user space! ]\n"); 4480 pr_warn("WARNING: lock held when returning to user space!\n");
4406 print_kernel_ident(); 4481 print_kernel_ident();
4407 printk("------------------------------------------------\n"); 4482 pr_warn("------------------------------------------------\n");
4408 printk("%s/%d is leaving the kernel with locks still held!\n", 4483 printk("%s/%d is leaving the kernel with locks still held!\n",
4409 curr->comm, curr->pid); 4484 curr->comm, curr->pid);
4410 lockdep_print_held_locks(curr); 4485 lockdep_print_held_locks(curr);
@@ -4421,13 +4496,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4421#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4496#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4422 /* Note: the following can be executed concurrently, so be careful. */ 4497 /* Note: the following can be executed concurrently, so be careful. */
4423 printk("\n"); 4498 printk("\n");
4424 pr_err("===============================\n"); 4499 pr_warn("=============================\n");
4425 pr_err("[ ERR: suspicious RCU usage. ]\n"); 4500 pr_warn("WARNING: suspicious RCU usage\n");
4426 print_kernel_ident(); 4501 print_kernel_ident();
4427 pr_err("-------------------------------\n"); 4502 pr_warn("-----------------------------\n");
4428 pr_err("%s:%d %s!\n", file, line, s); 4503 printk("%s:%d %s!\n", file, line, s);
4429 pr_err("\nother info that might help us debug this:\n\n"); 4504 printk("\nother info that might help us debug this:\n\n");
4430 pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4505 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4431 !rcu_lockdep_current_cpu_online() 4506 !rcu_lockdep_current_cpu_online()
4432 ? "RCU used illegally from offline CPU!\n" 4507 ? "RCU used illegally from offline CPU!\n"
4433 : !rcu_is_watching() 4508 : !rcu_is_watching()
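
With this change look_up_lock_class() distinguishes "no class registered yet" (NULL) from "the key is not a static object" (ERR_PTR(-EINVAL)), which is why its callers switch to IS_ERR_OR_NULL()/IS_ERR(). A self-contained sketch of that pointer-encoded error convention, using simplified stand-ins for the <linux/err.h> macros and an invented look_up() helper:

    #include <stdio.h>
    #include <errno.h>

    /* Simplified versions of the <linux/err.h> helpers: small negative
     * errno values are stuffed into the top of the pointer range. */
    #define MAX_ERRNO       4095
    #define ERR_PTR(err)    ((void *)(long)(err))
    #define PTR_ERR(ptr)    ((long)(ptr))
    #define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
    #define IS_ERR_OR_NULL(ptr) (!(ptr) || IS_ERR(ptr))

    static int dummy_class;

    static void *look_up(int have_class, int static_key)
    {
            if (!static_key)
                    return ERR_PTR(-EINVAL);        /* caller must warn and bail */
            return have_class ? (void *)&dummy_class : NULL; /* NULL: register one */
    }

    int main(void)
    {
            void *class = look_up(0, 0);

            if (IS_ERR(class))
                    printf("non-static key: %ld\n", PTR_ERR(class)); /* -22 */
            else if (!class)
                    printf("no class yet, register one\n");
            return 0;
    }
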
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 97ee9df32e0f..58e366ad36f4 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
102 return; 102 return;
103 } 103 }
104 104
105 printk("\n============================================\n"); 105 pr_warn("\n");
106 printk( "[ BUG: circular locking deadlock detected! ]\n"); 106 pr_warn("============================================\n");
107 printk("%s\n", print_tainted()); 107 pr_warn("WARNING: circular locking deadlock detected!\n");
108 printk( "--------------------------------------------\n"); 108 pr_warn("%s\n", print_tainted());
109 pr_warn("--------------------------------------------\n");
109 printk("%s/%d is deadlocking current task %s/%d\n\n", 110 printk("%s/%d is deadlocking current task %s/%d\n\n",
110 task->comm, task_pid_nr(task), 111 task->comm, task_pid_nr(task),
111 current->comm, task_pid_nr(current)); 112 current->comm, task_pid_nr(current));
@@ -174,12 +175,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
174 lock->name = name; 175 lock->name = name;
175} 176}
176 177
177void
178rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
179{
180}
181
182void rt_mutex_deadlock_account_unlock(struct task_struct *task)
183{
184}
185
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index d0519c3432b6..b585af9a1b50 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
9 * This file contains macros used solely by rtmutex.c. Debug version. 9 * This file contains macros used solely by rtmutex.c. Debug version.
10 */ 10 */
11 11
12extern void
13rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
14extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
15extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); 12extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
16extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); 13extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
17extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); 14extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6edc32ecd9c5..b95509416909 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
224} 224}
225#endif 225#endif
226 226
227/*
228 * Only use with rt_mutex_waiter_{less,equal}()
229 */
230#define task_to_waiter(p) \
231 &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
232
227static inline int 233static inline int
228rt_mutex_waiter_less(struct rt_mutex_waiter *left, 234rt_mutex_waiter_less(struct rt_mutex_waiter *left,
229 struct rt_mutex_waiter *right) 235 struct rt_mutex_waiter *right)
@@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
238 * then right waiter has a dl_prio() too. 244 * then right waiter has a dl_prio() too.
239 */ 245 */
240 if (dl_prio(left->prio)) 246 if (dl_prio(left->prio))
241 return dl_time_before(left->task->dl.deadline, 247 return dl_time_before(left->deadline, right->deadline);
242 right->task->dl.deadline);
243 248
244 return 0; 249 return 0;
245} 250}
246 251
252static inline int
253rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
254 struct rt_mutex_waiter *right)
255{
256 if (left->prio != right->prio)
257 return 0;
258
259 /*
260 * If both waiters have dl_prio(), we check the deadlines of the
261 * associated tasks.
262 * If left waiter has a dl_prio(), and we didn't return 0 above,
263 * then right waiter has a dl_prio() too.
264 */
265 if (dl_prio(left->prio))
266 return left->deadline == right->deadline;
267
268 return 1;
269}
270
247static void 271static void
248rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) 272rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
249{ 273{
@@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
322 RB_CLEAR_NODE(&waiter->pi_tree_entry); 346 RB_CLEAR_NODE(&waiter->pi_tree_entry);
323} 347}
324 348
325/* 349static void rt_mutex_adjust_prio(struct task_struct *p)
326 * Calculate task priority from the waiter tree priority
327 *
328 * Return task->normal_prio when the waiter tree is empty or when
329 * the waiter is not allowed to do priority boosting
330 */
331int rt_mutex_getprio(struct task_struct *task)
332{
333 if (likely(!task_has_pi_waiters(task)))
334 return task->normal_prio;
335
336 return min(task_top_pi_waiter(task)->prio,
337 task->normal_prio);
338}
339
340struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
341{ 350{
342 if (likely(!task_has_pi_waiters(task))) 351 struct task_struct *pi_task = NULL;
343 return NULL;
344
345 return task_top_pi_waiter(task)->task;
346}
347 352
348/* 353 lockdep_assert_held(&p->pi_lock);
349 * Called by sched_setscheduler() to get the priority which will be
350 * effective after the change.
351 */
352int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
353{
354 if (!task_has_pi_waiters(task))
355 return newprio;
356 354
357 if (task_top_pi_waiter(task)->task->prio <= newprio) 355 if (task_has_pi_waiters(p))
358 return task_top_pi_waiter(task)->task->prio; 356 pi_task = task_top_pi_waiter(p)->task;
359 return newprio;
360}
361 357
362/* 358 rt_mutex_setprio(p, pi_task);
363 * Adjust the priority of a task, after its pi_waiters got modified.
364 *
365 * This can be both boosting and unboosting. task->pi_lock must be held.
366 */
367static void __rt_mutex_adjust_prio(struct task_struct *task)
368{
369 int prio = rt_mutex_getprio(task);
370
371 if (task->prio != prio || dl_prio(prio))
372 rt_mutex_setprio(task, prio);
373}
374
375/*
376 * Adjust task priority (undo boosting). Called from the exit path of
377 * rt_mutex_slowunlock() and rt_mutex_slowlock().
378 *
379 * (Note: We do this outside of the protection of lock->wait_lock to
380 * allow the lock to be taken while or before we readjust the priority
381 * of task. We do not use the spin_xx_mutex() variants here as we are
382 * outside of the debug path.)
383 */
384void rt_mutex_adjust_prio(struct task_struct *task)
385{
386 unsigned long flags;
387
388 raw_spin_lock_irqsave(&task->pi_lock, flags);
389 __rt_mutex_adjust_prio(task);
390 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
391} 359}
392 360
393/* 361/*
@@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
610 * enabled we continue, but stop the requeueing in the chain 578 * enabled we continue, but stop the requeueing in the chain
611 * walk. 579 * walk.
612 */ 580 */
613 if (waiter->prio == task->prio) { 581 if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
614 if (!detect_deadlock) 582 if (!detect_deadlock)
615 goto out_unlock_pi; 583 goto out_unlock_pi;
616 else 584 else
@@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
706 674
707 /* [7] Requeue the waiter in the lock waiter tree. */ 675 /* [7] Requeue the waiter in the lock waiter tree. */
708 rt_mutex_dequeue(lock, waiter); 676 rt_mutex_dequeue(lock, waiter);
677
678 /*
679 * Update the waiter prio fields now that we're dequeued.
680 *
681 * These values can have changed through either:
682 *
683 * sys_sched_set_scheduler() / sys_sched_setattr()
684 *
685 * or
686 *
687 * DL CBS enforcement advancing the effective deadline.
688 *
689 * Even though pi_waiters also uses these fields, and that tree is only
690 * updated in [11], we can do this here, since we hold [L], which
691 * serializes all pi_waiters access and rb_erase() does not care about
692 * the values of the node being removed.
693 */
709 waiter->prio = task->prio; 694 waiter->prio = task->prio;
695 waiter->deadline = task->dl.deadline;
696
710 rt_mutex_enqueue(lock, waiter); 697 rt_mutex_enqueue(lock, waiter);
711 698
712 /* [8] Release the task */ 699 /* [8] Release the task */
@@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
747 */ 734 */
748 rt_mutex_dequeue_pi(task, prerequeue_top_waiter); 735 rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
749 rt_mutex_enqueue_pi(task, waiter); 736 rt_mutex_enqueue_pi(task, waiter);
750 __rt_mutex_adjust_prio(task); 737 rt_mutex_adjust_prio(task);
751 738
752 } else if (prerequeue_top_waiter == waiter) { 739 } else if (prerequeue_top_waiter == waiter) {
753 /* 740 /*
@@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
763 rt_mutex_dequeue_pi(task, waiter); 750 rt_mutex_dequeue_pi(task, waiter);
764 waiter = rt_mutex_top_waiter(lock); 751 waiter = rt_mutex_top_waiter(lock);
765 rt_mutex_enqueue_pi(task, waiter); 752 rt_mutex_enqueue_pi(task, waiter);
766 __rt_mutex_adjust_prio(task); 753 rt_mutex_adjust_prio(task);
767 } else { 754 } else {
768 /* 755 /*
769 * Nothing changed. No need to do any priority 756 * Nothing changed. No need to do any priority
@@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
833static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, 820static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
834 struct rt_mutex_waiter *waiter) 821 struct rt_mutex_waiter *waiter)
835{ 822{
823 lockdep_assert_held(&lock->wait_lock);
824
836 /* 825 /*
837 * Before testing whether we can acquire @lock, we set the 826 * Before testing whether we can acquire @lock, we set the
838 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all 827 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
892 * the top waiter priority (kernel view), 881 * the top waiter priority (kernel view),
893 * @task lost. 882 * @task lost.
894 */ 883 */
895 if (task->prio >= rt_mutex_top_waiter(lock)->prio) 884 if (!rt_mutex_waiter_less(task_to_waiter(task),
885 rt_mutex_top_waiter(lock)))
896 return 0; 886 return 0;
897 887
898 /* 888 /*
@@ -938,8 +928,6 @@ takeit:
938 */ 928 */
939 rt_mutex_set_owner(lock, task); 929 rt_mutex_set_owner(lock, task);
940 930
941 rt_mutex_deadlock_account_lock(lock, task);
942
943 return 1; 931 return 1;
944} 932}
945 933
@@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
960 struct rt_mutex *next_lock; 948 struct rt_mutex *next_lock;
961 int chain_walk = 0, res; 949 int chain_walk = 0, res;
962 950
951 lockdep_assert_held(&lock->wait_lock);
952
963 /* 953 /*
964 * Early deadlock detection. We really don't want the task to 954 * Early deadlock detection. We really don't want the task to
965 * enqueue on itself just to untangle the mess later. It's not 955 * enqueue on itself just to untangle the mess later. It's not
@@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
973 return -EDEADLK; 963 return -EDEADLK;
974 964
975 raw_spin_lock(&task->pi_lock); 965 raw_spin_lock(&task->pi_lock);
976 __rt_mutex_adjust_prio(task); 966 rt_mutex_adjust_prio(task);
977 waiter->task = task; 967 waiter->task = task;
978 waiter->lock = lock; 968 waiter->lock = lock;
979 waiter->prio = task->prio; 969 waiter->prio = task->prio;
970 waiter->deadline = task->dl.deadline;
980 971
981 /* Get the top priority waiter on the lock */ 972 /* Get the top priority waiter on the lock */
982 if (rt_mutex_has_waiters(lock)) 973 if (rt_mutex_has_waiters(lock))
@@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
995 rt_mutex_dequeue_pi(owner, top_waiter); 986 rt_mutex_dequeue_pi(owner, top_waiter);
996 rt_mutex_enqueue_pi(owner, waiter); 987 rt_mutex_enqueue_pi(owner, waiter);
997 988
998 __rt_mutex_adjust_prio(owner); 989 rt_mutex_adjust_prio(owner);
999 if (owner->pi_blocked_on) 990 if (owner->pi_blocked_on)
1000 chain_walk = 1; 991 chain_walk = 1;
1001 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { 992 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
1047 waiter = rt_mutex_top_waiter(lock); 1038 waiter = rt_mutex_top_waiter(lock);
1048 1039
1049 /* 1040 /*
1050 * Remove it from current->pi_waiters. We do not adjust a 1041 * Remove it from current->pi_waiters and deboost.
1051 * possible priority boost right now. We execute wakeup in the 1042 *
1052 * boosted mode and go back to normal after releasing 1043 * We must in fact deboost here in order to ensure we call
1053 * lock->wait_lock. 1044 * rt_mutex_setprio() to update p->pi_top_task before the
1045 * task unblocks.
1054 */ 1046 */
1055 rt_mutex_dequeue_pi(current, waiter); 1047 rt_mutex_dequeue_pi(current, waiter);
1048 rt_mutex_adjust_prio(current);
1056 1049
1057 /* 1050 /*
1058 * As we are waking up the top waiter, and the waiter stays 1051 * As we are waking up the top waiter, and the waiter stays
@@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
1064 */ 1057 */
1065 lock->owner = (void *) RT_MUTEX_HAS_WAITERS; 1058 lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
1066 1059
1067 raw_spin_unlock(&current->pi_lock); 1060 /*
1068 1061 * We deboosted before waking the top waiter task such that we don't
1062 * run two tasks with the 'same' priority (and ensure the
1063 * p->pi_top_task pointer points to a blocked task). This however can
1064 * lead to priority inversion if we would get preempted after the
1065 * deboost but before waking our donor task, hence the preempt_disable()
1066 * before unlock.
1067 *
1068 * Pairs with preempt_enable() in rt_mutex_postunlock();
1069 */
1070 preempt_disable();
1069 wake_q_add(wake_q, waiter->task); 1071 wake_q_add(wake_q, waiter->task);
1072 raw_spin_unlock(&current->pi_lock);
1070} 1073}
1071 1074
1072/* 1075/*
@@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock,
1082 struct task_struct *owner = rt_mutex_owner(lock); 1085 struct task_struct *owner = rt_mutex_owner(lock);
1083 struct rt_mutex *next_lock; 1086 struct rt_mutex *next_lock;
1084 1087
1088 lockdep_assert_held(&lock->wait_lock);
1089
1085 raw_spin_lock(&current->pi_lock); 1090 raw_spin_lock(&current->pi_lock);
1086 rt_mutex_dequeue(lock, waiter); 1091 rt_mutex_dequeue(lock, waiter);
1087 current->pi_blocked_on = NULL; 1092 current->pi_blocked_on = NULL;
@@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock,
1101 if (rt_mutex_has_waiters(lock)) 1106 if (rt_mutex_has_waiters(lock))
1102 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); 1107 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
1103 1108
1104 __rt_mutex_adjust_prio(owner); 1109 rt_mutex_adjust_prio(owner);
1105 1110
1106 /* Store the lock on which owner is blocked or NULL */ 1111 /* Store the lock on which owner is blocked or NULL */
1107 next_lock = task_blocked_on_lock(owner); 1112 next_lock = task_blocked_on_lock(owner);
@@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
1140 raw_spin_lock_irqsave(&task->pi_lock, flags); 1145 raw_spin_lock_irqsave(&task->pi_lock, flags);
1141 1146
1142 waiter = task->pi_blocked_on; 1147 waiter = task->pi_blocked_on;
1143 if (!waiter || (waiter->prio == task->prio && 1148 if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
1144 !dl_prio(task->prio))) {
1145 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 1149 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
1146 return; 1150 return;
1147 } 1151 }
@@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task)
1155 next_lock, NULL, task); 1159 next_lock, NULL, task);
1156} 1160}
1157 1161
1162void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
1163{
1164 debug_rt_mutex_init_waiter(waiter);
1165 RB_CLEAR_NODE(&waiter->pi_tree_entry);
1166 RB_CLEAR_NODE(&waiter->tree_entry);
1167 waiter->task = NULL;
1168}
1169
1158/** 1170/**
1159 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop 1171 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
1160 * @lock: the rt_mutex to take 1172 * @lock: the rt_mutex to take
@@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1237 unsigned long flags; 1249 unsigned long flags;
1238 int ret = 0; 1250 int ret = 0;
1239 1251
1240 debug_rt_mutex_init_waiter(&waiter); 1252 rt_mutex_init_waiter(&waiter);
1241 RB_CLEAR_NODE(&waiter.pi_tree_entry);
1242 RB_CLEAR_NODE(&waiter.tree_entry);
1243 1253
1244 /* 1254 /*
1245 * Technically we could use raw_spin_[un]lock_irq() here, but this can 1255 * Technically we could use raw_spin_[un]lock_irq() here, but this can
@@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
1330 1340
1331/* 1341/*
1332 * Slow path to release a rt-mutex. 1342 * Slow path to release a rt-mutex.
1333 * Return whether the current task needs to undo a potential priority boosting. 1343 *
1344 * Return whether the current task needs to call rt_mutex_postunlock().
1334 */ 1345 */
1335static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, 1346static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
1336 struct wake_q_head *wake_q) 1347 struct wake_q_head *wake_q)
@@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
1342 1353
1343 debug_rt_mutex_unlock(lock); 1354 debug_rt_mutex_unlock(lock);
1344 1355
1345 rt_mutex_deadlock_account_unlock(current);
1346
1347 /* 1356 /*
1348 * We must be careful here if the fast path is enabled. If we 1357 * We must be careful here if the fast path is enabled. If we
1349 * have no waiters queued we cannot set owner to NULL here 1358 * have no waiters queued we cannot set owner to NULL here
@@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
1390 * Queue the next waiter for wakeup once we release the wait_lock. 1399 * Queue the next waiter for wakeup once we release the wait_lock.
1391 */ 1400 */
1392 mark_wakeup_next_waiter(wake_q, lock); 1401 mark_wakeup_next_waiter(wake_q, lock);
1393
1394 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 1402 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1395 1403
1396 /* check PI boosting */ 1404 return true; /* call rt_mutex_postunlock() */
1397 return true;
1398} 1405}
1399 1406
1400/* 1407/*
@@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
1409 struct hrtimer_sleeper *timeout, 1416 struct hrtimer_sleeper *timeout,
1410 enum rtmutex_chainwalk chwalk)) 1417 enum rtmutex_chainwalk chwalk))
1411{ 1418{
1412 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { 1419 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1413 rt_mutex_deadlock_account_lock(lock, current);
1414 return 0; 1420 return 0;
1415 } else 1421
1416 return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); 1422 return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
1417} 1423}
1418 1424
1419static inline int 1425static inline int
@@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
1425 enum rtmutex_chainwalk chwalk)) 1431 enum rtmutex_chainwalk chwalk))
1426{ 1432{
1427 if (chwalk == RT_MUTEX_MIN_CHAINWALK && 1433 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
1428 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { 1434 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1429 rt_mutex_deadlock_account_lock(lock, current);
1430 return 0; 1435 return 0;
1431 } else 1436
1432 return slowfn(lock, state, timeout, chwalk); 1437 return slowfn(lock, state, timeout, chwalk);
1433} 1438}
1434 1439
1435static inline int 1440static inline int
1436rt_mutex_fasttrylock(struct rt_mutex *lock, 1441rt_mutex_fasttrylock(struct rt_mutex *lock,
1437 int (*slowfn)(struct rt_mutex *lock)) 1442 int (*slowfn)(struct rt_mutex *lock))
1438{ 1443{
1439 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { 1444 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1440 rt_mutex_deadlock_account_lock(lock, current);
1441 return 1; 1445 return 1;
1442 } 1446
1443 return slowfn(lock); 1447 return slowfn(lock);
1444} 1448}
1445 1449
1450/*
1451 * Performs the wakeup of the top-waiter and re-enables preemption.
1452 */
1453void rt_mutex_postunlock(struct wake_q_head *wake_q)
1454{
1455 wake_up_q(wake_q);
1456
1457 /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
1458 preempt_enable();
1459}
1460
1446static inline void 1461static inline void
1447rt_mutex_fastunlock(struct rt_mutex *lock, 1462rt_mutex_fastunlock(struct rt_mutex *lock,
1448 bool (*slowfn)(struct rt_mutex *lock, 1463 bool (*slowfn)(struct rt_mutex *lock,
@@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
1450{ 1465{
1451 DEFINE_WAKE_Q(wake_q); 1466 DEFINE_WAKE_Q(wake_q);
1452 1467
1453 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { 1468 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
1454 rt_mutex_deadlock_account_unlock(current); 1469 return;
1455
1456 } else {
1457 bool deboost = slowfn(lock, &wake_q);
1458
1459 wake_up_q(&wake_q);
1460 1470
1461 /* Undo pi boosting if necessary: */ 1471 if (slowfn(lock, &wake_q))
1462 if (deboost) 1472 rt_mutex_postunlock(&wake_q);
1463 rt_mutex_adjust_prio(current);
1464 }
1465} 1473}
1466 1474
1467/** 1475/**
@@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
1495EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 1503EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
1496 1504
1497/* 1505/*
1498 * Futex variant with full deadlock detection. 1506 * Futex variant, must not use fastpath.
1499 */ 1507 */
1500int rt_mutex_timed_futex_lock(struct rt_mutex *lock, 1508int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
1501 struct hrtimer_sleeper *timeout)
1502{ 1509{
1503 might_sleep(); 1510 return rt_mutex_slowtrylock(lock);
1504
1505 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
1506 RT_MUTEX_FULL_CHAINWALK,
1507 rt_mutex_slowlock);
1508} 1511}
1509 1512
1510/** 1513/**
@@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
1563EXPORT_SYMBOL_GPL(rt_mutex_unlock); 1566EXPORT_SYMBOL_GPL(rt_mutex_unlock);
1564 1567
1565/** 1568/**
1566 * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock 1569 * Futex variant which, since futex variants do not use the fast-path, can be
1567 * @lock: the rt_mutex to be unlocked 1570 * simple and will not need to retry.
1568 *
1569 * Returns: true/false indicating whether priority adjustment is
1570 * required or not.
1571 */ 1571 */
1572bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, 1572bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
1573 struct wake_q_head *wqh) 1573 struct wake_q_head *wake_q)
1574{ 1574{
1575 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { 1575 lockdep_assert_held(&lock->wait_lock);
1576 rt_mutex_deadlock_account_unlock(current); 1576
1577 return false; 1577 debug_rt_mutex_unlock(lock);
1578
1579 if (!rt_mutex_has_waiters(lock)) {
1580 lock->owner = NULL;
1581 return false; /* done */
1578 } 1582 }
1579 return rt_mutex_slowunlock(lock, wqh); 1583
1584 /*
1585 * We've already deboosted, mark_wakeup_next_waiter() will
1586 * retain preempt_disabled when we drop the wait_lock, to
1587 * avoid inversion prior to the wakeup. preempt_disable()
1588 * therein pairs with rt_mutex_postunlock().
1589 */
1590 mark_wakeup_next_waiter(wake_q, lock);
1591
1592 return true; /* call postunlock() */
1593}
1594
1595void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
1596{
1597 DEFINE_WAKE_Q(wake_q);
1598 bool postunlock;
1599
1600 raw_spin_lock_irq(&lock->wait_lock);
1601 postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
1602 raw_spin_unlock_irq(&lock->wait_lock);
1603
1604 if (postunlock)
1605 rt_mutex_postunlock(&wake_q);
1580} 1606}
1581 1607
1582/** 1608/**
@@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
1637 __rt_mutex_init(lock, NULL); 1663 __rt_mutex_init(lock, NULL);
1638 debug_rt_mutex_proxy_lock(lock, proxy_owner); 1664 debug_rt_mutex_proxy_lock(lock, proxy_owner);
1639 rt_mutex_set_owner(lock, proxy_owner); 1665 rt_mutex_set_owner(lock, proxy_owner);
1640 rt_mutex_deadlock_account_lock(lock, proxy_owner);
1641} 1666}
1642 1667
1643/** 1668/**
@@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1657{ 1682{
1658 debug_rt_mutex_proxy_unlock(lock); 1683 debug_rt_mutex_proxy_unlock(lock);
1659 rt_mutex_set_owner(lock, NULL); 1684 rt_mutex_set_owner(lock, NULL);
1660 rt_mutex_deadlock_account_unlock(proxy_owner);
1661} 1685}
1662 1686
1663/** 1687int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1664 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1665 * @lock: the rt_mutex to take
1666 * @waiter: the pre-initialized rt_mutex_waiter
1667 * @task: the task to prepare
1668 *
1669 * Returns:
1670 * 0 - task blocked on lock
1671 * 1 - acquired the lock for task, caller should wake it up
1672 * <0 - error
1673 *
1674 * Special API call for FUTEX_REQUEUE_PI support.
1675 */
1676int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1677 struct rt_mutex_waiter *waiter, 1688 struct rt_mutex_waiter *waiter,
1678 struct task_struct *task) 1689 struct task_struct *task)
1679{ 1690{
1680 int ret; 1691 int ret;
1681 1692
1682 raw_spin_lock_irq(&lock->wait_lock); 1693 if (try_to_take_rt_mutex(lock, task, NULL))
1683
1684 if (try_to_take_rt_mutex(lock, task, NULL)) {
1685 raw_spin_unlock_irq(&lock->wait_lock);
1686 return 1; 1694 return 1;
1687 }
1688 1695
1689 /* We enforce deadlock detection for futexes */ 1696 /* We enforce deadlock detection for futexes */
1690 ret = task_blocks_on_rt_mutex(lock, waiter, task, 1697 ret = task_blocks_on_rt_mutex(lock, waiter, task,
@@ -1703,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1703 if (unlikely(ret)) 1710 if (unlikely(ret))
1704 remove_waiter(lock, waiter); 1711 remove_waiter(lock, waiter);
1705 1712
1706 raw_spin_unlock_irq(&lock->wait_lock);
1707
1708 debug_rt_mutex_print_deadlock(waiter); 1713 debug_rt_mutex_print_deadlock(waiter);
1709 1714
1710 return ret; 1715 return ret;
1711} 1716}
1712 1717
1713/** 1718/**
1719 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1720 * @lock: the rt_mutex to take
1721 * @waiter: the pre-initialized rt_mutex_waiter
1722 * @task: the task to prepare
1723 *
1724 * Returns:
1725 * 0 - task blocked on lock
1726 * 1 - acquired the lock for task, caller should wake it up
1727 * <0 - error
1728 *
1729 * Special API call for FUTEX_REQUEUE_PI support.
1730 */
1731int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1732 struct rt_mutex_waiter *waiter,
1733 struct task_struct *task)
1734{
1735 int ret;
1736
1737 raw_spin_lock_irq(&lock->wait_lock);
1738 ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
1739 raw_spin_unlock_irq(&lock->wait_lock);
1740
1741 return ret;
1742}
1743
1744/**
1714 * rt_mutex_next_owner - return the next owner of the lock 1745 * rt_mutex_next_owner - return the next owner of the lock
1715 * 1746 *
1716 * @lock: the rt lock query 1747 * @lock: the rt lock query
@@ -1731,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1731} 1762}
1732 1763
1733/** 1764/**
1734 * rt_mutex_finish_proxy_lock() - Complete lock acquisition 1765 * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
1735 * @lock: the rt_mutex we were woken on 1766 * @lock: the rt_mutex we were woken on
1736 * @to: the timeout, null if none. hrtimer should already have 1767 * @to: the timeout, null if none. hrtimer should already have
1737 * been started. 1768 * been started.
1738 * @waiter: the pre-initialized rt_mutex_waiter 1769 * @waiter: the pre-initialized rt_mutex_waiter
1739 * 1770 *
1740 * Complete the lock acquisition started our behalf by another thread. 1771 * Wait for the lock acquisition started on our behalf by
1772 * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
1773 * rt_mutex_cleanup_proxy_lock().
1741 * 1774 *
1742 * Returns: 1775 * Returns:
1743 * 0 - success 1776 * 0 - success
1744 * <0 - error, one of -EINTR, -ETIMEDOUT 1777 * <0 - error, one of -EINTR, -ETIMEDOUT
1745 * 1778 *
1746 * Special API call for PI-futex requeue support 1779 * Special API call for PI-futex support
1747 */ 1780 */
1748int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 1781int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
1749 struct hrtimer_sleeper *to, 1782 struct hrtimer_sleeper *to,
1750 struct rt_mutex_waiter *waiter) 1783 struct rt_mutex_waiter *waiter)
1751{ 1784{
@@ -1758,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1758 /* sleep on the mutex */ 1791 /* sleep on the mutex */
1759 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); 1792 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1760 1793
1761 if (unlikely(ret)) 1794 raw_spin_unlock_irq(&lock->wait_lock);
1795
1796 return ret;
1797}
1798
1799/**
1800 * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
1801 * @lock: the rt_mutex we were woken on
1802 * @waiter: the pre-initialized rt_mutex_waiter
1803 *
1804 * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
1805 *
1806 * Unless we acquired the lock, we're still enqueued on the wait-list and can
1807 * in fact still be granted ownership until we're removed. Therefore we can
1808 * find we are in fact the owner and must disregard the
1809 * rt_mutex_wait_proxy_lock() failure.
1810 *
1811 * Returns:
1812 * true - did the cleanup, we are done.
1813 * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
1814 * caller should disregard its return value.
1815 *
1816 * Special API call for PI-futex support
1817 */
1818bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
1819 struct rt_mutex_waiter *waiter)
1820{
1821 bool cleanup = false;
1822
1823 raw_spin_lock_irq(&lock->wait_lock);
1824 /*
1825 * Unless we're the owner, we're still enqueued on the wait_list.
1826 * So check if we became owner, if not, take us off the wait_list.
1827 */
1828 if (rt_mutex_owner(lock) != current) {
1762 remove_waiter(lock, waiter); 1829 remove_waiter(lock, waiter);
1830 fixup_rt_mutex_waiters(lock);
1831 cleanup = true;
1832 }
1763 1833
1764 /* 1834 /*
1765 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might 1835 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
@@ -1769,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1769 1839
1770 raw_spin_unlock_irq(&lock->wait_lock); 1840 raw_spin_unlock_irq(&lock->wait_lock);
1771 1841
1772 return ret; 1842 return cleanup;
1773} 1843}
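
The ordering change in the rtmutex.c hunks above is easy to lose in the diff noise: waiters now cache both prio and, for deadline tasks, the CBS deadline, and rt_mutex_waiter_less()/rt_mutex_waiter_equal() compare those cached fields instead of chasing waiter->task. The following is a minimal userspace sketch of that comparison, not kernel code; MAX_DL_PRIO, the dl_prio() test and the sample priority values are simplified stand-ins chosen for the illustration.

#include <stdio.h>

/*
 * Userspace sketch only: models how waiters are ordered once prio and
 * deadline are cached in the waiter. MAX_DL_PRIO and dl_prio() are
 * simplified stand-ins for the scheduler's definitions (deadline tasks
 * are assumed to carry a negative prio value here).
 */
#define MAX_DL_PRIO 0

struct waiter {
	int prio;			/* lower value == higher priority */
	unsigned long long deadline;	/* only meaningful for DL waiters */
};

static int dl_prio(int prio)
{
	return prio < MAX_DL_PRIO;
}

static int waiter_less(const struct waiter *left, const struct waiter *right)
{
	if (left->prio < right->prio)
		return 1;
	/* Same prio and both deadline waiters: earlier deadline wins. */
	if (dl_prio(left->prio))
		return left->deadline < right->deadline;
	return 0;
}

static int waiter_equal(const struct waiter *left, const struct waiter *right)
{
	if (left->prio != right->prio)
		return 0;
	if (dl_prio(left->prio))
		return left->deadline == right->deadline;
	return 1;
}

int main(void)
{
	struct waiter a = { .prio = -1, .deadline = 100 };	/* DL waiter */
	struct waiter b = { .prio = -1, .deadline = 200 };	/* DL, later deadline */
	struct waiter c = { .prio = 10, .deadline = 0 };	/* RT waiter */

	printf("a < b: %d, a == b: %d, a < c: %d\n",
	       waiter_less(&a, &b), waiter_equal(&a, &b), waiter_less(&a, &c));
	return 0;
}
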
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index c4060584c407..6607802efa8b 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
11 */ 11 */
12 12
13#define rt_mutex_deadlock_check(l) (0) 13#define rt_mutex_deadlock_check(l) (0)
14#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
15#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
16#define debug_rt_mutex_init_waiter(w) do { } while (0) 14#define debug_rt_mutex_init_waiter(w) do { } while (0)
17#define debug_rt_mutex_free_waiter(w) do { } while (0) 15#define debug_rt_mutex_free_waiter(w) do { } while (0)
18#define debug_rt_mutex_lock(l) do { } while (0) 16#define debug_rt_mutex_lock(l) do { } while (0)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 856dfff5c33a..72ad45a9a794 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -34,6 +34,7 @@ struct rt_mutex_waiter {
34 struct rt_mutex *deadlock_lock; 34 struct rt_mutex *deadlock_lock;
35#endif 35#endif
36 int prio; 36 int prio;
37 u64 deadline;
37}; 38};
38 39
39/* 40/*
@@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
103 struct task_struct *proxy_owner); 104 struct task_struct *proxy_owner);
104extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 105extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
105 struct task_struct *proxy_owner); 106 struct task_struct *proxy_owner);
107extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
108extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
109 struct rt_mutex_waiter *waiter,
110 struct task_struct *task);
106extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, 111extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
107 struct rt_mutex_waiter *waiter, 112 struct rt_mutex_waiter *waiter,
108 struct task_struct *task); 113 struct task_struct *task);
109extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 114extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
110 struct hrtimer_sleeper *to, 115 struct hrtimer_sleeper *to,
111 struct rt_mutex_waiter *waiter); 116 struct rt_mutex_waiter *waiter);
112extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); 117extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
113extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, 118 struct rt_mutex_waiter *waiter);
114 struct wake_q_head *wqh); 119
115extern void rt_mutex_adjust_prio(struct task_struct *task); 120extern int rt_mutex_futex_trylock(struct rt_mutex *l);
121
122extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
123extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
124 struct wake_q_head *wqh);
125
126extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
116 127
117#ifdef CONFIG_DEBUG_RT_MUTEXES 128#ifdef CONFIG_DEBUG_RT_MUTEXES
118# include "rtmutex-debug.h" 129# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 90a74ccd85a4..4d48b1c4870d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write);
124 */ 124 */
125void downgrade_write(struct rw_semaphore *sem) 125void downgrade_write(struct rw_semaphore *sem)
126{ 126{
127 /* 127 lock_downgrade(&sem->dep_map, _RET_IP_);
128 * lockdep: a downgraded write will live on as a write 128
129 * dependency.
130 */
131 rwsem_set_reader_owned(sem); 129 rwsem_set_reader_owned(sem);
132 __downgrade_write(sem); 130 __downgrade_write(sem);
133} 131}
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 6b7abb334ca6..39f56c870051 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus)
353struct stress { 353struct stress {
354 struct work_struct work; 354 struct work_struct work;
355 struct ww_mutex *locks; 355 struct ww_mutex *locks;
356 unsigned long timeout;
356 int nlocks; 357 int nlocks;
357 int nloops;
358}; 358};
359 359
360static int *get_random_order(int count) 360static int *get_random_order(int count)
@@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work)
398 if (!order) 398 if (!order)
399 return; 399 return;
400 400
401 ww_acquire_init(&ctx, &ww_class);
402
403 do { 401 do {
404 int contended = -1; 402 int contended = -1;
405 int n, err; 403 int n, err;
406 404
405 ww_acquire_init(&ctx, &ww_class);
407retry: 406retry:
408 err = 0; 407 err = 0;
409 for (n = 0; n < nlocks; n++) { 408 for (n = 0; n < nlocks; n++) {
@@ -433,9 +432,9 @@ retry:
433 __func__, err); 432 __func__, err);
434 break; 433 break;
435 } 434 }
436 } while (--stress->nloops);
437 435
438 ww_acquire_fini(&ctx); 436 ww_acquire_fini(&ctx);
437 } while (!time_after(jiffies, stress->timeout));
439 438
440 kfree(order); 439 kfree(order);
441 kfree(stress); 440 kfree(stress);
@@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work)
470 kfree(order); 469 kfree(order);
471 order = NULL; 470 order = NULL;
472 471
473 ww_acquire_init(&ctx, &ww_class);
474
475 do { 472 do {
473 ww_acquire_init(&ctx, &ww_class);
474
476 list_for_each_entry(ll, &locks, link) { 475 list_for_each_entry(ll, &locks, link) {
477 err = ww_mutex_lock(ll->lock, &ctx); 476 err = ww_mutex_lock(ll->lock, &ctx);
478 if (!err) 477 if (!err)
@@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work)
495 dummy_load(stress); 494 dummy_load(stress);
496 list_for_each_entry(ll, &locks, link) 495 list_for_each_entry(ll, &locks, link)
497 ww_mutex_unlock(ll->lock); 496 ww_mutex_unlock(ll->lock);
498 } while (--stress->nloops);
499 497
500 ww_acquire_fini(&ctx); 498 ww_acquire_fini(&ctx);
499 } while (!time_after(jiffies, stress->timeout));
501 500
502out: 501out:
503 list_for_each_entry_safe(ll, ln, &locks, link) 502 list_for_each_entry_safe(ll, ln, &locks, link)
@@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work)
523 __func__, err); 522 __func__, err);
524 break; 523 break;
525 } 524 }
526 } while (--stress->nloops); 525 } while (!time_after(jiffies, stress->timeout));
527 526
528 kfree(stress); 527 kfree(stress);
529} 528}
@@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work)
533#define STRESS_ONE BIT(2) 532#define STRESS_ONE BIT(2)
534#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) 533#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
535 534
536static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) 535static int stress(int nlocks, int nthreads, unsigned int flags)
537{ 536{
538 struct ww_mutex *locks; 537 struct ww_mutex *locks;
539 int n; 538 int n;
@@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
575 INIT_WORK(&stress->work, fn); 574 INIT_WORK(&stress->work, fn);
576 stress->locks = locks; 575 stress->locks = locks;
577 stress->nlocks = nlocks; 576 stress->nlocks = nlocks;
578 stress->nloops = nloops; 577 stress->timeout = jiffies + 2*HZ;
579 578
580 queue_work(wq, &stress->work); 579 queue_work(wq, &stress->work);
581 nthreads--; 580 nthreads--;
@@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void)
619 if (ret) 618 if (ret)
620 return ret; 619 return ret;
621 620
622 ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER); 621 ret = stress(16, 2*ncpus, STRESS_INORDER);
623 if (ret) 622 if (ret)
624 return ret; 623 return ret;
625 624
626 ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER); 625 ret = stress(16, 2*ncpus, STRESS_REORDER);
627 if (ret) 626 if (ret)
628 return ret; 627 return ret;
629 628
630 ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); 629 ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
631 if (ret) 630 if (ret)
632 return ret; 631 return ret;
633 632
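
The test-ww_mutex.c change above replaces the fixed nloops counter with a 2*HZ deadline checked via time_after(). Below is a small userspace sketch of that pattern, assuming a fake tick counter in place of jiffies; TICK_HZ and ticks are inventions for the example, and the wrap-safe comparison mirrors the one time_after() is built on.

#include <stdio.h>

/*
 * Userspace sketch (not the kernel's implementation): run until a
 * deadline expires instead of for a fixed number of iterations, using
 * a comparison that stays correct across an unsigned counter wrap.
 */
#define TICK_HZ 100UL

static unsigned long ticks;	/* monotonically increasing, may wrap */

static int time_after(unsigned long a, unsigned long b)
{
	/* True if a is after b, even across an unsigned wrap. */
	return (long)(b - a) < 0;
}

int main(void)
{
	unsigned long timeout = ticks + 2 * TICK_HZ;	/* ~2 "seconds" */
	unsigned long iterations = 0;

	do {
		iterations++;		/* stand-in for one stress pass */
		ticks++;		/* normally advanced by the timer tick */
	} while (!time_after(ticks, timeout));

	printf("ran %lu iterations before the deadline\n", iterations);
	return 0;
}
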
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 07e85e5229da..23a6483c3666 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -182,18 +182,6 @@ struct page_map {
182 struct vmem_altmap altmap; 182 struct vmem_altmap altmap;
183}; 183};
184 184
185void get_zone_device_page(struct page *page)
186{
187 percpu_ref_get(page->pgmap->ref);
188}
189EXPORT_SYMBOL(get_zone_device_page);
190
191void put_zone_device_page(struct page *page)
192{
193 put_dev_pagemap(page->pgmap);
194}
195EXPORT_SYMBOL(put_zone_device_page);
196
197static void pgmap_radix_release(struct resource *res) 185static void pgmap_radix_release(struct resource *res)
198{ 186{
199 resource_size_t key, align_start, align_size, align_end; 187 resource_size_t key, align_start, align_size, align_end;
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
237 struct resource *res = &page_map->res; 225 struct resource *res = &page_map->res;
238 resource_size_t align_start, align_size; 226 resource_size_t align_start, align_size;
239 struct dev_pagemap *pgmap = &page_map->pgmap; 227 struct dev_pagemap *pgmap = &page_map->pgmap;
228 unsigned long pfn;
229
230 for_each_device_pfn(pfn, page_map)
231 put_page(pfn_to_page(pfn));
240 232
241 if (percpu_ref_tryget_live(pgmap->ref)) { 233 if (percpu_ref_tryget_live(pgmap->ref)) {
242 dev_WARN(dev, "%s: page mapping is still live!\n", __func__); 234 dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
277 * 269 *
278 * Notes: 270 * Notes:
279 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time 271 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
280 * (or devm release event). 272 * (or devm release event). The expected order of events is that @ref has
273 * been through percpu_ref_kill() before devm_memremap_pages_release(). The
274 * wait for the completion of all references being dropped and
275 * percpu_ref_exit() must occur after devm_memremap_pages_release().
281 * 276 *
282 * 2/ @res is expected to be a host memory range that could feasibly be 277 * 2/ @res is expected to be a host memory range that could feasibly be
283 * treated as a "System RAM" range, i.e. not a device mmio range, but 278 * treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
379 */ 374 */
380 list_del(&page->lru); 375 list_del(&page->lru);
381 page->pgmap = pgmap; 376 page->pgmap = pgmap;
377 percpu_ref_get(ref);
382 } 378 }
383 devres_add(dev, page_map); 379 devres_add(dev, page_map);
384 return __va(res->start); 380 return __va(res->start);
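
For the memremap.c hunks above, the key idea is that every ZONE_DEVICE page now pins the pagemap at devm_memremap_pages() time, and the release path drops one reference per pfn before tearing the mapping down. Here is a plain-C sketch of that bookkeeping, with a simple counter standing in for the percpu_ref; struct pagemap and its fields are illustrative only, not the kernel's types.

#include <stdio.h>

/*
 * Sketch only: one reference is taken per mapped page at setup time and
 * one dropped per page at release time, on top of the device's own
 * reference, so the mapping cannot go away while any page is in use.
 */
struct pagemap {
	long refs;
	unsigned long nr_pages;
};

static void pagemap_setup(struct pagemap *pgmap, unsigned long nr_pages)
{
	pgmap->nr_pages = nr_pages;
	pgmap->refs = 1;			/* the device's own reference */
	for (unsigned long pfn = 0; pfn < nr_pages; pfn++)
		pgmap->refs++;			/* "percpu_ref_get()" per page */
}

static void pagemap_release(struct pagemap *pgmap)
{
	for (unsigned long pfn = 0; pfn < pgmap->nr_pages; pfn++)
		pgmap->refs--;			/* "put_page()" per page */
	pgmap->refs--;				/* drop the device reference */
}

int main(void)
{
	struct pagemap pgmap;

	pagemap_setup(&pgmap, 4);
	printf("after setup: %ld refs held\n", pgmap.refs);
	pagemap_release(&pgmap);
	printf("after release: %ld refs held (0 means the mapping is dead)\n",
	       pgmap.refs);
	return 0;
}
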
diff --git a/kernel/module.c b/kernel/module.c
index 7eba6dea4f41..4a3665f8f837 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -49,6 +49,9 @@
49#include <linux/rculist.h> 49#include <linux/rculist.h>
50#include <linux/uaccess.h> 50#include <linux/uaccess.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#ifdef CONFIG_STRICT_MODULE_RWX
53#include <asm/set_memory.h>
54#endif
52#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
53#include <linux/license.h> 56#include <linux/license.h>
54#include <asm/sections.h> 57#include <asm/sections.h>
@@ -665,16 +668,7 @@ static void percpu_modcopy(struct module *mod,
665 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); 668 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
666} 669}
667 670
668/** 671bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
669 * is_module_percpu_address - test whether address is from module static percpu
670 * @addr: address to test
671 *
672 * Test whether @addr belongs to module static percpu area.
673 *
674 * RETURNS:
675 * %true if @addr is from module static percpu area
676 */
677bool is_module_percpu_address(unsigned long addr)
678{ 672{
679 struct module *mod; 673 struct module *mod;
680 unsigned int cpu; 674 unsigned int cpu;
@@ -688,9 +682,15 @@ bool is_module_percpu_address(unsigned long addr)
688 continue; 682 continue;
689 for_each_possible_cpu(cpu) { 683 for_each_possible_cpu(cpu) {
690 void *start = per_cpu_ptr(mod->percpu, cpu); 684 void *start = per_cpu_ptr(mod->percpu, cpu);
691 685 void *va = (void *)addr;
692 if ((void *)addr >= start && 686
693 (void *)addr < start + mod->percpu_size) { 687 if (va >= start && va < start + mod->percpu_size) {
688 if (can_addr) {
689 *can_addr = (unsigned long) (va - start);
690 *can_addr += (unsigned long)
691 per_cpu_ptr(mod->percpu,
692 get_boot_cpu_id());
693 }
694 preempt_enable(); 694 preempt_enable();
695 return true; 695 return true;
696 } 696 }
@@ -701,6 +701,20 @@ bool is_module_percpu_address(unsigned long addr)
701 return false; 701 return false;
702} 702}
703 703
704/**
705 * is_module_percpu_address - test whether address is from module static percpu
706 * @addr: address to test
707 *
708 * Test whether @addr belongs to module static percpu area.
709 *
710 * RETURNS:
711 * %true if @addr is from module static percpu area
712 */
713bool is_module_percpu_address(unsigned long addr)
714{
715 return __is_module_percpu_address(addr, NULL);
716}
717
704#else /* ... !CONFIG_SMP */ 718#else /* ... !CONFIG_SMP */
705 719
706static inline void __percpu *mod_percpu(struct module *mod) 720static inline void __percpu *mod_percpu(struct module *mod)
@@ -732,6 +746,11 @@ bool is_module_percpu_address(unsigned long addr)
732 return false; 746 return false;
733} 747}
734 748
749bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
750{
751 return false;
752}
753
735#endif /* CONFIG_SMP */ 754#endif /* CONFIG_SMP */
736 755
737#define MODINFO_ATTR(field) \ 756#define MODINFO_ATTR(field) \
@@ -947,6 +966,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
947 return -EFAULT; 966 return -EFAULT;
948 name[MODULE_NAME_LEN-1] = '\0'; 967 name[MODULE_NAME_LEN-1] = '\0';
949 968
969 audit_log_kern_module(name);
970
950 if (mutex_lock_interruptible(&module_mutex) != 0) 971 if (mutex_lock_interruptible(&module_mutex) != 0)
951 return -EINTR; 972 return -EINTR;
952 973
@@ -2846,7 +2867,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
2846 2867
2847 /* Suck in entire file: we'll want most of it. */ 2868 /* Suck in entire file: we'll want most of it. */
2848 info->hdr = __vmalloc(info->len, 2869 info->hdr = __vmalloc(info->len,
2849 GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL); 2870 GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
2850 if (!info->hdr) 2871 if (!info->hdr)
2851 return -ENOMEM; 2872 return -ENOMEM;
2852 2873
@@ -4017,7 +4038,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
4017 4038
4018 /* Don't lock: we're in enough trouble already. */ 4039 /* Don't lock: we're in enough trouble already. */
4019 preempt_disable(); 4040 preempt_disable();
4020 if ((colon = strchr(name, ':')) != NULL) { 4041 if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
4021 if ((mod = find_module_all(name, colon - name, false)) != NULL) 4042 if ((mod = find_module_all(name, colon - name, false)) != NULL)
4022 ret = mod_find_symname(mod, colon+1); 4043 ret = mod_find_symname(mod, colon+1);
4023 } else { 4044 } else {
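
The new __is_module_percpu_address() above reports, via *can_addr, the address rebased onto the boot CPU's copy of the module's static percpu area, which gives lockdep a canonical key independent of which CPU's copy was passed in. A userspace model of that calculation follows; NR_CPUS, AREA_SIZE, BOOT_CPU and the flat buffers are assumptions made purely for the illustration.

#include <stdio.h>
#include <stdint.h>

/*
 * Userspace model, not kernel code: if @addr falls inside some CPU's
 * copy of the per-CPU area, compute the matching address inside the
 * boot CPU's copy (the "canonical" address).
 */
#define NR_CPUS   4
#define AREA_SIZE 64
#define BOOT_CPU  0

static char percpu_area[NR_CPUS][AREA_SIZE];

static int percpu_to_canonical(const void *addr, uintptr_t *can_addr)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		const char *start = percpu_area[cpu];

		if ((const char *)addr >= start &&
		    (const char *)addr < start + AREA_SIZE) {
			/* offset within this CPU's copy ... */
			uintptr_t off = (const char *)addr - start;
			/* ... rebased onto the boot CPU's copy */
			*can_addr = (uintptr_t)(percpu_area[BOOT_CPU] + off);
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	const void *addr = &percpu_area[2][17];	/* some CPU 2 variable */
	uintptr_t can;

	if (percpu_to_canonical(addr, &can))
		printf("addr %p -> canonical %p\n", addr, (void *)can);
	return 0;
}
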
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e59eed..f6c5d330059a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/syscalls.h> 27#include <linux/syscalls.h>
28#include <linux/cgroup.h> 28#include <linux/cgroup.h>
29#include <linux/perf_event.h>
29 30
30static struct kmem_cache *nsproxy_cachep; 31static struct kmem_cache *nsproxy_cachep;
31 32
@@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
262 goto out; 263 goto out;
263 } 264 }
264 switch_task_namespaces(tsk, new_nsproxy); 265 switch_task_namespaces(tsk, new_nsproxy);
266
267 perf_event_namespaces(tsk);
265out: 268out:
266 fput(file); 269 fput(file);
267 return err; 270 return err;
diff --git a/kernel/padata.c b/kernel/padata.c
index 3202aa17492c..ac8f1e524836 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -154,8 +154,6 @@ EXPORT_SYMBOL(padata_do_parallel);
154 * A pointer to the control struct of the next object that needs 154 * A pointer to the control struct of the next object that needs
155 * serialization, if present in one of the percpu reorder queues. 155 * serialization, if present in one of the percpu reorder queues.
156 * 156 *
157 * NULL, if all percpu reorder queues are empty.
158 *
159 * -EINPROGRESS, if the next object that needs serialization will 157 * -EINPROGRESS, if the next object that needs serialization will
160 * be parallel processed by another cpu and is not yet present in 158 * be parallel processed by another cpu and is not yet present in
161 * the cpu's reorder queue. 159 * the cpu's reorder queue.
@@ -182,8 +180,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
182 cpu = padata_index_to_cpu(pd, next_index); 180 cpu = padata_index_to_cpu(pd, next_index);
183 next_queue = per_cpu_ptr(pd->pqueue, cpu); 181 next_queue = per_cpu_ptr(pd->pqueue, cpu);
184 182
185 padata = NULL;
186
187 reorder = &next_queue->reorder; 183 reorder = &next_queue->reorder;
188 184
189 spin_lock(&reorder->lock); 185 spin_lock(&reorder->lock);
@@ -235,12 +231,11 @@ static void padata_reorder(struct parallel_data *pd)
235 padata = padata_get_next(pd); 231 padata = padata_get_next(pd);
236 232
237 /* 233 /*
238 * All reorder queues are empty, or the next object that needs 234 * If the next object that needs serialization is parallel
239 * serialization is parallel processed by another cpu and is 235 * processed by another cpu and is still on its way to the
240 * still on it's way to the cpu's reorder queue, nothing to 236 * cpu's reorder queue, nothing to do for now.
241 * do for now.
242 */ 237 */
243 if (!padata || PTR_ERR(padata) == -EINPROGRESS) 238 if (PTR_ERR(padata) == -EINPROGRESS)
244 break; 239 break;
245 240
246 /* 241 /*
@@ -354,7 +349,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
354 349
355 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); 350 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
356 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { 351 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
357 free_cpumask_var(pd->cpumask.cbcpu); 352 free_cpumask_var(pd->cpumask.pcpu);
358 return -ENOMEM; 353 return -ENOMEM;
359 } 354 }
360 355
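
After the padata change above, padata_get_next() never returns NULL; callers only ever see a valid object or an error encoded in the pointer, such as -EINPROGRESS, so the reorder loop tests PTR_ERR() directly. Below is a simplified userspace rendition of that convention; the helpers mimic, but are not, the kernel's ERR_PTR()/PTR_ERR()/IS_ERR() macros, and get_next() is a made-up stand-in.

#include <stdio.h>
#include <errno.h>

/*
 * Sketch of the pointer-encoded-error convention: error values live in
 * the top MAX_ERRNO addresses, so a pointer is either a real object or
 * a recognizable error code, never NULL.
 */
#define MAX_ERRNO 4095

static void *ERR_PTR(long error)
{
	return (void *)error;
}

static long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int fake_item = 42;

/* Stand-in for padata_get_next(): sometimes the object is not ready yet. */
static void *get_next(int ready)
{
	return ready ? (void *)&fake_item : ERR_PTR(-EINPROGRESS);
}

int main(void)
{
	for (int ready = 0; ready <= 1; ready++) {
		void *obj = get_next(ready);

		if (IS_ERR(obj) && PTR_ERR(obj) == -EINPROGRESS) {
			printf("not ready yet, try again later\n");
			continue;
		}
		printf("got object: %d\n", *(int *)obj);
	}
	return 0;
}
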
diff --git a/kernel/params.c b/kernel/params.c
index a6d6149c0fe6..60b2d8101355 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -160,58 +160,6 @@ static int parse_one(char *param,
160 return -ENOENT; 160 return -ENOENT;
161} 161}
162 162
163/* You can use " around spaces, but can't escape ". */
164/* Hyphens and underscores equivalent in parameter names. */
165static char *next_arg(char *args, char **param, char **val)
166{
167 unsigned int i, equals = 0;
168 int in_quote = 0, quoted = 0;
169 char *next;
170
171 if (*args == '"') {
172 args++;
173 in_quote = 1;
174 quoted = 1;
175 }
176
177 for (i = 0; args[i]; i++) {
178 if (isspace(args[i]) && !in_quote)
179 break;
180 if (equals == 0) {
181 if (args[i] == '=')
182 equals = i;
183 }
184 if (args[i] == '"')
185 in_quote = !in_quote;
186 }
187
188 *param = args;
189 if (!equals)
190 *val = NULL;
191 else {
192 args[equals] = '\0';
193 *val = args + equals + 1;
194
195 /* Don't include quotes in value. */
196 if (**val == '"') {
197 (*val)++;
198 if (args[i-1] == '"')
199 args[i-1] = '\0';
200 }
201 }
202 if (quoted && args[i-1] == '"')
203 args[i-1] = '\0';
204
205 if (args[i]) {
206 args[i] = '\0';
207 next = args + i + 1;
208 } else
209 next = args + i;
210
211 /* Chew up trailing spaces. */
212 return skip_spaces(next);
213}
214
215/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 163/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
216char *parse_args(const char *doing, 164char *parse_args(const char *doing,
217 char *args, 165 char *args,
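
The next_arg() helper deleted from kernel/params.c above is shown in full by the hunk, so a standalone copy of the same parsing logic can serve as a reference for how quoted values and '=' splitting behave. This is a userspace transcription, with isspace() and an open-coded loop replacing the kernel's helpers and skip_spaces().

#include <ctype.h>
#include <stdio.h>

/*
 * Standalone copy of the removed next_arg() logic: spaces split
 * arguments, double quotes protect spaces, '=' splits param from value.
 */
static char *next_arg(char *args, char **param, char **val)
{
	unsigned int i, equals = 0;
	int in_quote = 0, quoted = 0;

	if (*args == '"') {
		args++;
		in_quote = 1;
		quoted = 1;
	}

	for (i = 0; args[i]; i++) {
		if (isspace((unsigned char)args[i]) && !in_quote)
			break;
		if (equals == 0 && args[i] == '=')
			equals = i;
		if (args[i] == '"')
			in_quote = !in_quote;
	}

	*param = args;
	if (!equals) {
		*val = NULL;
	} else {
		args[equals] = '\0';
		*val = args + equals + 1;
		/* Don't include quotes in the value. */
		if (**val == '"') {
			(*val)++;
			if (args[i - 1] == '"')
				args[i - 1] = '\0';
		}
	}
	if (quoted && args[i - 1] == '"')
		args[i - 1] = '\0';

	if (args[i]) {
		args[i] = '\0';
		args += i + 1;
	} else {
		args += i;
	}

	while (isspace((unsigned char)*args))	/* chew up trailing spaces */
		args++;
	return args;
}

int main(void)
{
	char cmdline[] = "foo=bar \"baz=one two\" flag";
	char *args = cmdline, *param, *val;

	while (*args) {
		args = next_arg(args, &param, &val);
		printf("param=%s val=%s\n", param, val ? val : "(none)");
	}
	return 0;
}
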
diff --git a/kernel/pid.c b/kernel/pid.c
index 0143ac0ddceb..fd1cde1e4576 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -321,8 +321,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
321 } 321 }
322 322
323 if (unlikely(is_child_reaper(pid))) { 323 if (unlikely(is_child_reaper(pid))) {
324 if (pid_ns_prepare_proc(ns)) 324 if (pid_ns_prepare_proc(ns)) {
325 disable_pid_allocation(ns);
325 goto out_free; 326 goto out_free;
327 }
326 } 328 }
327 329
328 get_pid_ns(ns); 330 get_pid_ns(ns);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index de461aa0bf9a..74a5a7255b4d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
277 * if reparented. 277 * if reparented.
278 */ 278 */
279 for (;;) { 279 for (;;) {
280 set_current_state(TASK_UNINTERRUPTIBLE); 280 set_current_state(TASK_INTERRUPTIBLE);
281 if (pid_ns->nr_hashed == init_pids) 281 if (pid_ns->nr_hashed == init_pids)
282 break; 282 break;
283 schedule(); 283 schedule();
@@ -374,6 +374,29 @@ static struct ns_common *pidns_get(struct task_struct *task)
374 return ns ? &ns->ns : NULL; 374 return ns ? &ns->ns : NULL;
375} 375}
376 376
377static struct ns_common *pidns_for_children_get(struct task_struct *task)
378{
379 struct pid_namespace *ns = NULL;
380
381 task_lock(task);
382 if (task->nsproxy) {
383 ns = task->nsproxy->pid_ns_for_children;
384 get_pid_ns(ns);
385 }
386 task_unlock(task);
387
388 if (ns) {
389 read_lock(&tasklist_lock);
390 if (!ns->child_reaper) {
391 put_pid_ns(ns);
392 ns = NULL;
393 }
394 read_unlock(&tasklist_lock);
395 }
396
397 return ns ? &ns->ns : NULL;
398}
399
377static void pidns_put(struct ns_common *ns) 400static void pidns_put(struct ns_common *ns)
378{ 401{
379 put_pid_ns(to_pid_ns(ns)); 402 put_pid_ns(to_pid_ns(ns));
@@ -443,6 +466,17 @@ const struct proc_ns_operations pidns_operations = {
443 .get_parent = pidns_get_parent, 466 .get_parent = pidns_get_parent,
444}; 467};
445 468
469const struct proc_ns_operations pidns_for_children_operations = {
470 .name = "pid_for_children",
471 .real_ns_name = "pid",
472 .type = CLONE_NEWPID,
473 .get = pidns_for_children_get,
474 .put = pidns_put,
475 .install = pidns_install,
476 .owner = pidns_owner,
477 .get_parent = pidns_get_parent,
478};
479
446static __init int pid_namespaces_init(void) 480static __init int pid_namespaces_init(void)
447{ 481{
448 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 482 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index c7209f060eeb..78672d324a6e 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -132,7 +132,7 @@ int freeze_processes(void)
132 if (!pm_freezing) 132 if (!pm_freezing)
133 atomic_inc(&system_freezing_cnt); 133 atomic_inc(&system_freezing_cnt);
134 134
135 pm_wakeup_clear(); 135 pm_wakeup_clear(true);
136 pr_info("Freezing user space processes ... "); 136 pr_info("Freezing user space processes ... ");
137 pm_freezing = true; 137 pm_freezing = true;
138 error = try_to_freeze_tasks(true); 138 error = try_to_freeze_tasks(true);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d79a38de425a..fa46606f3356 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -36,6 +36,9 @@
36#include <asm/pgtable.h> 36#include <asm/pgtable.h>
37#include <asm/tlbflush.h> 37#include <asm/tlbflush.h>
38#include <asm/io.h> 38#include <asm/io.h>
39#ifdef CONFIG_STRICT_KERNEL_RWX
40#include <asm/set_memory.h>
41#endif
39 42
40#include "power.h" 43#include "power.h"
41 44
@@ -1422,7 +1425,7 @@ static unsigned int nr_meta_pages;
1422 * Numbers of normal and highmem page frames allocated for hibernation image 1425 * Numbers of normal and highmem page frames allocated for hibernation image
1423 * before suspending devices. 1426 * before suspending devices.
1424 */ 1427 */
1425unsigned int alloc_normal, alloc_highmem; 1428static unsigned int alloc_normal, alloc_highmem;
1426/* 1429/*
1427 * Memory bitmap used for marking saveable pages (during hibernation) or 1430 * Memory bitmap used for marking saveable pages (during hibernation) or
1428 * hibernation image pages (during restore) 1431 * hibernation image pages (during restore)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 15e6baef5c73..c0248c74d6d4 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -72,6 +72,8 @@ static void freeze_begin(void)
72 72
73static void freeze_enter(void) 73static void freeze_enter(void)
74{ 74{
75 trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
76
75 spin_lock_irq(&suspend_freeze_lock); 77 spin_lock_irq(&suspend_freeze_lock);
76 if (pm_wakeup_pending()) 78 if (pm_wakeup_pending())
77 goto out; 79 goto out;
@@ -98,6 +100,27 @@ static void freeze_enter(void)
98 out: 100 out:
99 suspend_freeze_state = FREEZE_STATE_NONE; 101 suspend_freeze_state = FREEZE_STATE_NONE;
100 spin_unlock_irq(&suspend_freeze_lock); 102 spin_unlock_irq(&suspend_freeze_lock);
103
104 trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
105}
106
107static void s2idle_loop(void)
108{
109 do {
110 freeze_enter();
111
112 if (freeze_ops && freeze_ops->wake)
113 freeze_ops->wake();
114
115 dpm_resume_noirq(PMSG_RESUME);
116 if (freeze_ops && freeze_ops->sync)
117 freeze_ops->sync();
118
119 if (pm_wakeup_pending())
120 break;
121
122 pm_wakeup_clear(false);
123 } while (!dpm_suspend_noirq(PMSG_SUSPEND));
101} 124}
102 125
103void freeze_wake(void) 126void freeze_wake(void)
@@ -371,10 +394,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
371 * all the devices are suspended. 394 * all the devices are suspended.
372 */ 395 */
373 if (state == PM_SUSPEND_FREEZE) { 396 if (state == PM_SUSPEND_FREEZE) {
374 trace_suspend_resume(TPS("machine_suspend"), state, true); 397 s2idle_loop();
375 freeze_enter(); 398 goto Platform_early_resume;
376 trace_suspend_resume(TPS("machine_suspend"), state, false);
377 goto Platform_wake;
378 } 399 }
379 400
380 error = disable_nonboot_cpus(); 401 error = disable_nonboot_cpus();
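
The new s2idle_loop() above keeps re-entering the freeze state until either a genuine wakeup source fires or the noirq suspend phase fails. The following is only a control-flow sketch in plain C with invented fake_*() stubs; it simulates two spurious wakeups followed by a real one and is not the kernel's code.

#include <stdio.h>
#include <stdbool.h>

/*
 * Control-flow sketch: go back to sleep after a wakeup unless a real
 * wakeup source is pending or re-suspending fails.
 */
static int wakeups;

static void fake_enter_idle(void)     { printf("enter idle\n"); wakeups++; }
static bool fake_wakeup_pending(void) { return wakeups >= 3; }
static int  fake_suspend_noirq(void)  { return 0; }	/* 0 == success */

static void s2idle_loop_model(void)
{
	do {
		fake_enter_idle();
		if (fake_wakeup_pending()) {
			printf("real wakeup, leaving the loop\n");
			break;
		}
		printf("spurious wakeup, re-entering idle\n");
	} while (!fake_suspend_noirq());
}

int main(void)
{
	s2idle_loop_model();
	return 0;
}
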
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index d5760c42f042..61d41ca41844 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -2,12 +2,13 @@
2 2
3#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/console.h> 4#include <linux/console.h>
5#include <linux/errno.h>
5#include <linux/string.h> 6#include <linux/string.h>
6 7
7#include "console_cmdline.h" 8#include "console_cmdline.h"
8#include "braille.h" 9#include "braille.h"
9 10
10char *_braille_console_setup(char **str, char **brl_options) 11int _braille_console_setup(char **str, char **brl_options)
11{ 12{
12 if (!strncmp(*str, "brl,", 4)) { 13 if (!strncmp(*str, "brl,", 4)) {
13 *brl_options = ""; 14 *brl_options = "";
@@ -15,14 +16,14 @@ char *_braille_console_setup(char **str, char **brl_options)
15 } else if (!strncmp(*str, "brl=", 4)) { 16 } else if (!strncmp(*str, "brl=", 4)) {
16 *brl_options = *str + 4; 17 *brl_options = *str + 4;
17 *str = strchr(*brl_options, ','); 18 *str = strchr(*brl_options, ',');
18 if (!*str) 19 if (!*str) {
19 pr_err("need port name after brl=\n"); 20 pr_err("need port name after brl=\n");
20 else 21 return -EINVAL;
21 *((*str)++) = 0; 22 }
22 } else 23 *((*str)++) = 0;
23 return NULL; 24 }
24 25
25 return *str; 26 return 0;
26} 27}
27 28
28int 29int
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
index 769d771145c8..749a6756843a 100644
--- a/kernel/printk/braille.h
+++ b/kernel/printk/braille.h
@@ -9,7 +9,14 @@ braille_set_options(struct console_cmdline *c, char *brl_options)
9 c->brl_options = brl_options; 9 c->brl_options = brl_options;
10} 10}
11 11
12char * 12/*
13 * Setup console according to braille options.
14 * Return -EINVAL on syntax error, 0 on success (or no braille option was
15 * actually given).
16 * Modifies str to point to the serial options
17 * Sets brl_options to the parsed braille options.
18 */
19int
13_braille_console_setup(char **str, char **brl_options); 20_braille_console_setup(char **str, char **brl_options);
14 21
15int 22int
@@ -25,10 +32,10 @@ braille_set_options(struct console_cmdline *c, char *brl_options)
25{ 32{
26} 33}
27 34
28static inline char * 35static inline int
29_braille_console_setup(char **str, char **brl_options) 36_braille_console_setup(char **str, char **brl_options)
30{ 37{
31 return NULL; 38 return 0;
32} 39}
33 40
34static inline int 41static inline int
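
The braille setup change in the two files above replaces a pointer-or-NULL return, which conflated "no braille option" with "syntax error", with the 0/-EINVAL convention documented in the new header comment. A minimal userspace illustration of that convention follows; parse_brl_option() is a made-up stand-in, not the kernel function.

#include <stdio.h>
#include <string.h>
#include <errno.h>

/*
 * Sketch only: return 0 on success (including "no braille option") and
 * -EINVAL on a malformed option, so the caller can tell the two apart.
 */
static int parse_brl_option(char **str, char **brl_options)
{
	if (strncmp(*str, "brl=", 4) != 0) {
		*brl_options = NULL;
		return 0;			/* nothing to do, still success */
	}
	*brl_options = *str + 4;
	*str = strchr(*brl_options, ',');
	if (!*str)
		return -EINVAL;			/* missing port name */
	*((*str)++) = '\0';
	return 0;
}

int main(void)
{
	char good[] = "brl=abcd,ttyS0", bad[] = "brl=abcd";
	char *s, *brl;

	s = good;
	printf("good: ret=%d brl=%s rest=%s\n",
	       parse_brl_option(&s, &brl), brl, s);
	s = bad;
	printf("bad:  ret=%d\n", parse_brl_option(&s, &brl));
	return 0;
}
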
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2984fb0f0257..a1aecf44ab07 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -32,7 +32,7 @@
32#include <linux/bootmem.h> 32#include <linux/bootmem.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/crash_core.h>
36#include <linux/kdb.h> 36#include <linux/kdb.h>
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
@@ -269,8 +269,8 @@ static struct console *exclusive_console;
269#define MAX_CMDLINECONSOLES 8 269#define MAX_CMDLINECONSOLES 8
270 270
271static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; 271static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
272static int console_cmdline_cnt;
272 273
273static int selected_console = -1;
274static int preferred_console = -1; 274static int preferred_console = -1;
275int console_set_on_cmdline; 275int console_set_on_cmdline;
276EXPORT_SYMBOL(console_set_on_cmdline); 276EXPORT_SYMBOL(console_set_on_cmdline);
@@ -1002,7 +1002,7 @@ const struct file_operations kmsg_fops = {
1002 .release = devkmsg_release, 1002 .release = devkmsg_release,
1003}; 1003};
1004 1004
1005#ifdef CONFIG_KEXEC_CORE 1005#ifdef CONFIG_CRASH_CORE
1006/* 1006/*
1007 * This appends the listed symbols to /proc/vmcore 1007 * This appends the listed symbols to /proc/vmcore
1008 * 1008 *
@@ -1011,7 +1011,7 @@ const struct file_operations kmsg_fops = {
1011 * symbols are specifically used so that utilities can access and extract the 1011 * symbols are specifically used so that utilities can access and extract the
1012 * dmesg log from a vmcore file after a crash. 1012 * dmesg log from a vmcore file after a crash.
1013 */ 1013 */
1014void log_buf_kexec_setup(void) 1014void log_buf_vmcoreinfo_setup(void)
1015{ 1015{
1016 VMCOREINFO_SYMBOL(log_buf); 1016 VMCOREINFO_SYMBOL(log_buf);
1017 VMCOREINFO_SYMBOL(log_buf_len); 1017 VMCOREINFO_SYMBOL(log_buf_len);
@@ -1906,24 +1906,38 @@ static int __add_preferred_console(char *name, int idx, char *options,
1906 * See if this tty is not yet registered, and 1906 * See if this tty is not yet registered, and
1907 * if we have a slot free. 1907 * if we have a slot free.
1908 */ 1908 */
1909 for (i = 0, c = console_cmdline; 1909 for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) {
1910 i < MAX_CMDLINECONSOLES && c->name[0];
1911 i++, c++) {
1912 if (strcmp(c->name, name) == 0 && c->index == idx) { 1910 if (strcmp(c->name, name) == 0 && c->index == idx) {
1913 if (!brl_options) 1911 if (brl_options)
1914 selected_console = i; 1912 return 0;
1913
1914 /*
 1915 * Maintain an invariant that helps to determine whether
 1916 * the matching console is preferred; see
1917 * register_console():
1918 *
1919 * The last non-braille console is always
1920 * the preferred one.
1921 */
1922 if (i != console_cmdline_cnt - 1)
1923 swap(console_cmdline[i],
1924 console_cmdline[console_cmdline_cnt - 1]);
1925
1926 preferred_console = console_cmdline_cnt - 1;
1927
1915 return 0; 1928 return 0;
1916 } 1929 }
1917 } 1930 }
1918 if (i == MAX_CMDLINECONSOLES) 1931 if (i == MAX_CMDLINECONSOLES)
1919 return -E2BIG; 1932 return -E2BIG;
1920 if (!brl_options) 1933 if (!brl_options)
1921 selected_console = i; 1934 preferred_console = i;
1922 strlcpy(c->name, name, sizeof(c->name)); 1935 strlcpy(c->name, name, sizeof(c->name));
1923 c->options = options; 1936 c->options = options;
1924 braille_set_options(c, brl_options); 1937 braille_set_options(c, brl_options);
1925 1938
1926 c->index = idx; 1939 c->index = idx;
1940 console_cmdline_cnt++;
1927 return 0; 1941 return 0;
1928} 1942}
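
The swap above is easier to follow in isolation. Below is a minimal userspace sketch of the same invariant, not the kernel code: struct cmdline_entry and add_entry() are invented stand-ins for struct console_cmdline and __add_preferred_console(), and braille handling is omitted. Re-adding an existing entry moves it to the tail of the used part of the array, so the last entry is always the preferred one and a reverse scan (as register_console() now does) meets it first.

	#include <stdio.h>
	#include <string.h>

	#define MAX_ENTRIES 8

	struct cmdline_entry {			/* stand-in for struct console_cmdline */
		char name[16];
		int index;
	};

	static struct cmdline_entry entries[MAX_ENTRIES];
	static int entry_cnt;
	static int preferred = -1;

	/* Mimics __add_preferred_console(): a duplicate is swapped to the tail. */
	static int add_entry(const char *name, int index)
	{
		int i;

		for (i = 0; i < entry_cnt; i++) {
			if (!strcmp(entries[i].name, name) && entries[i].index == index) {
				if (i != entry_cnt - 1) {
					struct cmdline_entry tmp = entries[i];

					entries[i] = entries[entry_cnt - 1];
					entries[entry_cnt - 1] = tmp;
				}
				preferred = entry_cnt - 1;
				return 0;
			}
		}
		if (i == MAX_ENTRIES)
			return -1;
		strcpy(entries[i].name, name);
		entries[i].index = index;
		preferred = i;
		entry_cnt++;
		return 0;
	}

	int main(void)
	{
		int i;

		add_entry("pl011", 0);	/* e.g. added from SPCR */
		add_entry("ttyAMA", 0);	/* added from the command line */
		add_entry("pl011", 0);	/* re-added: swapped to the tail, becomes preferred */

		/* Reverse scan, as register_console() now does. */
		for (i = entry_cnt - 1; i >= 0; i--)
			printf("%s%d%s\n", entries[i].name, entries[i].index,
			       i == preferred ? " (preferred)" : "");
		return 0;
	}
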
1929/* 1943/*
@@ -2031,15 +2045,16 @@ void resume_console(void)
2031 * @cpu: unused 2045 * @cpu: unused
2032 * 2046 *
2033 * If printk() is called from a CPU that is not online yet, the messages 2047 * If printk() is called from a CPU that is not online yet, the messages
2034 * will be spooled but will not show up on the console. This function is 2048 * will be printed on the console only if there are CON_ANYTIME consoles.
2035 * called when a new CPU comes online (or fails to come up), and ensures 2049 * This function is called when a new CPU comes online (or fails to come
2036 * that any such output gets printed. 2050 * up) or goes offline.
2037 */ 2051 */
2038static int console_cpu_notify(unsigned int cpu) 2052static int console_cpu_notify(unsigned int cpu)
2039{ 2053{
2040 if (!cpuhp_tasks_frozen) { 2054 if (!cpuhp_tasks_frozen) {
2041 console_lock(); 2055 /* If trylock fails, someone else is doing the printing */
2042 console_unlock(); 2056 if (console_trylock())
2057 console_unlock();
2043 } 2058 }
2044 return 0; 2059 return 0;
2045} 2060}
@@ -2161,7 +2176,7 @@ void console_unlock(void)
2161 } 2176 }
2162 2177
2163 /* 2178 /*
2164 * Console drivers are called under logbuf_lock, so 2179 * Console drivers are called with interrupts disabled, so
2165 * @console_may_schedule should be cleared before; however, we may 2180 * @console_may_schedule should be cleared before; however, we may
2166 * end up dumping a lot of lines, for example, if called from 2181 * end up dumping a lot of lines, for example, if called from
2167 * console registration path, and should invoke cond_resched() 2182 * console registration path, and should invoke cond_resched()
@@ -2169,11 +2184,15 @@ void console_unlock(void)
2169 * scheduling stall on a slow console leading to RCU stall and 2184 * scheduling stall on a slow console leading to RCU stall and
2170 * softlockup warnings which exacerbate the issue with more 2185 * softlockup warnings which exacerbate the issue with more
2171 * messages practically incapacitating the system. 2186 * messages practically incapacitating the system.
2187 *
2188 * console_trylock() is not able to detect the preemptive
2189 * context reliably. Therefore the value must be stored before
 2190 * and cleared after the "again" goto label.
2172 */ 2191 */
2173 do_cond_resched = console_may_schedule; 2192 do_cond_resched = console_may_schedule;
2193again:
2174 console_may_schedule = 0; 2194 console_may_schedule = 0;
2175 2195
2176again:
2177 /* 2196 /*
2178 * We released the console_sem lock, so we need to recheck if 2197 * We released the console_sem lock, so we need to recheck if
2179 * cpu is online and (if not) is there at least one CON_ANYTIME 2198 * cpu is online and (if not) is there at least one CON_ANYTIME
@@ -2409,6 +2428,7 @@ void register_console(struct console *newcon)
2409 unsigned long flags; 2428 unsigned long flags;
2410 struct console *bcon = NULL; 2429 struct console *bcon = NULL;
2411 struct console_cmdline *c; 2430 struct console_cmdline *c;
2431 static bool has_preferred;
2412 2432
2413 if (console_drivers) 2433 if (console_drivers)
2414 for_each_console(bcon) 2434 for_each_console(bcon)
@@ -2435,15 +2455,15 @@ void register_console(struct console *newcon)
2435 if (console_drivers && console_drivers->flags & CON_BOOT) 2455 if (console_drivers && console_drivers->flags & CON_BOOT)
2436 bcon = console_drivers; 2456 bcon = console_drivers;
2437 2457
2438 if (preferred_console < 0 || bcon || !console_drivers) 2458 if (!has_preferred || bcon || !console_drivers)
2439 preferred_console = selected_console; 2459 has_preferred = preferred_console >= 0;
2440 2460
2441 /* 2461 /*
2442 * See if we want to use this console driver. If we 2462 * See if we want to use this console driver. If we
2443 * didn't select a console we take the first one 2463 * didn't select a console we take the first one
2444 * that registers here. 2464 * that registers here.
2445 */ 2465 */
2446 if (preferred_console < 0) { 2466 if (!has_preferred) {
2447 if (newcon->index < 0) 2467 if (newcon->index < 0)
2448 newcon->index = 0; 2468 newcon->index = 0;
2449 if (newcon->setup == NULL || 2469 if (newcon->setup == NULL ||
@@ -2451,18 +2471,29 @@ void register_console(struct console *newcon)
2451 newcon->flags |= CON_ENABLED; 2471 newcon->flags |= CON_ENABLED;
2452 if (newcon->device) { 2472 if (newcon->device) {
2453 newcon->flags |= CON_CONSDEV; 2473 newcon->flags |= CON_CONSDEV;
2454 preferred_console = 0; 2474 has_preferred = true;
2455 } 2475 }
2456 } 2476 }
2457 } 2477 }
2458 2478
2459 /* 2479 /*
2460 * See if this console matches one we selected on 2480 * See if this console matches one we selected on the command line.
2461 * the command line. 2481 *
2482 * There may be several entries in the console_cmdline array matching
 2483 * the same console, one with newcon->match(), another by
2484 * name/index:
2485 *
2486 * pl011,mmio,0x87e024000000,115200 -- added from SPCR
2487 * ttyAMA0 -- added from command line
2488 *
2489 * Traverse the console_cmdline array in reverse order to be
2490 * sure that if this console is preferred then it will be the first
2491 * matching entry. We use the invariant that is maintained in
2492 * __add_preferred_console().
2462 */ 2493 */
2463 for (i = 0, c = console_cmdline; 2494 for (i = console_cmdline_cnt - 1; i >= 0; i--) {
2464 i < MAX_CMDLINECONSOLES && c->name[0]; 2495 c = console_cmdline + i;
2465 i++, c++) { 2496
2466 if (!newcon->match || 2497 if (!newcon->match ||
2467 newcon->match(newcon, c->name, c->index, c->options) != 0) { 2498 newcon->match(newcon, c->name, c->index, c->options) != 0) {
2468 /* default matching */ 2499 /* default matching */
@@ -2484,9 +2515,9 @@ void register_console(struct console *newcon)
2484 } 2515 }
2485 2516
2486 newcon->flags |= CON_ENABLED; 2517 newcon->flags |= CON_ENABLED;
2487 if (i == selected_console) { 2518 if (i == preferred_console) {
2488 newcon->flags |= CON_CONSDEV; 2519 newcon->flags |= CON_CONSDEV;
2489 preferred_console = selected_console; 2520 has_preferred = true;
2490 } 2521 }
2491 break; 2522 break;
2492 } 2523 }
@@ -2611,6 +2642,30 @@ int unregister_console(struct console *console)
2611EXPORT_SYMBOL(unregister_console); 2642EXPORT_SYMBOL(unregister_console);
2612 2643
2613/* 2644/*
2645 * Initialize the console device. This is called *early*, so
2646 * we can't necessarily depend on lots of kernel help here.
2647 * Just do some early initializations, and do the complex setup
2648 * later.
2649 */
2650void __init console_init(void)
2651{
2652 initcall_t *call;
2653
2654 /* Setup the default TTY line discipline. */
2655 n_tty_init();
2656
2657 /*
2658 * set up the console device so that later boot sequences can
2659 * inform about problems etc..
2660 */
2661 call = __con_initcall_start;
2662 while (call < __con_initcall_end) {
2663 (*call)();
2664 call++;
2665 }
2666}
2667
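
The __con_initcall_start/__con_initcall_end walk in console_init() is just an iteration over a linker-provided array of function pointers. A hedged userspace model follows, with an ordinary array standing in for the linker section and made-up console names; the kernel's initcall_t actually returns int, which is simplified away here.

	#include <stdio.h>

	typedef void (*initcall_t)(void);	/* simplified: the kernel's initcall_t returns int */

	static void first_console_init(void)  { puts("first console ready"); }
	static void second_console_init(void) { puts("second console ready"); }

	/* Stand-in for the __con_initcall_start..__con_initcall_end linker section. */
	static initcall_t con_initcalls[] = { first_console_init, second_console_init };

	int main(void)
	{
		initcall_t *call = con_initcalls;
		initcall_t *end = con_initcalls +
				  sizeof(con_initcalls) / sizeof(con_initcalls[0]);

		while (call < end) {		/* same loop shape as console_init() */
			(*call)();
			call++;
		}
		return 0;
	}
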
2668/*
2614 * Some boot consoles access data that is in the init section and which will 2669 * Some boot consoles access data that is in the init section and which will
2615 * be discarded after the initcalls have been run. To make sure that no code 2670 * be discarded after the initcalls have been run. To make sure that no code
2616 * will access this data, unregister the boot consoles in a late initcall. 2671 * will access this data, unregister the boot consoles in a late initcall.
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 18dfc485225c..23803c7d5180 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,10 +3,13 @@
3KCOV_INSTRUMENT := n 3KCOV_INSTRUMENT := n
4 4
5obj-y += update.o sync.o 5obj-y += update.o sync.o
6obj-$(CONFIG_SRCU) += srcu.o 6obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
7obj-$(CONFIG_TREE_SRCU) += srcutree.o
8obj-$(CONFIG_TINY_SRCU) += srcutiny.o
7obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 9obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
8obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o 10obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
9obj-$(CONFIG_TREE_RCU) += tree.o 11obj-$(CONFIG_TREE_RCU) += tree.o
10obj-$(CONFIG_PREEMPT_RCU) += tree.o 12obj-$(CONFIG_PREEMPT_RCU) += tree.o
11obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o 13obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
12obj-$(CONFIG_TINY_RCU) += tiny.o 14obj-$(CONFIG_TINY_RCU) += tiny.o
15obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0d6ff3e471be..73e16ec4054b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -56,6 +56,83 @@
56#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ 56#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
57 DYNTICK_TASK_FLAG) 57 DYNTICK_TASK_FLAG)
58 58
59
60/*
61 * Grace-period counter management.
62 */
63
64#define RCU_SEQ_CTR_SHIFT 2
65#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
66
67/*
68 * Return the counter portion of a sequence number previously returned
69 * by rcu_seq_snap() or rcu_seq_current().
70 */
71static inline unsigned long rcu_seq_ctr(unsigned long s)
72{
73 return s >> RCU_SEQ_CTR_SHIFT;
74}
75
76/*
77 * Return the state portion of a sequence number previously returned
78 * by rcu_seq_snap() or rcu_seq_current().
79 */
80static inline int rcu_seq_state(unsigned long s)
81{
82 return s & RCU_SEQ_STATE_MASK;
83}
84
85/*
86 * Set the state portion of the pointed-to sequence number.
87 * The caller is responsible for preventing conflicting updates.
88 */
89static inline void rcu_seq_set_state(unsigned long *sp, int newstate)
90{
91 WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK);
92 WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate);
93}
94
95/* Adjust sequence number for start of update-side operation. */
96static inline void rcu_seq_start(unsigned long *sp)
97{
98 WRITE_ONCE(*sp, *sp + 1);
99 smp_mb(); /* Ensure update-side operation after counter increment. */
100 WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
101}
102
103/* Adjust sequence number for end of update-side operation. */
104static inline void rcu_seq_end(unsigned long *sp)
105{
106 smp_mb(); /* Ensure update-side operation before counter increment. */
107 WARN_ON_ONCE(!rcu_seq_state(*sp));
108 WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
109}
110
111/* Take a snapshot of the update side's sequence number. */
112static inline unsigned long rcu_seq_snap(unsigned long *sp)
113{
114 unsigned long s;
115
116 s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
117 smp_mb(); /* Above access must not bleed into critical section. */
118 return s;
119}
120
 121/* Return the current value of the update side's sequence number, no ordering. */
122static inline unsigned long rcu_seq_current(unsigned long *sp)
123{
124 return READ_ONCE(*sp);
125}
126
127/*
128 * Given a snapshot from rcu_seq_snap(), determine whether or not a
129 * full update-side operation has occurred.
130 */
131static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
132{
133 return ULONG_CMP_GE(READ_ONCE(*sp), s);
134}
135
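
The layout is: the low RCU_SEQ_CTR_SHIFT bits of the sequence number hold the phase (0 = idle, 1 = grace period in progress), the remaining bits count grace periods. The plain userspace model below reproduces only the arithmetic; smp_mb(), WRITE_ONCE() and the WARN_ON_ONCE() checks are dropped, and wraparound is ignored, so this sketches the number manipulation rather than the ordering guarantees. It shows why a snapshot taken while a grace period is already running is not satisfied by that grace period's end: the running one may predate the caller's updates, so the next one is required as well.

	#include <stdio.h>

	#define SEQ_CTR_SHIFT	2
	#define SEQ_STATE_MASK	((1UL << SEQ_CTR_SHIFT) - 1)

	/* Same rounding as rcu_seq_snap(), minus the memory barrier. */
	static unsigned long seq_snap(unsigned long s)
	{
		return (s + 2 * SEQ_STATE_MASK + 1) & ~SEQ_STATE_MASK;
	}

	int main(void)
	{
		unsigned long gp_seq = 0;
		unsigned long idle_snap, busy_snap;

		idle_snap = seq_snap(gp_seq);		/* no GP in flight when snapped */

		gp_seq++;				/* rcu_seq_start(): state bits -> 1 */
		busy_snap = seq_snap(gp_seq);		/* snapped while a GP is in flight */
		gp_seq = (gp_seq | SEQ_STATE_MASK) + 1;	/* rcu_seq_end(): state bits -> 0 */

		/* rcu_seq_done(), ignoring wraparound for this sketch. */
		printf("gp_seq=%lu idle_snap=%lu done=%d\n",
		       gp_seq, idle_snap, gp_seq >= idle_snap);
		printf("gp_seq=%lu busy_snap=%lu done=%d\n",
		       gp_seq, busy_snap, gp_seq >= busy_snap);
		return 0;
	}
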
59/* 136/*
60 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 137 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
61 * by call_rcu() and rcu callback execution, and are therefore not part of the 138 * by call_rcu() and rcu callback execution, and are therefore not part of the
@@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
109 186
110 rcu_lock_acquire(&rcu_callback_map); 187 rcu_lock_acquire(&rcu_callback_map);
111 if (__is_kfree_rcu_offset(offset)) { 188 if (__is_kfree_rcu_offset(offset)) {
112 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 189 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);)
113 kfree((void *)head - offset); 190 kfree((void *)head - offset);
114 rcu_lock_release(&rcu_callback_map); 191 rcu_lock_release(&rcu_callback_map);
115 return true; 192 return true;
116 } else { 193 } else {
117 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 194 RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
118 head->func(head); 195 head->func(head);
119 rcu_lock_release(&rcu_callback_map); 196 rcu_lock_release(&rcu_callback_map);
120 return false; 197 return false;
@@ -144,4 +221,76 @@ void rcu_test_sync_prims(void);
144 */ 221 */
145extern void resched_cpu(int cpu); 222extern void resched_cpu(int cpu);
146 223
224#if defined(SRCU) || !defined(TINY_RCU)
225
226#include <linux/rcu_node_tree.h>
227
228extern int rcu_num_lvls;
229extern int num_rcu_lvl[];
230extern int rcu_num_nodes;
231static bool rcu_fanout_exact;
232static int rcu_fanout_leaf;
233
234/*
235 * Compute the per-level fanout, either using the exact fanout specified
236 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
237 */
238static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
239{
240 int i;
241
242 if (rcu_fanout_exact) {
243 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
244 for (i = rcu_num_lvls - 2; i >= 0; i--)
245 levelspread[i] = RCU_FANOUT;
246 } else {
247 int ccur;
248 int cprv;
249
250 cprv = nr_cpu_ids;
251 for (i = rcu_num_lvls - 1; i >= 0; i--) {
252 ccur = levelcnt[i];
253 levelspread[i] = (cprv + ccur - 1) / ccur;
254 cprv = ccur;
255 }
256 }
257}
258
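
The balancing branch is easier to see with concrete numbers. The userspace sketch below reproduces the same division for an assumed three-level tree with levelcnt = {1, 4, 64} and 256 possible CPUs; these values are made up purely for illustration and are not taken from any particular configuration.

	#include <stdio.h>

	int main(void)
	{
		int levelcnt[] = { 1, 4, 64 };	/* nodes per level, root first (assumed) */
		int levelspread[3];
		int num_lvls = 3;
		int nr_cpu_ids = 256;		/* assumed number of possible CPUs */
		int i, ccur, cprv;

		/* Same computation as the !rcu_fanout_exact branch above. */
		cprv = nr_cpu_ids;
		for (i = num_lvls - 1; i >= 0; i--) {
			ccur = levelcnt[i];
			levelspread[i] = (cprv + ccur - 1) / ccur;
			cprv = ccur;
		}
		for (i = 0; i < num_lvls; i++)
			printf("level %d: %d nodes, spread %d\n",
			       i, levelcnt[i], levelspread[i]);
		return 0;
	}

With these numbers each leaf covers 4 CPUs, each interior node covers 16 leaves, and the root covers the 4 interior nodes.
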
259/*
260 * Do a full breadth-first scan of the rcu_node structures for the
261 * specified rcu_state structure.
262 */
263#define rcu_for_each_node_breadth_first(rsp, rnp) \
264 for ((rnp) = &(rsp)->node[0]; \
265 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
266
267/*
268 * Do a breadth-first scan of the non-leaf rcu_node structures for the
269 * specified rcu_state structure. Note that if there is a singleton
270 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
271 */
272#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
273 for ((rnp) = &(rsp)->node[0]; \
274 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
275
276/*
277 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
278 * structure. Note that if there is a singleton rcu_node tree with but
279 * one rcu_node structure, this loop -will- visit the rcu_node structure.
280 * It is still a leaf node, even if it is also the root node.
281 */
282#define rcu_for_each_leaf_node(rsp, rnp) \
283 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
284 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
285
286/*
287 * Iterate over all possible CPUs in a leaf RCU node.
288 */
289#define for_each_leaf_node_possible_cpu(rnp, cpu) \
290 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
291 cpu <= rnp->grphi; \
292 cpu = cpumask_next((cpu), cpu_possible_mask))
293
294#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
295
147#endif /* __LINUX_RCU_H */ 296#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
new file mode 100644
index 000000000000..2b62a38b080f
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.c
@@ -0,0 +1,505 @@
1/*
2 * RCU segmented callback lists, function definitions
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2017
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/interrupt.h>
26
27#include "rcu_segcblist.h"
28
29/* Initialize simple callback list. */
30void rcu_cblist_init(struct rcu_cblist *rclp)
31{
32 rclp->head = NULL;
33 rclp->tail = &rclp->head;
34 rclp->len = 0;
35 rclp->len_lazy = 0;
36}
37
38/*
39 * Debug function to actually count the number of callbacks.
40 * If the number exceeds the limit specified, return -1.
41 */
42long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
43{
44 int cnt = 0;
45 struct rcu_head **rhpp = &rclp->head;
46
47 for (;;) {
48 if (!*rhpp)
49 return cnt;
50 if (++cnt > lim)
51 return -1;
52 rhpp = &(*rhpp)->next;
53 }
54}
55
56/*
57 * Dequeue the oldest rcu_head structure from the specified callback
58 * list. This function assumes that the callback is non-lazy, but
59 * the caller can later invoke rcu_cblist_dequeued_lazy() if it
60 * finds otherwise (and if it cares about laziness). This allows
61 * different users to have different ways of determining laziness.
62 */
63struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
64{
65 struct rcu_head *rhp;
66
67 rhp = rclp->head;
68 if (!rhp)
69 return NULL;
70 rclp->len--;
71 rclp->head = rhp->next;
72 if (!rclp->head)
73 rclp->tail = &rclp->head;
74 return rhp;
75}
76
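
The ->tail field is a pointer to the ->next pointer of the last element, or to ->head when the list is empty, which is why rcu_cblist_init() sets tail = &head and rcu_cblist_dequeue() resets it the same way when the list drains. A self-contained userspace sketch of the idiom follows; struct cb, struct cblist and cblist_enqueue() are invented analogs (the kernel's enqueue helpers live elsewhere), shown only to make the O(1), branch-free append visible.

	#include <stdio.h>
	#include <stddef.h>

	struct cb {
		struct cb *next;
		int id;
	};

	struct cblist {
		struct cb *head;
		struct cb **tail;	/* points at ->head or at the last ->next */
		long len;
	};

	static void cblist_init(struct cblist *l)
	{
		l->head = NULL;
		l->tail = &l->head;
		l->len = 0;
	}

	static void cblist_enqueue(struct cblist *l, struct cb *c)
	{
		c->next = NULL;
		*l->tail = c;		/* works for empty and non-empty lists alike */
		l->tail = &c->next;
		l->len++;
	}

	static struct cb *cblist_dequeue(struct cblist *l)
	{
		struct cb *c = l->head;

		if (!c)
			return NULL;
		l->len--;
		l->head = c->next;
		if (!l->head)
			l->tail = &l->head;	/* list went empty: reset tail */
		return c;
	}

	int main(void)
	{
		struct cblist list;
		struct cb a = { .id = 1 }, b = { .id = 2 };
		struct cb *c;

		cblist_init(&list);
		cblist_enqueue(&list, &a);
		cblist_enqueue(&list, &b);
		while ((c = cblist_dequeue(&list)) != NULL)
			printf("dequeued %d\n", c->id);
		return 0;
	}
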
77/*
78 * Initialize an rcu_segcblist structure.
79 */
80void rcu_segcblist_init(struct rcu_segcblist *rsclp)
81{
82 int i;
83
84 BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
85 BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
86 rsclp->head = NULL;
87 for (i = 0; i < RCU_CBLIST_NSEGS; i++)
88 rsclp->tails[i] = &rsclp->head;
89 rsclp->len = 0;
90 rsclp->len_lazy = 0;
91}
92
93/*
94 * Disable the specified rcu_segcblist structure, so that callbacks can
95 * no longer be posted to it. This structure must be empty.
96 */
97void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
98{
99 WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
100 WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
101 WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
102 rsclp->tails[RCU_NEXT_TAIL] = NULL;
103}
104
105/*
106 * Is the specified segment of the specified rcu_segcblist structure
107 * empty of callbacks?
108 */
109bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
110{
111 if (seg == RCU_DONE_TAIL)
112 return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
113 return rsclp->tails[seg - 1] == rsclp->tails[seg];
114}
115
116/*
117 * Does the specified rcu_segcblist structure contain callbacks that
118 * are ready to be invoked?
119 */
120bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
121{
122 return rcu_segcblist_is_enabled(rsclp) &&
123 &rsclp->head != rsclp->tails[RCU_DONE_TAIL];
124}
125
126/*
127 * Does the specified rcu_segcblist structure contain callbacks that
128 * are still pending, that is, not yet ready to be invoked?
129 */
130bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
131{
132 return rcu_segcblist_is_enabled(rsclp) &&
133 !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL);
134}
135
136/*
137 * Dequeue and return the first ready-to-invoke callback. If there
138 * are no ready-to-invoke callbacks, return NULL. Disables interrupts
139 * to avoid interference. Does not protect from interference from other
140 * CPUs or tasks.
141 */
142struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
143{
144 unsigned long flags;
145 int i;
146 struct rcu_head *rhp;
147
148 local_irq_save(flags);
149 if (!rcu_segcblist_ready_cbs(rsclp)) {
150 local_irq_restore(flags);
151 return NULL;
152 }
153 rhp = rsclp->head;
154 BUG_ON(!rhp);
155 rsclp->head = rhp->next;
156 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
157 if (rsclp->tails[i] != &rhp->next)
158 break;
159 rsclp->tails[i] = &rsclp->head;
160 }
161 smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
162 WRITE_ONCE(rsclp->len, rsclp->len - 1);
163 local_irq_restore(flags);
164 return rhp;
165}
166
167/*
168 * Account for the fact that a previously dequeued callback turned out
169 * to be marked as lazy.
170 */
171void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
172{
173 unsigned long flags;
174
175 local_irq_save(flags);
176 rsclp->len_lazy--;
177 local_irq_restore(flags);
178}
179
180/*
181 * Return a pointer to the first callback in the specified rcu_segcblist
182 * structure. This is useful for diagnostics.
183 */
184struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp)
185{
186 if (rcu_segcblist_is_enabled(rsclp))
187 return rsclp->head;
188 return NULL;
189}
190
191/*
192 * Return a pointer to the first pending callback in the specified
193 * rcu_segcblist structure. This is useful just after posting a given
194 * callback -- if that callback is the first pending callback, then
195 * you cannot rely on someone else having already started up the required
196 * grace period.
197 */
198struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
199{
200 if (rcu_segcblist_is_enabled(rsclp))
201 return *rsclp->tails[RCU_DONE_TAIL];
202 return NULL;
203}
204
205/*
206 * Does the specified rcu_segcblist structure contain callbacks that
207 * have not yet been processed beyond having been posted, that is,
208 * does it contain callbacks in its last segment?
209 */
210bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
211{
212 return rcu_segcblist_is_enabled(rsclp) &&
213 !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
214}
215
216/*
217 * Enqueue the specified callback onto the specified rcu_segcblist
218 * structure, updating accounting as needed. Note that the ->len
219 * field may be accessed locklessly, hence the WRITE_ONCE().
220 * The ->len field is used by rcu_barrier() and friends to determine
221 * if it must post a callback on this structure, and it is OK
222 * for rcu_barrier() to sometimes post callbacks needlessly, but
223 * absolutely not OK for it to ever miss posting a callback.
224 */
225void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
226 struct rcu_head *rhp, bool lazy)
227{
228 WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
229 if (lazy)
230 rsclp->len_lazy++;
231 smp_mb(); /* Ensure counts are updated before callback is enqueued. */
232 rhp->next = NULL;
233 *rsclp->tails[RCU_NEXT_TAIL] = rhp;
234 rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
235}
236
237/*
238 * Entrain the specified callback onto the specified rcu_segcblist at
239 * the end of the last non-empty segment. If the entire rcu_segcblist
240 * is empty, make no change, but return false.
241 *
242 * This is intended for use by rcu_barrier()-like primitives, -not-
243 * for normal grace-period use. IMPORTANT: The callback you enqueue
244 * will wait for all prior callbacks, NOT necessarily for a grace
245 * period. You have been warned.
246 */
247bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
248 struct rcu_head *rhp, bool lazy)
249{
250 int i;
251
252 if (rcu_segcblist_n_cbs(rsclp) == 0)
253 return false;
254 WRITE_ONCE(rsclp->len, rsclp->len + 1);
255 if (lazy)
256 rsclp->len_lazy++;
257 smp_mb(); /* Ensure counts are updated before callback is entrained. */
258 rhp->next = NULL;
259 for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
260 if (rsclp->tails[i] != rsclp->tails[i - 1])
261 break;
262 *rsclp->tails[i] = rhp;
263 for (; i <= RCU_NEXT_TAIL; i++)
264 rsclp->tails[i] = &rhp->next;
265 return true;
266}
267
268/*
269 * Extract only the counts from the specified rcu_segcblist structure,
270 * and place them in the specified rcu_cblist structure. This function
271 * supports both callback orphaning and invocation, hence the separation
272 * of counts and callbacks. (Callbacks ready for invocation must be
273 * orphaned and adopted separately from pending callbacks, but counts
274 * apply to all callbacks. Locking must be used to make sure that
275 * both orphaned-callbacks lists are consistent.)
276 */
277void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
278 struct rcu_cblist *rclp)
279{
280 rclp->len_lazy += rsclp->len_lazy;
281 rclp->len += rsclp->len;
282 rsclp->len_lazy = 0;
283 WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
284}
285
286/*
287 * Extract only those callbacks ready to be invoked from the specified
288 * rcu_segcblist structure and place them in the specified rcu_cblist
289 * structure.
290 */
291void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
292 struct rcu_cblist *rclp)
293{
294 int i;
295
296 if (!rcu_segcblist_ready_cbs(rsclp))
297 return; /* Nothing to do. */
298 *rclp->tail = rsclp->head;
299 rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
300 *rsclp->tails[RCU_DONE_TAIL] = NULL;
301 rclp->tail = rsclp->tails[RCU_DONE_TAIL];
302 for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
303 if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
304 rsclp->tails[i] = &rsclp->head;
305}
306
307/*
308 * Extract only those callbacks still pending (not yet ready to be
309 * invoked) from the specified rcu_segcblist structure and place them in
310 * the specified rcu_cblist structure. Note that this loses information
311 * about any callbacks that might have been partway done waiting for
312 * their grace period. Too bad! They will have to start over.
313 */
314void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
315 struct rcu_cblist *rclp)
316{
317 int i;
318
319 if (!rcu_segcblist_pend_cbs(rsclp))
320 return; /* Nothing to do. */
321 *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
322 rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
323 *rsclp->tails[RCU_DONE_TAIL] = NULL;
324 for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
325 rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
326}
327
328/*
329 * Insert counts from the specified rcu_cblist structure in the
330 * specified rcu_segcblist structure.
331 */
332void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
333 struct rcu_cblist *rclp)
334{
335 rsclp->len_lazy += rclp->len_lazy;
336 /* ->len sampled locklessly. */
337 WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
338 rclp->len_lazy = 0;
339 rclp->len = 0;
340}
341
342/*
343 * Move callbacks from the specified rcu_cblist to the beginning of the
344 * done-callbacks segment of the specified rcu_segcblist.
345 */
346void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
347 struct rcu_cblist *rclp)
348{
349 int i;
350
351 if (!rclp->head)
352 return; /* No callbacks to move. */
353 *rclp->tail = rsclp->head;
354 rsclp->head = rclp->head;
355 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
356 if (&rsclp->head == rsclp->tails[i])
357 rsclp->tails[i] = rclp->tail;
358 else
359 break;
360 rclp->head = NULL;
361 rclp->tail = &rclp->head;
362}
363
364/*
365 * Move callbacks from the specified rcu_cblist to the end of the
366 * new-callbacks segment of the specified rcu_segcblist.
367 */
368void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
369 struct rcu_cblist *rclp)
370{
371 if (!rclp->head)
372 return; /* Nothing to do. */
373 *rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
374 rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
375 rclp->head = NULL;
376 rclp->tail = &rclp->head;
377}
378
379/*
380 * Advance the callbacks in the specified rcu_segcblist structure based
381 * on the current value passed in for the grace-period counter.
382 */
383void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
384{
385 int i, j;
386
387 WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
388 if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
389 return;
390
391 /*
392 * Find all callbacks whose ->gp_seq numbers indicate that they
393 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
394 */
395 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
396 if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
397 break;
398 rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
399 }
400
401 /* If no callbacks moved, nothing more need be done. */
402 if (i == RCU_WAIT_TAIL)
403 return;
404
405 /* Clean up tail pointers that might have been misordered above. */
406 for (j = RCU_WAIT_TAIL; j < i; j++)
407 rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
408
409 /*
410 * Callbacks moved, so clean up the misordered ->tails[] pointers
411 * that now point into the middle of the list of ready-to-invoke
412 * callbacks. The overall effect is to copy down the later pointers
413 * into the gap that was created by the now-ready segments.
414 */
415 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
416 if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
417 break; /* No more callbacks. */
418 rsclp->tails[j] = rsclp->tails[i];
419 rsclp->gp_seq[j] = rsclp->gp_seq[i];
420 }
421}
422
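
The pointer shuffling above amounts to sliding the RCU_DONE_TAIL boundary forward past every segment whose ->gp_seq has already elapsed. Below is a deliberately simplified userspace model that tracks only per-segment callback counts and grace-period numbers, with invented names; it ignores the tail-pointer bookkeeping, the merge of the RCU_NEXT_TAIL segment, and wraparound-safe comparisons, so it is a sketch of the semantics only.

	#include <stdio.h>

	enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEGS };

	struct seg_model {
		long count[NSEGS];		/* callbacks in each segment */
		unsigned long gp_seq[NSEGS];	/* GP after which WAIT/NEXT_READY are done */
	};

	/* Move every segment whose grace period has elapsed into SEG_DONE. */
	static void model_advance(struct seg_model *m, unsigned long completed)
	{
		int i;

		for (i = SEG_WAIT; i <= SEG_NEXT_READY; i++) {
			if (completed < m->gp_seq[i])
				break;		/* this GP (and later ones) not yet done */
			m->count[SEG_DONE] += m->count[i];
			m->count[i] = 0;
		}
	}

	static void model_print(const struct seg_model *m, const char *when)
	{
		printf("%s: done=%ld wait=%ld next_ready=%ld next=%ld\n", when,
		       m->count[SEG_DONE], m->count[SEG_WAIT],
		       m->count[SEG_NEXT_READY], m->count[SEG_NEXT]);
	}

	int main(void)
	{
		struct seg_model m = {
			.count  = { 0, 3, 2, 5 },	/* 3 callbacks wait on GP 8, 2 on GP 12 */
			.gp_seq = { 0, 8, 12, 0 },
		};

		model_print(&m, "before");
		model_advance(&m, 8);	/* GP 8 has completed */
		model_print(&m, "after GP 8");
		model_advance(&m, 12);	/* GP 12 has completed */
		model_print(&m, "after GP 12");
		return 0;
	}
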
423/*
424 * "Accelerate" callbacks based on more-accurate grace-period information.
425 * The reason for this is that RCU does not synchronize the beginnings and
426 * ends of grace periods, and that callbacks are posted locally. This in
427 * turn means that the callbacks must be labelled conservatively early
428 * on, as getting exact information would degrade both performance and
429 * scalability. When more accurate grace-period information becomes
430 * available, previously posted callbacks can be "accelerated", marking
431 * them to complete at the end of the earlier grace period.
432 *
433 * This function operates on an rcu_segcblist structure, and also the
434 * grace-period sequence number seq at which new callbacks would become
435 * ready to invoke. Returns true if there are callbacks that won't be
436 * ready to invoke until seq, false otherwise.
437 */
438bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
439{
440 int i;
441
442 WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
443 if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
444 return false;
445
446 /*
447 * Find the segment preceding the oldest segment of callbacks
448 * whose ->gp_seq[] completion is at or after that passed in via
449 * "seq", skipping any empty segments. This oldest segment, along
450 * with any later segments, can be merged in with any newly arrived
451 * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
452 * as their ->gp_seq[] grace-period completion sequence number.
453 */
454 for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
455 if (rsclp->tails[i] != rsclp->tails[i - 1] &&
456 ULONG_CMP_LT(rsclp->gp_seq[i], seq))
457 break;
458
459 /*
460 * If all the segments contain callbacks that correspond to
461 * earlier grace-period sequence numbers than "seq", leave.
462 * Assuming that the rcu_segcblist structure has enough
463 * segments in its arrays, this can only happen if some of
464 * the non-done segments contain callbacks that really are
465 * ready to invoke. This situation will get straightened
466 * out by the next call to rcu_segcblist_advance().
467 *
468 * Also advance to the oldest segment of callbacks whose
469 * ->gp_seq[] completion is at or after that passed in via "seq",
470 * skipping any empty segments.
471 */
472 if (++i >= RCU_NEXT_TAIL)
473 return false;
474
475 /*
476 * Merge all later callbacks, including newly arrived callbacks,
477 * into the segment located by the for-loop above. Assign "seq"
478 * as the ->gp_seq[] value in order to correctly handle the case
479 * where there were no pending callbacks in the rcu_segcblist
480 * structure other than in the RCU_NEXT_TAIL segment.
481 */
482 for (; i < RCU_NEXT_TAIL; i++) {
483 rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
484 rsclp->gp_seq[i] = seq;
485 }
486 return true;
487}
488
489/*
490 * Scan the specified rcu_segcblist structure for callbacks that need
491 * a grace period later than the one specified by "seq". We don't look
492 * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
493 * have a grace-period sequence number.
494 */
495bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
496 unsigned long seq)
497{
498 int i;
499
500 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
501 if (rsclp->tails[i - 1] != rsclp->tails[i] &&
502 ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
503 return true;
504 return false;
505}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
new file mode 100644
index 000000000000..6e36e36478cd
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.h
@@ -0,0 +1,164 @@
1/*
2 * RCU segmented callback lists, internal-to-rcu header file
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2017
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#include <linux/rcu_segcblist.h>
24
25/*
26 * Account for the fact that a previously dequeued callback turned out
27 * to be marked as lazy.
28 */
29static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
30{
31 rclp->len_lazy--;
32}
33
34/*
35 * Interim function to return rcu_cblist head pointer. Longer term, the
36 * rcu_cblist will be used more pervasively, removing the need for this
37 * function.
38 */
39static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
40{
41 return rclp->head;
42}
43
44/*
 45 * Interim function to return rcu_cblist tail pointer. Longer term, the
46 * rcu_cblist will be used more pervasively, removing the need for this
47 * function.
48 */
49static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
50{
51 WARN_ON_ONCE(!rclp->head);
52 return rclp->tail;
53}
54
55void rcu_cblist_init(struct rcu_cblist *rclp);
56long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
57struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
58
59/*
60 * Is the specified rcu_segcblist structure empty?
61 *
62 * But careful! The fact that the ->head field is NULL does not
63 * necessarily imply that there are no callbacks associated with
64 * this structure. When callbacks are being invoked, they are
65 * removed as a group. If callback invocation must be preempted,
66 * the remaining callbacks will be added back to the list. Either
67 * way, the counts are updated later.
68 *
69 * So it is often the case that rcu_segcblist_n_cbs() should be used
70 * instead.
71 */
72static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
73{
74 return !rsclp->head;
75}
76
77/* Return number of callbacks in segmented callback list. */
78static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
79{
80 return READ_ONCE(rsclp->len);
81}
82
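
As the comment above rcu_segcblist_empty() warns, code that wants to know whether callbacks are still outstanding should look at the count, not at ->head. A purely illustrative helper (not part of this patch) that relies only on the accessors defined here:

	/* Illustration only: prefer the count over the transiently-NULL ->head. */
	static bool my_cbs_outstanding(struct rcu_segcblist *rsclp)
	{
		/*
		 * ->head can be NULL while a batch of callbacks is being
		 * invoked even though those callbacks are still accounted
		 * for, because the counts are only adjusted afterwards.
		 */
		return rcu_segcblist_n_cbs(rsclp) != 0;
	}
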
83/* Return number of lazy callbacks in segmented callback list. */
84static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
85{
86 return rsclp->len_lazy;
87}
88
 89/* Return number of non-lazy callbacks in segmented callback list. */
90static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
91{
92 return rsclp->len - rsclp->len_lazy;
93}
94
95/*
96 * Is the specified rcu_segcblist enabled, for example, not corresponding
97 * to an offline or callback-offloaded CPU?
98 */
99static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
100{
101 return !!rsclp->tails[RCU_NEXT_TAIL];
102}
103
104/*
105 * Are all segments following the specified segment of the specified
106 * rcu_segcblist structure empty of callbacks? (The specified
107 * segment might well contain callbacks.)
108 */
109static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
110{
111 return !*rsclp->tails[seg];
112}
113
114/*
115 * Interim function to return rcu_segcblist head pointer. Longer term, the
116 * rcu_segcblist will be used more pervasively, removing the need for this
117 * function.
118 */
119static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
120{
121 return rsclp->head;
122}
123
124/*
 125 * Interim function to return rcu_segcblist tail pointer. Longer term, the
126 * rcu_segcblist will be used more pervasively, removing the need for this
127 * function.
128 */
129static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
130{
131 WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
132 return rsclp->tails[RCU_NEXT_TAIL];
133}
134
135void rcu_segcblist_init(struct rcu_segcblist *rsclp);
136void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
137bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
138bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
139bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
140struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
141void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
142struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
143struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
144bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
145void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
146 struct rcu_head *rhp, bool lazy);
147bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
148 struct rcu_head *rhp, bool lazy);
149void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
150 struct rcu_cblist *rclp);
151void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
152 struct rcu_cblist *rclp);
153void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
154 struct rcu_cblist *rclp);
155void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
156 struct rcu_cblist *rclp);
157void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
158 struct rcu_cblist *rclp);
159void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
160 struct rcu_cblist *rclp);
161void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
162bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
163bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
164 unsigned long seq);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index cccc417a8135..ae6e574d4cf5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -559,19 +559,34 @@ static void srcu_torture_barrier(void)
559 559
560static void srcu_torture_stats(void) 560static void srcu_torture_stats(void)
561{ 561{
562 int cpu; 562 int __maybe_unused cpu;
563 int idx = srcu_ctlp->completed & 0x1; 563 int idx;
564 564
565 pr_alert("%s%s per-CPU(idx=%d):", 565#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
566#ifdef CONFIG_TREE_SRCU
567 idx = srcu_ctlp->srcu_idx & 0x1;
568#else /* #ifdef CONFIG_TREE_SRCU */
569 idx = srcu_ctlp->completed & 0x1;
570#endif /* #else #ifdef CONFIG_TREE_SRCU */
571 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
566 torture_type, TORTURE_FLAG, idx); 572 torture_type, TORTURE_FLAG, idx);
567 for_each_possible_cpu(cpu) { 573 for_each_possible_cpu(cpu) {
568 unsigned long l0, l1; 574 unsigned long l0, l1;
569 unsigned long u0, u1; 575 unsigned long u0, u1;
570 long c0, c1; 576 long c0, c1;
571 struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); 577#ifdef CONFIG_TREE_SRCU
578 struct srcu_data *counts;
572 579
580 counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
581 u0 = counts->srcu_unlock_count[!idx];
582 u1 = counts->srcu_unlock_count[idx];
583#else /* #ifdef CONFIG_TREE_SRCU */
584 struct srcu_array *counts;
585
586 counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
573 u0 = counts->unlock_count[!idx]; 587 u0 = counts->unlock_count[!idx];
574 u1 = counts->unlock_count[idx]; 588 u1 = counts->unlock_count[idx];
589#endif /* #else #ifdef CONFIG_TREE_SRCU */
575 590
576 /* 591 /*
577 * Make sure that a lock is always counted if the corresponding 592 * Make sure that a lock is always counted if the corresponding
@@ -579,14 +594,26 @@ static void srcu_torture_stats(void)
579 */ 594 */
580 smp_rmb(); 595 smp_rmb();
581 596
597#ifdef CONFIG_TREE_SRCU
598 l0 = counts->srcu_lock_count[!idx];
599 l1 = counts->srcu_lock_count[idx];
600#else /* #ifdef CONFIG_TREE_SRCU */
582 l0 = counts->lock_count[!idx]; 601 l0 = counts->lock_count[!idx];
583 l1 = counts->lock_count[idx]; 602 l1 = counts->lock_count[idx];
603#endif /* #else #ifdef CONFIG_TREE_SRCU */
584 604
585 c0 = l0 - u0; 605 c0 = l0 - u0;
586 c1 = l1 - u1; 606 c1 = l1 - u1;
587 pr_cont(" %d(%ld,%ld)", cpu, c0, c1); 607 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
588 } 608 }
589 pr_cont("\n"); 609 pr_cont("\n");
610#elif defined(CONFIG_TINY_SRCU)
611 idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
612 pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n",
613 torture_type, TORTURE_FLAG, idx,
614 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
615 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
616#endif
590} 617}
591 618
592static void srcu_torture_synchronize_expedited(void) 619static void srcu_torture_synchronize_expedited(void)
@@ -1333,12 +1360,14 @@ rcu_torture_stats_print(void)
1333 cur_ops->stats(); 1360 cur_ops->stats();
1334 if (rtcv_snap == rcu_torture_current_version && 1361 if (rtcv_snap == rcu_torture_current_version &&
1335 rcu_torture_current != NULL) { 1362 rcu_torture_current != NULL) {
1336 int __maybe_unused flags; 1363 int __maybe_unused flags = 0;
1337 unsigned long __maybe_unused gpnum; 1364 unsigned long __maybe_unused gpnum = 0;
1338 unsigned long __maybe_unused completed; 1365 unsigned long __maybe_unused completed = 0;
1339 1366
1340 rcutorture_get_gp_data(cur_ops->ttype, 1367 rcutorture_get_gp_data(cur_ops->ttype,
1341 &flags, &gpnum, &completed); 1368 &flags, &gpnum, &completed);
1369 srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
1370 &flags, &gpnum, &completed);
1342 wtp = READ_ONCE(writer_task); 1371 wtp = READ_ONCE(writer_task);
1343 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", 1372 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
1344 rcu_torture_writer_state_getname(), 1373 rcu_torture_writer_state_getname(),
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index ef3bcfb15b39..584d8a983883 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -22,7 +22,7 @@
22 * Lai Jiangshan <laijs@cn.fujitsu.com> 22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 * 23 *
24 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
26 * 26 *
27 */ 27 */
28 28
@@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp)
243 * cleanup_srcu_struct - deconstruct a sleep-RCU structure 243 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
244 * @sp: structure to clean up. 244 * @sp: structure to clean up.
245 * 245 *
246 * Must invoke this after you are finished using a given srcu_struct that 246 * Must invoke this only after you are finished using a given srcu_struct
247 * was initialized via init_srcu_struct(), else you leak memory. 247 * that was initialized via init_srcu_struct(). This code does some
 248 * probabilistic checking, spotting late uses of srcu_read_lock(),
249 * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
250 * If any such late uses are detected, the per-CPU memory associated with
251 * the srcu_struct is simply leaked and WARN_ON() is invoked. If the
252 * caller frees the srcu_struct itself, a use-after-free crash will likely
253 * ensue, but at least there will be a warning printed.
248 */ 254 */
249void cleanup_srcu_struct(struct srcu_struct *sp) 255void cleanup_srcu_struct(struct srcu_struct *sp)
250{ 256{
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
new file mode 100644
index 000000000000..36e1f82faed1
--- /dev/null
+++ b/kernel/rcu/srcutiny.c
@@ -0,0 +1,216 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * tiny version for non-preemptible single-CPU use.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#include <linux/export.h>
25#include <linux/mutex.h>
26#include <linux/preempt.h>
27#include <linux/rcupdate_wait.h>
28#include <linux/sched.h>
29#include <linux/delay.h>
30#include <linux/srcu.h>
31
32#include <linux/rcu_node_tree.h>
33#include "rcu_segcblist.h"
34#include "rcu.h"
35
36static int init_srcu_struct_fields(struct srcu_struct *sp)
37{
38 sp->srcu_lock_nesting[0] = 0;
39 sp->srcu_lock_nesting[1] = 0;
40 init_swait_queue_head(&sp->srcu_wq);
41 sp->srcu_gp_seq = 0;
42 rcu_segcblist_init(&sp->srcu_cblist);
43 sp->srcu_gp_running = false;
44 sp->srcu_gp_waiting = false;
45 sp->srcu_idx = 0;
46 INIT_WORK(&sp->srcu_work, srcu_drive_gp);
47 return 0;
48}
49
50#ifdef CONFIG_DEBUG_LOCK_ALLOC
51
52int __init_srcu_struct(struct srcu_struct *sp, const char *name,
53 struct lock_class_key *key)
54{
55 /* Don't re-initialize a lock while it is held. */
56 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
57 lockdep_init_map(&sp->dep_map, name, key, 0);
58 return init_srcu_struct_fields(sp);
59}
60EXPORT_SYMBOL_GPL(__init_srcu_struct);
61
62#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
63
64/*
65 * init_srcu_struct - initialize a sleep-RCU structure
66 * @sp: structure to initialize.
67 *
68 * Must invoke this on a given srcu_struct before passing that srcu_struct
69 * to any other function. Each srcu_struct represents a separate domain
70 * of SRCU protection.
71 */
72int init_srcu_struct(struct srcu_struct *sp)
73{
74 return init_srcu_struct_fields(sp);
75}
76EXPORT_SYMBOL_GPL(init_srcu_struct);
77
78#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
79
80/*
81 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
82 * @sp: structure to clean up.
83 *
84 * Must invoke this after you are finished using a given srcu_struct that
85 * was initialized via init_srcu_struct(), else you leak memory.
86 */
87void cleanup_srcu_struct(struct srcu_struct *sp)
88{
89 WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
90 flush_work(&sp->srcu_work);
91 WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
92 WARN_ON(sp->srcu_gp_running);
93 WARN_ON(sp->srcu_gp_waiting);
94 WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist));
95}
96EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
97
98/*
 99 * Counts the new reader in the appropriate element of the
100 * srcu_struct. Must be called from process context.
101 * Returns an index that must be passed to the matching srcu_read_unlock().
102 */
103int __srcu_read_lock(struct srcu_struct *sp)
104{
105 int idx;
106
107 idx = READ_ONCE(sp->srcu_idx);
108 WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
109 return idx;
110}
111EXPORT_SYMBOL_GPL(__srcu_read_lock);
112
113/*
114 * Removes the count for the old reader from the appropriate element of
115 * the srcu_struct. Must be called from process context.
116 */
117void __srcu_read_unlock(struct srcu_struct *sp, int idx)
118{
119 int newval = sp->srcu_lock_nesting[idx] - 1;
120
121 WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
122 if (!newval && READ_ONCE(sp->srcu_gp_waiting))
123 swake_up(&sp->srcu_wq);
124}
125EXPORT_SYMBOL_GPL(__srcu_read_unlock);
126
127/*
128 * Workqueue handler to drive one grace period and invoke any callbacks
129 * that become ready as a result. Single-CPU and !PREEMPT operation
130 * means that we get away with murder on synchronization. ;-)
131 */
132void srcu_drive_gp(struct work_struct *wp)
133{
134 int idx;
135 struct rcu_cblist ready_cbs;
136 struct srcu_struct *sp;
137 struct rcu_head *rhp;
138
139 sp = container_of(wp, struct srcu_struct, srcu_work);
140 if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist))
141 return; /* Already running or nothing to do. */
142
143 /* Tag recently arrived callbacks and wait for readers. */
144 WRITE_ONCE(sp->srcu_gp_running, true);
145 rcu_segcblist_accelerate(&sp->srcu_cblist,
146 rcu_seq_snap(&sp->srcu_gp_seq));
147 rcu_seq_start(&sp->srcu_gp_seq);
148 idx = sp->srcu_idx;
149 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
150 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
151 swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
152 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
153 rcu_seq_end(&sp->srcu_gp_seq);
154
155 /* Update callback list based on GP, and invoke ready callbacks. */
156 rcu_segcblist_advance(&sp->srcu_cblist,
157 rcu_seq_current(&sp->srcu_gp_seq));
158 if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) {
159 rcu_cblist_init(&ready_cbs);
160 local_irq_disable();
161 rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
162 local_irq_enable();
163 rhp = rcu_cblist_dequeue(&ready_cbs);
164 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
165 local_bh_disable();
166 rhp->func(rhp);
167 local_bh_enable();
168 }
169 local_irq_disable();
170 rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
171 local_irq_enable();
172 }
173 WRITE_ONCE(sp->srcu_gp_running, false);
174
175 /*
176 * If more callbacks, reschedule ourselves. This can race with
177 * a call_srcu() at interrupt level, but the ->srcu_gp_running
178 * checks will straighten that out.
179 */
180 if (!rcu_segcblist_empty(&sp->srcu_cblist))
181 schedule_work(&sp->srcu_work);
182}
183EXPORT_SYMBOL_GPL(srcu_drive_gp);
184
185/*
186 * Enqueue an SRCU callback on the specified srcu_struct structure,
187 * initiating grace-period processing if it is not already running.
188 */
189void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
190 rcu_callback_t func)
191{
192 unsigned long flags;
193
194 head->func = func;
195 local_irq_save(flags);
196 rcu_segcblist_enqueue(&sp->srcu_cblist, head, false);
197 local_irq_restore(flags);
198 if (!READ_ONCE(sp->srcu_gp_running))
199 schedule_work(&sp->srcu_work);
200}
201EXPORT_SYMBOL_GPL(call_srcu);
202
203/*
204 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
205 */
206void synchronize_srcu(struct srcu_struct *sp)
207{
208 struct rcu_synchronize rs;
209
210 init_rcu_head_on_stack(&rs.head);
211 init_completion(&rs.completion);
212 call_srcu(sp, &rs.head, wakeme_after_rcu);
213 wait_for_completion(&rs.completion);
214 destroy_rcu_head_on_stack(&rs.head);
215}
216EXPORT_SYMBOL_GPL(synchronize_srcu);
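
For context, here is a hedged sketch of how a caller might use this API; the reader side goes through the usual srcu_read_lock()/srcu_read_unlock() wrappers from linux/srcu.h, which land in the __srcu_read_lock()/__srcu_read_unlock() functions above. struct my_data, my_srcu, my_reader(), my_update() and the free callback are invented for illustration and are not part of this patch.

	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include <linux/srcu.h>

	struct my_data {
		int value;
		struct rcu_head rh;
	};

	DEFINE_STATIC_SRCU(my_srcu);
	static struct my_data __rcu *my_ptr;

	/* Reader: sleeping is allowed inside the SRCU read-side critical section. */
	static int my_reader(void)
	{
		struct my_data *p;
		int idx, val = -1;

		idx = srcu_read_lock(&my_srcu);
		p = srcu_dereference(my_ptr, &my_srcu);
		if (p)
			val = p->value;
		srcu_read_unlock(&my_srcu, idx);
		return val;
	}

	static void my_free_cb(struct rcu_head *rh)
	{
		kfree(container_of(rh, struct my_data, rh));
	}

	/* Updater: publish the new version, then let a grace period reclaim the old. */
	static void my_update(struct my_data *newp)
	{
		struct my_data *old;

		old = rcu_dereference_protected(my_ptr, 1);	/* updates serialized by caller */
		rcu_assign_pointer(my_ptr, newp);
		if (old)
			call_srcu(&my_srcu, &old->rh, my_free_cb);
		/* A synchronous updater would instead do: synchronize_srcu(&my_srcu); kfree(old); */
	}
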
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
new file mode 100644
index 000000000000..3ae8474557df
--- /dev/null
+++ b/kernel/rcu/srcutree.c
@@ -0,0 +1,1155 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 *
24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt
26 *
27 */
28
29#include <linux/export.h>
30#include <linux/mutex.h>
31#include <linux/percpu.h>
32#include <linux/preempt.h>
33#include <linux/rcupdate_wait.h>
34#include <linux/sched.h>
35#include <linux/smp.h>
36#include <linux/delay.h>
37#include <linux/module.h>
38#include <linux/srcu.h>
39
40#include "rcu.h"
41#include "rcu_segcblist.h"
42
43ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */
44module_param(exp_holdoff, ulong, 0444);
45
46static void srcu_invoke_callbacks(struct work_struct *work);
47static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
48
49/*
50 * Initialize SRCU combining tree. Note that statically allocated
51 * srcu_struct structures might already have srcu_read_lock() and
52 * srcu_read_unlock() running against them. So if the is_static parameter
53 * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
54 */
55static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
56{
57 int cpu;
58 int i;
59 int level = 0;
60 int levelspread[RCU_NUM_LVLS];
61 struct srcu_data *sdp;
62 struct srcu_node *snp;
63 struct srcu_node *snp_first;
64
65 /* Work out the overall tree geometry. */
66 sp->level[0] = &sp->node[0];
67 for (i = 1; i < rcu_num_lvls; i++)
68 sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
69 rcu_init_levelspread(levelspread, num_rcu_lvl);
70
71 /* Each pass through this loop initializes one srcu_node structure. */
72 rcu_for_each_node_breadth_first(sp, snp) {
73 spin_lock_init(&snp->lock);
74 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
75 ARRAY_SIZE(snp->srcu_data_have_cbs));
76 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
77 snp->srcu_have_cbs[i] = 0;
78 snp->srcu_data_have_cbs[i] = 0;
79 }
80 snp->srcu_gp_seq_needed_exp = 0;
81 snp->grplo = -1;
82 snp->grphi = -1;
83 if (snp == &sp->node[0]) {
84 /* Root node, special case. */
85 snp->srcu_parent = NULL;
86 continue;
87 }
88
89 /* Non-root node. */
90 if (snp == sp->level[level + 1])
91 level++;
92 snp->srcu_parent = sp->level[level - 1] +
93 (snp - sp->level[level]) /
94 levelspread[level - 1];
95 }
96
97 /*
98 * Initialize the per-CPU srcu_data array, which feeds into the
99 * leaves of the srcu_node tree.
100 */
101 WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
102 ARRAY_SIZE(sdp->srcu_unlock_count));
103 level = rcu_num_lvls - 1;
104 snp_first = sp->level[level];
105 for_each_possible_cpu(cpu) {
106 sdp = per_cpu_ptr(sp->sda, cpu);
107 spin_lock_init(&sdp->lock);
108 rcu_segcblist_init(&sdp->srcu_cblist);
109 sdp->srcu_cblist_invoking = false;
110 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
111 sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq;
112 sdp->mynode = &snp_first[cpu / levelspread[level]];
113 for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
114 if (snp->grplo < 0)
115 snp->grplo = cpu;
116 snp->grphi = cpu;
117 }
118 sdp->cpu = cpu;
119 INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
120 sdp->sp = sp;
121 sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
122 if (is_static)
123 continue;
124
125 /* Dynamically allocated, better be no srcu_read_locks()! */
126 for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
127 sdp->srcu_lock_count[i] = 0;
128 sdp->srcu_unlock_count[i] = 0;
129 }
130 }
131}
132
133/*
134 * Initialize non-compile-time initialized fields, including the
135 * associated srcu_node and srcu_data structures. The is_static
136 * parameter is passed through to init_srcu_struct_nodes(), and
137 * also tells us that ->sda has already been wired up to srcu_data.
138 */
139static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
140{
141 mutex_init(&sp->srcu_cb_mutex);
142 mutex_init(&sp->srcu_gp_mutex);
143 sp->srcu_idx = 0;
144 sp->srcu_gp_seq = 0;
145 sp->srcu_barrier_seq = 0;
146 mutex_init(&sp->srcu_barrier_mutex);
147 atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
148 INIT_DELAYED_WORK(&sp->work, process_srcu);
149 if (!is_static)
150 sp->sda = alloc_percpu(struct srcu_data);
151 init_srcu_struct_nodes(sp, is_static);
152 sp->srcu_gp_seq_needed_exp = 0;
153 sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
154 smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
155 return sp->sda ? 0 : -ENOMEM;
156}
157
158#ifdef CONFIG_DEBUG_LOCK_ALLOC
159
160int __init_srcu_struct(struct srcu_struct *sp, const char *name,
161 struct lock_class_key *key)
162{
163 /* Don't re-initialize a lock while it is held. */
164 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
165 lockdep_init_map(&sp->dep_map, name, key, 0);
166 spin_lock_init(&sp->gp_lock);
167 return init_srcu_struct_fields(sp, false);
168}
169EXPORT_SYMBOL_GPL(__init_srcu_struct);
170
171#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
172
173/**
174 * init_srcu_struct - initialize a sleep-RCU structure
175 * @sp: structure to initialize.
176 *
177 * Must invoke this on a given srcu_struct before passing that srcu_struct
178 * to any other function. Each srcu_struct represents a separate domain
179 * of SRCU protection.
180 */
181int init_srcu_struct(struct srcu_struct *sp)
182{
183 spin_lock_init(&sp->gp_lock);
184 return init_srcu_struct_fields(sp, false);
185}
186EXPORT_SYMBOL_GPL(init_srcu_struct);
187
188#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
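/*
 * Lifecycle sketch for a dynamically allocated SRCU domain (illustrative
 * only, not part of the implementation; the names below are hypothetical).
 */
#if 0	/* Example only; never built. */
static struct srcu_struct example_srcu;

static int __init example_init(void)
{
	/* Must complete before example_srcu is passed to any other SRCU API. */
	return init_srcu_struct(&example_srcu);
}

static void __exit example_exit(void)
{
	/*
	 * All readers and all call_srcu() callbacks must have finished
	 * before cleanup, otherwise memory is leaked (see the WARN_ON()s
	 * in cleanup_srcu_struct()).
	 */
	cleanup_srcu_struct(&example_srcu);
}
#endif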
189
190/*
191 * First-use initialization of statically allocated srcu_struct
192 * structure. Wiring up the combining tree is more than can be
193 * done with compile-time initialization, so this check is added
194 * to each update-side SRCU primitive. Use ->gp_lock, which -is-
195 * compile-time initialized, to resolve races involving multiple
196 * CPUs trying to garner first-use privileges.
197 */
198static void check_init_srcu_struct(struct srcu_struct *sp)
199{
200 unsigned long flags;
201
202 WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT);
203 /* The smp_load_acquire() pairs with the smp_store_release(). */
204 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
205 return; /* Already initialized. */
206 spin_lock_irqsave(&sp->gp_lock, flags);
207 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
208 spin_unlock_irqrestore(&sp->gp_lock, flags);
209 return;
210 }
211 init_srcu_struct_fields(sp, true);
212 spin_unlock_irqrestore(&sp->gp_lock, flags);
213}
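/*
 * For comparison, a statically allocated domain is normally created with
 * DEFINE_SRCU() or DEFINE_STATIC_SRCU() from the SRCU headers; the first
 * update-side call then runs check_init_srcu_struct() above to finish
 * wiring up the combining tree. The sketch below is illustrative only.
 */
#if 0	/* Example only; never built. */
DEFINE_STATIC_SRCU(example_static_srcu);	/* No init_srcu_struct() needed. */
#endif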
214
215/*
216 * Returns approximate total of the readers' ->srcu_lock_count[] values
217 * for the rank of per-CPU counters specified by idx.
218 */
219static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
220{
221 int cpu;
222 unsigned long sum = 0;
223
224 for_each_possible_cpu(cpu) {
225 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
226
227 sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
228 }
229 return sum;
230}
231
232/*
233 * Returns approximate total of the readers' ->srcu_unlock_count[] values
234 * for the rank of per-CPU counters specified by idx.
235 */
236static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
237{
238 int cpu;
239 unsigned long sum = 0;
240
241 for_each_possible_cpu(cpu) {
242 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
243
244 sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
245 }
246 return sum;
247}
248
249/*
250 * Return true if the number of pre-existing readers is determined to
251 * be zero.
252 */
253static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
254{
255 unsigned long unlocks;
256
257 unlocks = srcu_readers_unlock_idx(sp, idx);
258
259 /*
260 * Make sure that a lock is always counted if the corresponding
261 * unlock is counted. Needs to be a smp_mb() as the read side may
262 * contain a read from a variable that is written to before the
263 * synchronize_srcu() in the write side. In this case smp_mb()s
264 * A and B act like the store buffering pattern.
265 *
266 * This smp_mb() also pairs with smp_mb() C to prevent accesses
267 * after the synchronize_srcu() from being executed before the
268 * grace period ends.
269 */
270 smp_mb(); /* A */
271
272 /*
273 * If the locks are the same as the unlocks, then there must have
274 * been no readers on this index at some time in between. This does
275 * not mean that there are no more readers, as one could have read
276 * the current index but not have incremented the lock counter yet.
277 *
278 * Possible bug: There is no guarantee that there haven't been
279 * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
280 * counted, meaning that this could return true even if there are
281 * still active readers. Since there are no memory barriers around
282 * srcu_flip(), the CPU is not required to increment ->srcu_idx
283 * before running srcu_readers_unlock_idx(), which means that there
284 * could be an arbitrarily large number of critical sections that
285 * execute after srcu_readers_unlock_idx() but use the old value
286 * of ->srcu_idx.
287 */
288 return srcu_readers_lock_idx(sp, idx) == unlocks;
289}
290
291/**
292 * srcu_readers_active - returns true if there are readers, and false
293 * otherwise.
294 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
295 *
296 * Note that this is not an atomic primitive, and can therefore suffer
297 * severe errors when invoked on an active srcu_struct. That said, it
298 * can be useful as an error check at cleanup time.
299 */
300static bool srcu_readers_active(struct srcu_struct *sp)
301{
302 int cpu;
303 unsigned long sum = 0;
304
305 for_each_possible_cpu(cpu) {
306 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
307
308 sum += READ_ONCE(cpuc->srcu_lock_count[0]);
309 sum += READ_ONCE(cpuc->srcu_lock_count[1]);
310 sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
311 sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
312 }
313 return sum;
314}
315
316#define SRCU_INTERVAL 1
317
318/*
319 * Return grace-period delay, zero if there are expedited grace
320 * periods pending, SRCU_INTERVAL otherwise.
321 */
322static unsigned long srcu_get_delay(struct srcu_struct *sp)
323{
324 if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq),
325 READ_ONCE(sp->srcu_gp_seq_needed_exp)))
326 return 0;
327 return SRCU_INTERVAL;
328}
329
330/**
331 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
332 * @sp: structure to clean up.
333 *
334 * Must invoke this after you are finished using a given srcu_struct that
335 * was initialized via init_srcu_struct(), else you leak memory.
336 */
337void cleanup_srcu_struct(struct srcu_struct *sp)
338{
339 int cpu;
340
341 if (WARN_ON(!srcu_get_delay(sp)))
342 return; /* Leakage unless caller handles error. */
343 if (WARN_ON(srcu_readers_active(sp)))
344 return; /* Leakage unless caller handles error. */
345 flush_delayed_work(&sp->work);
346 for_each_possible_cpu(cpu)
347 flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
348 if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
349 WARN_ON(srcu_readers_active(sp))) {
350 pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
351 return; /* Caller forgot to stop doing call_srcu()? */
352 }
353 free_percpu(sp->sda);
354 sp->sda = NULL;
355}
356EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
357
358/*
359 * Counts the new reader in the appropriate per-CPU element of the
360 * srcu_struct. Must be called from process context.
361 * Returns an index that must be passed to the matching srcu_read_unlock().
362 */
363int __srcu_read_lock(struct srcu_struct *sp)
364{
365 int idx;
366
367 idx = READ_ONCE(sp->srcu_idx) & 0x1;
368 __this_cpu_inc(sp->sda->srcu_lock_count[idx]);
369 smp_mb(); /* B */ /* Avoid leaking the critical section. */
370 return idx;
371}
372EXPORT_SYMBOL_GPL(__srcu_read_lock);
373
374/*
375 * Removes the count for the old reader from the appropriate per-CPU
376 * element of the srcu_struct. Note that this may well be a different
377 * CPU than that which was incremented by the corresponding srcu_read_lock().
378 * Must be called from process context.
379 */
380void __srcu_read_unlock(struct srcu_struct *sp, int idx)
381{
382 smp_mb(); /* C */ /* Avoid leaking the critical section. */
383 this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
384}
385EXPORT_SYMBOL_GPL(__srcu_read_unlock);
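/*
 * Read-side usage sketch (illustrative only): callers normally use the
 * srcu_read_lock()/srcu_read_unlock() wrappers from the SRCU headers,
 * which add lockdep annotations around the two functions above. The
 * structure and variable names below are hypothetical.
 */
#if 0	/* Example only; never built. */
struct example_item {
	int value;
};

static struct example_item __rcu *example_item_ptr;

static int example_read_value(struct srcu_struct *ssp)
{
	struct example_item *p;
	int idx;
	int ret = 0;

	idx = srcu_read_lock(ssp);	/* Bumps ->srcu_lock_count[idx]. */
	p = srcu_dereference(example_item_ptr, ssp);
	if (p)
		ret = p->value;
	srcu_read_unlock(ssp, idx);	/* Bumps ->srcu_unlock_count[idx]. */
	return ret;
}
#endif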
386
387/*
388 * We use an adaptive strategy for synchronize_srcu() and especially for
389 * synchronize_srcu_expedited(). We spin for a fixed time period
390 * (defined below) to allow SRCU readers to exit their read-side critical
391 * sections. If there are still some readers after a few microseconds,
392 * we repeatedly block for 1-millisecond time periods.
393 */
394#define SRCU_RETRY_CHECK_DELAY 5
395
396/*
397 * Start an SRCU grace period.
398 */
399static void srcu_gp_start(struct srcu_struct *sp)
400{
401 struct srcu_data *sdp = this_cpu_ptr(sp->sda);
402 int state;
403
404 RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
405 "Invoked srcu_gp_start() without ->gp_lock!");
406 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
407 rcu_segcblist_advance(&sdp->srcu_cblist,
408 rcu_seq_current(&sp->srcu_gp_seq));
409 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
410 rcu_seq_snap(&sp->srcu_gp_seq));
411 smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
412 rcu_seq_start(&sp->srcu_gp_seq);
413 state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
414 WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
415}
416
417/*
418 * Track online CPUs to guide callback workqueue placement.
419 */
420DEFINE_PER_CPU(bool, srcu_online);
421
422void srcu_online_cpu(unsigned int cpu)
423{
424 WRITE_ONCE(per_cpu(srcu_online, cpu), true);
425}
426
427void srcu_offline_cpu(unsigned int cpu)
428{
429 WRITE_ONCE(per_cpu(srcu_online, cpu), false);
430}
431
432/*
433 * Place the workqueue handler on the specified CPU if online, otherwise
434 * just run it wherever. This is useful for placing workqueue handlers
435 * that are to invoke the specified CPU's callbacks.
436 */
437static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
438 struct delayed_work *dwork,
439 unsigned long delay)
440{
441 bool ret;
442
443 preempt_disable();
444 if (READ_ONCE(per_cpu(srcu_online, cpu)))
445 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
446 else
447 ret = queue_delayed_work(wq, dwork, delay);
448 preempt_enable();
449 return ret;
450}
451
452/*
453 * Schedule callback invocation for the specified srcu_data structure,
454 * if possible, on the corresponding CPU.
455 */
456static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
457{
458 srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
459 &sdp->work, delay);
460}
461
462/*
463 * Schedule callback invocation for all srcu_data structures associated
464 * with the specified srcu_node structure that have callbacks for the
465 * just-completed grace period, the one corresponding to idx. If possible,
466 * schedule this invocation on the corresponding CPUs.
467 */
468static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
469 unsigned long mask, unsigned long delay)
470{
471 int cpu;
472
473 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
474 if (!(mask & (1 << (cpu - snp->grplo))))
475 continue;
476 srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay);
477 }
478}
479
480/*
481 * Note the end of an SRCU grace period. Initiates callback invocation
482 * and starts a new grace period if needed.
483 *
484 * The ->srcu_cb_mutex acquisition does not protect any data, but
485 * instead prevents more than one grace period from starting while we
486 * are initiating callback invocation. This allows the ->srcu_have_cbs[]
487 * array to have a finite number of elements.
488 */
489static void srcu_gp_end(struct srcu_struct *sp)
490{
491 unsigned long cbdelay;
492 bool cbs;
493 unsigned long gpseq;
494 int idx;
495 int idxnext;
496 unsigned long mask;
497 struct srcu_node *snp;
498
499 /* Prevent more than one additional grace period. */
500 mutex_lock(&sp->srcu_cb_mutex);
501
502 /* End the current grace period. */
503 spin_lock_irq(&sp->gp_lock);
504 idx = rcu_seq_state(sp->srcu_gp_seq);
505 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
506 cbdelay = srcu_get_delay(sp);
507 sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
508 rcu_seq_end(&sp->srcu_gp_seq);
509 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
510 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
511 sp->srcu_gp_seq_needed_exp = gpseq;
512 spin_unlock_irq(&sp->gp_lock);
513 mutex_unlock(&sp->srcu_gp_mutex);
514 /* A new grace period can start at this point. But only one. */
515
516 /* Initiate callback invocation as needed. */
517 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
518 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
519 rcu_for_each_node_breadth_first(sp, snp) {
520 spin_lock_irq(&snp->lock);
521 cbs = false;
522 if (snp >= sp->level[rcu_num_lvls - 1])
523 cbs = snp->srcu_have_cbs[idx] == gpseq;
524 snp->srcu_have_cbs[idx] = gpseq;
525 rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
526 if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
527 snp->srcu_gp_seq_needed_exp = gpseq;
528 mask = snp->srcu_data_have_cbs[idx];
529 snp->srcu_data_have_cbs[idx] = 0;
530 spin_unlock_irq(&snp->lock);
531 if (cbs) {
532 smp_mb(); /* GP end before CB invocation. */
533 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
534 }
535 }
536
537 /* Callback initiation done, allow grace periods after next. */
538 mutex_unlock(&sp->srcu_cb_mutex);
539
540 /* Start a new grace period if needed. */
541 spin_lock_irq(&sp->gp_lock);
542 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
543 if (!rcu_seq_state(gpseq) &&
544 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
545 srcu_gp_start(sp);
546 spin_unlock_irq(&sp->gp_lock);
547 /* Throttle expedited grace periods: Should be rare! */
548 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
549 ? 0 : SRCU_INTERVAL);
550 } else {
551 spin_unlock_irq(&sp->gp_lock);
552 }
553}
554
555/*
556 * Funnel-locking scheme to scalably mediate many concurrent expedited
557 * grace-period requests. This function is invoked for the first known
558 * expedited request for a grace period that has already been requested,
559 * but without expediting. To start a completely new grace period,
560 * whether expedited or not, use srcu_funnel_gp_start() instead.
561 */
562static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
563 unsigned long s)
564{
565 unsigned long flags;
566
567 for (; snp != NULL; snp = snp->srcu_parent) {
568 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
569 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
570 return;
571 spin_lock_irqsave(&snp->lock, flags);
572 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
573 spin_unlock_irqrestore(&snp->lock, flags);
574 return;
575 }
576 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
577 spin_unlock_irqrestore(&snp->lock, flags);
578 }
579 spin_lock_irqsave(&sp->gp_lock, flags);
580 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
581 sp->srcu_gp_seq_needed_exp = s;
582 spin_unlock_irqrestore(&sp->gp_lock, flags);
583}
584
585/*
586 * Funnel-locking scheme to scalably mediate many concurrent grace-period
587 * requests. The winner has to do the work of actually starting grace
588 * period s. Losers must either ensure that their desired grace-period
589 * number is recorded on at least their leaf srcu_node structure, or they
590 * must take steps to invoke their own callbacks.
591 */
592static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
593 unsigned long s, bool do_norm)
594{
595 unsigned long flags;
596 int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
597 struct srcu_node *snp = sdp->mynode;
598 unsigned long snp_seq;
599
600 /* Each pass through the loop does one level of the srcu_node tree. */
601 for (; snp != NULL; snp = snp->srcu_parent) {
602 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
603 return; /* GP already done and CBs recorded. */
604 spin_lock_irqsave(&snp->lock, flags);
605 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
606 snp_seq = snp->srcu_have_cbs[idx];
607 if (snp == sdp->mynode && snp_seq == s)
608 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
609 spin_unlock_irqrestore(&snp->lock, flags);
610 if (snp == sdp->mynode && snp_seq != s) {
611 smp_mb(); /* CBs after GP! */
612 srcu_schedule_cbs_sdp(sdp, do_norm
613 ? SRCU_INTERVAL
614 : 0);
615 return;
616 }
617 if (!do_norm)
618 srcu_funnel_exp_start(sp, snp, s);
619 return;
620 }
621 snp->srcu_have_cbs[idx] = s;
622 if (snp == sdp->mynode)
623 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
624 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
625 snp->srcu_gp_seq_needed_exp = s;
626 spin_unlock_irqrestore(&snp->lock, flags);
627 }
628
629 /* Top of tree, must ensure the grace period will be started. */
630 spin_lock_irqsave(&sp->gp_lock, flags);
631 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
632 /*
633 * Record need for grace period s. Pair with load
634 * acquire setting up for initialization.
635 */
636 smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
637 }
638 if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
639 sp->srcu_gp_seq_needed_exp = s;
640
641 /* If grace period not already done and none in progress, start it. */
642 if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
643 rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
644 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
645 srcu_gp_start(sp);
646 queue_delayed_work(system_power_efficient_wq, &sp->work,
647 srcu_get_delay(sp));
648 }
649 spin_unlock_irqrestore(&sp->gp_lock, flags);
650}
651
652/*
653 * Wait until all readers counted by array index idx complete, but
654 * loop an additional time if there is an expedited grace period pending.
655 * The caller must ensure that ->srcu_idx is not changed while checking.
656 */
657static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
658{
659 for (;;) {
660 if (srcu_readers_active_idx_check(sp, idx))
661 return true;
662 if (--trycount + !srcu_get_delay(sp) <= 0)
663 return false;
664 udelay(SRCU_RETRY_CHECK_DELAY);
665 }
666}
667
668/*
669 * Increment the ->srcu_idx counter so that future SRCU readers will
670 * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
671 * us to wait for pre-existing readers in a starvation-free manner.
672 */
673static void srcu_flip(struct srcu_struct *sp)
674{
675 WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
676
677 /*
678 * Ensure that if the updater misses an __srcu_read_unlock()
679 * increment, that task's next __srcu_read_lock() will see the
680 * above counter update. Note that both this memory barrier
681 * and the one in srcu_readers_active_idx_check() provide the
682 * guarantee for __srcu_read_lock().
683 */
684 smp_mb(); /* D */ /* Pairs with C. */
685}
686
687/*
688 * If SRCU is likely idle, return true, otherwise return false.
689 *
690 * Note that it is OK for several concurrent from-idle requests for a new
691 * grace period to specify expediting, because they will all end up
692 * requesting the same grace period anyhow. So no loss.
693 *
694 * Note also that if any CPU (including the current one) is still invoking
695 * callbacks, this function will nevertheless say "idle". This is not
696 * ideal, but the overhead of checking all CPUs' callback lists is even
697 * less ideal, especially on large systems. Furthermore, the wakeup
698 * can happen before the callback is fully removed, so we have no choice
699 * but to accept this type of error.
700 *
701 * This function is also subject to counter-wrap errors, but let's face
702 * it, if this function was preempted for enough time for the counters
703 * to wrap, it really doesn't matter whether or not we expedite the grace
704 * period. The extra overhead of a needlessly expedited grace period is
705 * negligible when amortized over that time period, and the extra latency
706 * of a needlessly non-expedited grace period is similarly negligible.
707 */
708static bool srcu_might_be_idle(struct srcu_struct *sp)
709{
710 unsigned long curseq;
711 unsigned long flags;
712 struct srcu_data *sdp;
713 unsigned long t;
714
715 /* If the local srcu_data structure has callbacks, not idle. */
716 local_irq_save(flags);
717 sdp = this_cpu_ptr(sp->sda);
718 if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
719 local_irq_restore(flags);
720 return false; /* Callbacks already present, so not idle. */
721 }
722 local_irq_restore(flags);
723
724 /*
725 * No local callbacks, so probabilistically probe global state.
726 * Exact information would require acquiring locks, which would
727 * kill scalability, hence the probabilistic nature of the probe.
728 */
729
730 /* First, see if enough time has passed since the last GP. */
731 t = ktime_get_mono_fast_ns();
732 if (exp_holdoff == 0 ||
733 time_in_range_open(t, sp->srcu_last_gp_end,
734 sp->srcu_last_gp_end + exp_holdoff))
735 return false; /* Too soon after last GP. */
736
737 /* Next, check for probable idleness. */
738 curseq = rcu_seq_current(&sp->srcu_gp_seq);
739 smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
740 if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed)))
741 return false; /* Grace period in progress, so not idle. */
742 smp_mb(); /* Order ->srcu_gp_seq with prior access. */
743 if (curseq != rcu_seq_current(&sp->srcu_gp_seq))
744 return false; /* GP # changed, so not idle. */
745 return true; /* With reasonable probability, idle! */
746}
747
748/*
749 * Enqueue an SRCU callback on the srcu_data structure associated with
750 * the current CPU and the specified srcu_struct structure, initiating
751 * grace-period processing if it is not already running.
752 *
753 * Note that all CPUs must agree that the grace period extended beyond
754 * all pre-existing SRCU read-side critical sections. On systems with
755 * more than one CPU, this means that when "func()" is invoked, each CPU
756 * is guaranteed to have executed a full memory barrier since the end of
757 * its last corresponding SRCU read-side critical section whose beginning
758 * preceded the call to call_srcu(). It also means that each CPU executing
759 * an SRCU read-side critical section that continues beyond the start of
760 * "func()" must have executed a memory barrier after the call_rcu()
761 * but before the beginning of that SRCU read-side critical section.
762 * Note that these guarantees include CPUs that are offline, idle, or
763 * executing in user mode, as well as CPUs that are executing in the kernel.
764 *
765 * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
766 * resulting SRCU callback function "func()", then both CPU A and CPU
767 * B are guaranteed to execute a full memory barrier during the time
768 * interval between the call to call_srcu() and the invocation of "func()".
769 * This guarantee applies even if CPU A and CPU B are the same CPU (but
770 * again only if the system has more than one CPU).
771 *
772 * Of course, these guarantees apply only for invocations of call_srcu(),
773 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
774 * srcu_struct structure.
775 */
776void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
777 rcu_callback_t func, bool do_norm)
778{
779 unsigned long flags;
780 bool needexp = false;
781 bool needgp = false;
782 unsigned long s;
783 struct srcu_data *sdp;
784
785 check_init_srcu_struct(sp);
786 rhp->func = func;
787 local_irq_save(flags);
788 sdp = this_cpu_ptr(sp->sda);
789 spin_lock(&sdp->lock);
790 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
791 rcu_segcblist_advance(&sdp->srcu_cblist,
792 rcu_seq_current(&sp->srcu_gp_seq));
793 s = rcu_seq_snap(&sp->srcu_gp_seq);
794 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
795 if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
796 sdp->srcu_gp_seq_needed = s;
797 needgp = true;
798 }
799 if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
800 sdp->srcu_gp_seq_needed_exp = s;
801 needexp = true;
802 }
803 spin_unlock_irqrestore(&sdp->lock, flags);
804 if (needgp)
805 srcu_funnel_gp_start(sp, sdp, s, do_norm);
806 else if (needexp)
807 srcu_funnel_exp_start(sp, sdp->mynode, s);
808}
809
810void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
811 rcu_callback_t func)
812{
813 __call_srcu(sp, rhp, func, true);
814}
815EXPORT_SYMBOL_GPL(call_srcu);
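/*
 * call_srcu() usage sketch (illustrative only): free an object once all
 * readers that might still reference it have finished. Assumes the
 * hypothetical struct example_item from the read-side sketch above, plus
 * an embedded rcu_head; kfree() comes from <linux/slab.h>.
 */
#if 0	/* Example only; never built. */
struct example_obj {
	struct rcu_head rh;
	struct example_item item;
};

static void example_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct example_obj, rh));
}

static void example_retire(struct srcu_struct *ssp, struct example_obj *p)
{
	/* Freeing is deferred until a full SRCU grace period has elapsed. */
	call_srcu(ssp, &p->rh, example_free_cb);
}
#endif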
816
817/*
818 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
819 */
820static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
821{
822 struct rcu_synchronize rcu;
823
824 RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
825 lock_is_held(&rcu_bh_lock_map) ||
826 lock_is_held(&rcu_lock_map) ||
827 lock_is_held(&rcu_sched_lock_map),
828 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
829
830 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
831 return;
832 might_sleep();
833 check_init_srcu_struct(sp);
834 init_completion(&rcu.completion);
835 init_rcu_head_on_stack(&rcu.head);
836 __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
837 wait_for_completion(&rcu.completion);
838 destroy_rcu_head_on_stack(&rcu.head);
839}
840
841/**
842 * synchronize_srcu_expedited - Brute-force SRCU grace period
843 * @sp: srcu_struct with which to synchronize.
844 *
845 * Wait for an SRCU grace period to elapse, but be more aggressive about
846 * spinning rather than blocking when waiting.
847 *
848 * Note that synchronize_srcu_expedited() has the same deadlock and
849 * memory-ordering properties as does synchronize_srcu().
850 */
851void synchronize_srcu_expedited(struct srcu_struct *sp)
852{
853 __synchronize_srcu(sp, rcu_gp_is_normal());
854}
855EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
856
857/**
858 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
859 * @sp: srcu_struct with which to synchronize.
860 *
861 * Wait for the count of both index ranks to drain to zero. To avoid
862 * possible starvation of synchronize_srcu(), it first waits for the
863 * count of index ((->srcu_idx & 1) ^ 1) to drain to zero, and then
864 * flips ->srcu_idx and waits for the count of the other index.
865 *
866 * Can block; must be called from process context.
867 *
868 * Note that it is illegal to call synchronize_srcu() from the corresponding
869 * SRCU read-side critical section; doing so will result in deadlock.
870 * However, it is perfectly legal to call synchronize_srcu() on one
871 * srcu_struct from some other srcu_struct's read-side critical section,
872 * as long as the resulting graph of srcu_structs is acyclic.
873 *
874 * There are memory-ordering constraints implied by synchronize_srcu().
875 * On systems with more than one CPU, when synchronize_srcu() returns,
876 * each CPU is guaranteed to have executed a full memory barrier since
877 * the end of its last corresponding SRCU read-side critical section
878 * whose beginning preceded the call to synchronize_srcu(). In addition,
879 * each CPU having an SRCU read-side critical section that extends beyond
880 * the return from synchronize_srcu() is guaranteed to have executed a
881 * full memory barrier after the beginning of synchronize_srcu() and before
882 * the beginning of that SRCU read-side critical section. Note that these
883 * guarantees include CPUs that are offline, idle, or executing in user mode,
884 * as well as CPUs that are executing in the kernel.
885 *
886 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
887 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
888 * to have executed a full memory barrier during the execution of
889 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
890 * are the same CPU, but again only if the system has more than one CPU.
891 *
892 * Of course, these memory-ordering guarantees apply only when
893 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
894 * passed the same srcu_struct structure.
895 *
896 * If SRCU is likely idle, expedite the first request. This semantic
897 * was provided by Classic SRCU, and is relied upon by its users, so TREE
898 * SRCU must also provide it. Note that detecting idleness is heuristic
899 * and subject to both false positives and negatives.
900 */
901void synchronize_srcu(struct srcu_struct *sp)
902{
903 if (srcu_might_be_idle(sp) || rcu_gp_is_expedited())
904 synchronize_srcu_expedited(sp);
905 else
906 __synchronize_srcu(sp, true);
907}
908EXPORT_SYMBOL_GPL(synchronize_srcu);
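/*
 * Synchronous updater sketch (illustrative only): unpublish an object,
 * wait for pre-existing readers with synchronize_srcu(), then free it
 * directly instead of deferring via call_srcu(). The caller is assumed
 * to hold the update-side lock; names are hypothetical.
 */
#if 0	/* Example only; never built. */
static void example_remove(struct srcu_struct *ssp,
			   struct example_item __rcu **slot,
			   struct example_item *old)
{
	rcu_assign_pointer(*slot, NULL);	/* Unpublish the object. */
	synchronize_srcu(ssp);			/* Wait out pre-existing readers. */
	kfree(old);				/* No reader can still reference it. */
}
#endif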
909
910/*
911 * Callback function for srcu_barrier() use.
912 */
913static void srcu_barrier_cb(struct rcu_head *rhp)
914{
915 struct srcu_data *sdp;
916 struct srcu_struct *sp;
917
918 sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
919 sp = sdp->sp;
920 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
921 complete(&sp->srcu_barrier_completion);
922}
923
924/**
925 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
926 * @sp: srcu_struct on which to wait for in-flight callbacks.
927 */
928void srcu_barrier(struct srcu_struct *sp)
929{
930 int cpu;
931 struct srcu_data *sdp;
932 unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);
933
934 check_init_srcu_struct(sp);
935 mutex_lock(&sp->srcu_barrier_mutex);
936 if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
937 smp_mb(); /* Force ordering following return. */
938 mutex_unlock(&sp->srcu_barrier_mutex);
939 return; /* Someone else did our work for us. */
940 }
941 rcu_seq_start(&sp->srcu_barrier_seq);
942 init_completion(&sp->srcu_barrier_completion);
943
944 /* Initial count prevents reaching zero until all CBs are posted. */
945 atomic_set(&sp->srcu_barrier_cpu_cnt, 1);
946
947 /*
948 * Each pass through this loop enqueues a callback, but only
949 * on CPUs already having callbacks enqueued. Note that if
950 * a CPU already has callbacks enqueued, it must have already
951 * registered the need for a future grace period, so all we
952 * need do is enqueue a callback that will use the same
953 * grace period as the last callback already in the queue.
954 */
955 for_each_possible_cpu(cpu) {
956 sdp = per_cpu_ptr(sp->sda, cpu);
957 spin_lock_irq(&sdp->lock);
958 atomic_inc(&sp->srcu_barrier_cpu_cnt);
959 sdp->srcu_barrier_head.func = srcu_barrier_cb;
960 if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
961 &sdp->srcu_barrier_head, 0))
962 atomic_dec(&sp->srcu_barrier_cpu_cnt);
963 spin_unlock_irq(&sdp->lock);
964 }
965
966 /* Remove the initial count, at which point reaching zero can happen. */
967 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
968 complete(&sp->srcu_barrier_completion);
969 wait_for_completion(&sp->srcu_barrier_completion);
970
971 rcu_seq_end(&sp->srcu_barrier_seq);
972 mutex_unlock(&sp->srcu_barrier_mutex);
973}
974EXPORT_SYMBOL_GPL(srcu_barrier);
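/*
 * Teardown-ordering sketch (illustrative only): once no new call_srcu()
 * invocations can occur, srcu_barrier() waits for the callbacks already
 * posted, after which cleanup_srcu_struct() is safe. Names hypothetical.
 */
#if 0	/* Example only; never built. */
static void example_domain_teardown(struct srcu_struct *ssp)
{
	/* Caller guarantees no further call_srcu() on ssp. */
	srcu_barrier(ssp);		/* Wait for in-flight callbacks. */
	cleanup_srcu_struct(ssp);	/* Now safe to tear down the domain. */
}
#endif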
975
976/**
977 * srcu_batches_completed - return batches completed.
978 * @sp: srcu_struct on which to report batch completion.
979 *
980 * Report the number of batches, correlated with, but not necessarily
981 * precisely the same as, the number of grace periods that have elapsed.
982 */
983unsigned long srcu_batches_completed(struct srcu_struct *sp)
984{
985 return sp->srcu_idx;
986}
987EXPORT_SYMBOL_GPL(srcu_batches_completed);
988
989/*
990 * Core SRCU state machine. Push state bits of ->srcu_gp_seq
991 * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
992 * completed in that state.
993 */
994static void srcu_advance_state(struct srcu_struct *sp)
995{
996 int idx;
997
998 mutex_lock(&sp->srcu_gp_mutex);
999
1000 /*
1001 * Because readers might be delayed for an extended period after
1002 * fetching ->srcu_idx for their index, at any point in time there
1003 * might well be readers using both idx=0 and idx=1. We therefore
1004 * need to wait for readers to clear from both index values before
1005 * invoking a callback.
1006 *
1007 * The load-acquire ensures that we see the accesses performed
1008 * by the prior grace period.
1009 */
1010 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
1011 if (idx == SRCU_STATE_IDLE) {
1012 spin_lock_irq(&sp->gp_lock);
1013 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
1014 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
1015 spin_unlock_irq(&sp->gp_lock);
1016 mutex_unlock(&sp->srcu_gp_mutex);
1017 return;
1018 }
1019 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
1020 if (idx == SRCU_STATE_IDLE)
1021 srcu_gp_start(sp);
1022 spin_unlock_irq(&sp->gp_lock);
1023 if (idx != SRCU_STATE_IDLE) {
1024 mutex_unlock(&sp->srcu_gp_mutex);
1025 return; /* Someone else started the grace period. */
1026 }
1027 }
1028
1029 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
1030 idx = 1 ^ (sp->srcu_idx & 1);
1031 if (!try_check_zero(sp, idx, 1)) {
1032 mutex_unlock(&sp->srcu_gp_mutex);
1033 return; /* readers present, retry later. */
1034 }
1035 srcu_flip(sp);
1036 rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
1037 }
1038
1039 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
1040
1041 /*
1042 * SRCU read-side critical sections are normally short,
1043 * so check at least twice in quick succession after a flip.
1044 */
1045 idx = 1 ^ (sp->srcu_idx & 1);
1046 if (!try_check_zero(sp, idx, 2)) {
1047 mutex_unlock(&sp->srcu_gp_mutex);
1048 return; /* readers present, retry later. */
1049 }
1050 srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
1051 }
1052}
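/*
 * Informational summary of the state machine driven by srcu_advance_state()
 * (derived from the code above):
 *
 *   SRCU_STATE_IDLE  -- No grace period in progress; srcu_gp_start()
 *                       moves to SRCU_STATE_SCAN1 when one is needed.
 *   SRCU_STATE_SCAN1 -- Wait for readers on the rank of
 *                       ->srcu_(un)lock_count[] that is not currently in
 *                       use, then srcu_flip() ->srcu_idx and move to
 *                       SRCU_STATE_SCAN2.
 *   SRCU_STATE_SCAN2 -- Wait for readers on the rank that was in use at
 *                       grace-period start, then srcu_gp_end() completes
 *                       the grace period and returns to SRCU_STATE_IDLE.
 */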
1053
1054/*
1055 * Invoke a limited number of SRCU callbacks that have passed through
1056 * their grace period. If there are more to do, SRCU will reschedule
1057 * the workqueue. Note that needed memory barriers have been executed
1058 * in this task's context by srcu_readers_active_idx_check().
1059 */
1060static void srcu_invoke_callbacks(struct work_struct *work)
1061{
1062 bool more;
1063 struct rcu_cblist ready_cbs;
1064 struct rcu_head *rhp;
1065 struct srcu_data *sdp;
1066 struct srcu_struct *sp;
1067
1068 sdp = container_of(work, struct srcu_data, work.work);
1069 sp = sdp->sp;
1070 rcu_cblist_init(&ready_cbs);
1071 spin_lock_irq(&sdp->lock);
1072 smp_mb(); /* Old grace periods before callback invocation! */
1073 rcu_segcblist_advance(&sdp->srcu_cblist,
1074 rcu_seq_current(&sp->srcu_gp_seq));
1075 if (sdp->srcu_cblist_invoking ||
1076 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
1077 spin_unlock_irq(&sdp->lock);
1078 return; /* Someone else on the job or nothing to do. */
1079 }
1080
1081 /* We are on the job! Extract and invoke ready callbacks. */
1082 sdp->srcu_cblist_invoking = true;
1083 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
1084 spin_unlock_irq(&sdp->lock);
1085 rhp = rcu_cblist_dequeue(&ready_cbs);
1086 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
1087 local_bh_disable();
1088 rhp->func(rhp);
1089 local_bh_enable();
1090 }
1091
1092 /*
1093 * Update counts, accelerate new callbacks, and if needed,
1094 * schedule another round of callback invocation.
1095 */
1096 spin_lock_irq(&sdp->lock);
1097 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
1098 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
1099 rcu_seq_snap(&sp->srcu_gp_seq));
1100 sdp->srcu_cblist_invoking = false;
1101 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
1102 spin_unlock_irq(&sdp->lock);
1103 if (more)
1104 srcu_schedule_cbs_sdp(sdp, 0);
1105}
1106
1107/*
1108 * Finished one round of SRCU grace period. Start another if there are
1109 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
1110 */
1111static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
1112{
1113 bool pushgp = true;
1114
1115 spin_lock_irq(&sp->gp_lock);
1116 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
1117 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
1118 /* All requests fulfilled, time to go idle. */
1119 pushgp = false;
1120 }
1121 } else if (!rcu_seq_state(sp->srcu_gp_seq)) {
1122 /* Outstanding request and no GP. Start one. */
1123 srcu_gp_start(sp);
1124 }
1125 spin_unlock_irq(&sp->gp_lock);
1126
1127 if (pushgp)
1128 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
1129}
1130
1131/*
1132 * This is the work-queue function that handles SRCU grace periods.
1133 */
1134void process_srcu(struct work_struct *work)
1135{
1136 struct srcu_struct *sp;
1137
1138 sp = container_of(work, struct srcu_struct, work.work);
1139
1140 srcu_advance_state(sp);
1141 srcu_reschedule(sp, srcu_get_delay(sp));
1142}
1143EXPORT_SYMBOL_GPL(process_srcu);
1144
1145void srcutorture_get_gp_data(enum rcutorture_type test_type,
1146 struct srcu_struct *sp, int *flags,
1147 unsigned long *gpnum, unsigned long *completed)
1148{
1149 if (test_type != SRCU_FLAVOR)
1150 return;
1151 *flags = 0;
1152 *completed = rcu_seq_ctr(sp->srcu_gp_seq);
1153 *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
1154}
1155EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 6ad330dbbae2..e5385731e391 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
79 */ 79 */
80static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 80static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
81{ 81{
82 RCU_TRACE(reset_cpu_stall_ticks(rcp)); 82 RCU_TRACE(reset_cpu_stall_ticks(rcp);)
83 if (rcp->donetail != rcp->curtail) { 83 if (rcp->donetail != rcp->curtail) {
84 rcp->donetail = rcp->curtail; 84 rcp->donetail = rcp->curtail;
85 return 1; 85 return 1;
@@ -125,7 +125,7 @@ void rcu_bh_qs(void)
125 */ 125 */
126void rcu_check_callbacks(int user) 126void rcu_check_callbacks(int user)
127{ 127{
128 RCU_TRACE(check_cpu_stalls()); 128 RCU_TRACE(check_cpu_stalls();)
129 if (user) 129 if (user)
130 rcu_sched_qs(); 130 rcu_sched_qs();
131 else if (!in_softirq()) 131 else if (!in_softirq())
@@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
143 const char *rn = NULL; 143 const char *rn = NULL;
144 struct rcu_head *next, *list; 144 struct rcu_head *next, *list;
145 unsigned long flags; 145 unsigned long flags;
146 RCU_TRACE(int cb_count = 0); 146 RCU_TRACE(int cb_count = 0;)
147 147
148 /* Move the ready-to-invoke callbacks to a local list. */ 148 /* Move the ready-to-invoke callbacks to a local list. */
149 local_irq_save(flags); 149 local_irq_save(flags);
@@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
152 local_irq_restore(flags); 152 local_irq_restore(flags);
153 return; 153 return;
154 } 154 }
155 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); 155 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
156 list = rcp->rcucblist; 156 list = rcp->rcucblist;
157 rcp->rcucblist = *rcp->donetail; 157 rcp->rcucblist = *rcp->donetail;
158 *rcp->donetail = NULL; 158 *rcp->donetail = NULL;
@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 local_irq_restore(flags); 162 local_irq_restore(flags);
163 163
164 /* Invoke the callbacks on the local list. */ 164 /* Invoke the callbacks on the local list. */
165 RCU_TRACE(rn = rcp->name); 165 RCU_TRACE(rn = rcp->name;)
166 while (list) { 166 while (list) {
167 next = list->next; 167 next = list->next;
168 prefetch(next); 168 prefetch(next);
@@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
171 __rcu_reclaim(rn, list); 171 __rcu_reclaim(rn, list);
172 local_bh_enable(); 172 local_bh_enable();
173 list = next; 173 list = next;
174 RCU_TRACE(cb_count++); 174 RCU_TRACE(cb_count++;)
175 } 175 }
176 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 176 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
177 RCU_TRACE(trace_rcu_batch_end(rcp->name, 177 RCU_TRACE(trace_rcu_batch_end(rcp->name,
178 cb_count, 0, need_resched(), 178 cb_count, 0, need_resched(),
179 is_idle_task(current), 179 is_idle_task(current),
@@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head,
221 local_irq_save(flags); 221 local_irq_save(flags);
222 *rcp->curtail = head; 222 *rcp->curtail = head;
223 rcp->curtail = &head->next; 223 rcp->curtail = &head->next;
224 RCU_TRACE(rcp->qlen++); 224 RCU_TRACE(rcp->qlen++;)
225 local_irq_restore(flags); 225 local_irq_restore(flags);
226 226
227 if (unlikely(is_idle_task(current))) { 227 if (unlikely(is_idle_task(current))) {
@@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
254void __init rcu_init(void) 254void __init rcu_init(void)
255{ 255{
256 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 256 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
257 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); 257 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
258 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); 258 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
259 259
260 rcu_early_boot_tests(); 260 rcu_early_boot_tests();
261} 261}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index c64b827ecbca..371034e77f87 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
52 RCU_TRACE(.name = "rcu_bh") 52 RCU_TRACE(.name = "rcu_bh")
53}; 53};
54 54
55#ifdef CONFIG_DEBUG_LOCK_ALLOC 55#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
56#include <linux/kernel_stat.h> 56#include <linux/kernel_stat.h>
57 57
58int rcu_scheduler_active __read_mostly; 58int rcu_scheduler_active __read_mostly;
@@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
65 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. 65 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
66 * The reason for this is that Tiny RCU does not need kthreads, so does 66 * The reason for this is that Tiny RCU does not need kthreads, so does
67 * not have to care about the fact that the scheduler is half-initialized 67 * not have to care about the fact that the scheduler is half-initialized
68 * at a certain phase of the boot process. 68 * at a certain phase of the boot process. Unless SRCU is in the mix.
69 */ 69 */
70void __init rcu_scheduler_starting(void) 70void __init rcu_scheduler_starting(void)
71{ 71{
72 WARN_ON(nr_context_switches() > 0); 72 WARN_ON(nr_context_switches() > 0);
73 rcu_scheduler_active = RCU_SCHEDULER_RUNNING; 73 rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
74 ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
74} 75}
75 76
76#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 77#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
77 78
78#ifdef CONFIG_RCU_TRACE 79#ifdef CONFIG_RCU_TRACE
79 80
@@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162 163
163static void check_cpu_stalls(void) 164static void check_cpu_stalls(void)
164{ 165{
165 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); 166 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
166 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); 167 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
167} 168}
168 169
169#endif /* #ifdef CONFIG_RCU_TRACE */ 170#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 50fee7689e71..e354e475e645 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,7 @@
57#include <linux/random.h> 57#include <linux/random.h>
58#include <linux/trace_events.h> 58#include <linux/trace_events.h>
59#include <linux/suspend.h> 59#include <linux/suspend.h>
60#include <linux/ftrace.h>
60 61
61#include "tree.h" 62#include "tree.h"
62#include "rcu.h" 63#include "rcu.h"
@@ -97,8 +98,8 @@ struct rcu_state sname##_state = { \
97 .gpnum = 0UL - 300UL, \ 98 .gpnum = 0UL - 300UL, \
98 .completed = 0UL - 300UL, \ 99 .completed = 0UL - 300UL, \
99 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 100 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
100 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 101 .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
101 .orphan_donetail = &sname##_state.orphan_donelist, \ 102 .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
102 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
103 .name = RCU_STATE_NAME(sname), \ 104 .name = RCU_STATE_NAME(sname), \
104 .abbr = sabbr, \ 105 .abbr = sabbr, \
@@ -123,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
123module_param(rcu_fanout_leaf, int, 0444); 124module_param(rcu_fanout_leaf, int, 0444);
124int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 125int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
125/* Number of rcu_nodes at specified level. */ 126/* Number of rcu_nodes at specified level. */
126static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 127int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
127int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 128int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
128/* panic() on RCU Stall sysctl. */ 129/* panic() on RCU Stall sysctl. */
129int sysctl_panic_on_rcu_stall __read_mostly; 130int sysctl_panic_on_rcu_stall __read_mostly;
@@ -199,7 +200,7 @@ static const int gp_cleanup_delay;
199 200
200/* 201/*
201 * Number of grace periods between delays, normalized by the duration of 202 * Number of grace periods between delays, normalized by the duration of
202 * the delay. The longer the the delay, the more the grace periods between 203 * the delay. The longer the delay, the more the grace periods between
203 * each delay. The reason for this normalization is that it means that, 204 * each delay. The reason for this normalization is that it means that,
204 * for non-zero delays, the overall slowdown of grace periods is constant 205 * for non-zero delays, the overall slowdown of grace periods is constant
205 * regardless of the duration of the delay. This arrangement balances 206 * regardless of the duration of the delay. This arrangement balances
@@ -272,11 +273,19 @@ void rcu_bh_qs(void)
272 } 273 }
273} 274}
274 275
275static DEFINE_PER_CPU(int, rcu_sched_qs_mask); 276/*
277 * Steal a bit from the bottom of ->dynticks for idle entry/exit
278 * control. Initially this is for TLB flushing.
279 */
280#define RCU_DYNTICK_CTRL_MASK 0x1
281#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
282#ifndef rcu_eqs_special_exit
283#define rcu_eqs_special_exit() do { } while (0)
284#endif
276 285
277static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 286static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
278 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 287 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
279 .dynticks = ATOMIC_INIT(1), 288 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
280#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 289#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
281 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, 290 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
282 .dynticks_idle = ATOMIC_INIT(1), 291 .dynticks_idle = ATOMIC_INIT(1),
@@ -284,21 +293,40 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
284}; 293};
285 294
286/* 295/*
296 * There are a few places, currently just in the tracing infrastructure,
297 * that use rcu_irq_enter() to make sure RCU is watching. But there's
298 * a small location where that will not even work. In those cases
299 * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter()
300 * can be called.
301 */
302static DEFINE_PER_CPU(bool, disable_rcu_irq_enter);
303
304bool rcu_irq_enter_disabled(void)
305{
306 return this_cpu_read(disable_rcu_irq_enter);
307}
308
309/*
287 * Record entry into an extended quiescent state. This is only to be 310 * Record entry into an extended quiescent state. This is only to be
288 * called when not already in an extended quiescent state. 311 * called when not already in an extended quiescent state.
289 */ 312 */
290static void rcu_dynticks_eqs_enter(void) 313static void rcu_dynticks_eqs_enter(void)
291{ 314{
292 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 315 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
293 int special; 316 int seq;
294 317
295 /* 318 /*
296 * CPUs seeing atomic_inc_return() must see prior RCU read-side 319 * CPUs seeing atomic_add_return() must see prior RCU read-side
297 * critical sections, and we also must force ordering with the 320 * critical sections, and we also must force ordering with the
298 * next idle sojourn. 321 * next idle sojourn.
299 */ 322 */
300 special = atomic_inc_return(&rdtp->dynticks); 323 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
301 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); 324 /* Better be in an extended quiescent state! */
325 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
326 (seq & RCU_DYNTICK_CTRL_CTR));
327 /* Better not have special action (TLB flush) pending! */
328 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
329 (seq & RCU_DYNTICK_CTRL_MASK));
302} 330}
303 331
304/* 332/*
@@ -308,15 +336,22 @@ static void rcu_dynticks_eqs_enter(void)
308static void rcu_dynticks_eqs_exit(void) 336static void rcu_dynticks_eqs_exit(void)
309{ 337{
310 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 338 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
311 int special; 339 int seq;
312 340
313 /* 341 /*
314 * CPUs seeing atomic_inc_return() must see prior idle sojourns, 342 * CPUs seeing atomic_add_return() must see prior idle sojourns,
315 * and we also must force ordering with the next RCU read-side 343 * and we also must force ordering with the next RCU read-side
316 * critical section. 344 * critical section.
317 */ 345 */
318 special = atomic_inc_return(&rdtp->dynticks); 346 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
319 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); 347 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
348 !(seq & RCU_DYNTICK_CTRL_CTR));
349 if (seq & RCU_DYNTICK_CTRL_MASK) {
350 atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks);
351 smp_mb__after_atomic(); /* _exit after clearing mask. */
352 /* Prefer duplicate flushes to losing a flush. */
353 rcu_eqs_special_exit();
354 }
320} 355}
321 356
322/* 357/*
@@ -333,9 +368,9 @@ static void rcu_dynticks_eqs_online(void)
333{ 368{
334 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 369 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
335 370
336 if (atomic_read(&rdtp->dynticks) & 0x1) 371 if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR)
337 return; 372 return;
338 atomic_add(0x1, &rdtp->dynticks); 373 atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
339} 374}
340 375
341/* 376/*
@@ -347,7 +382,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
347{ 382{
348 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 383 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
349 384
350 return !(atomic_read(&rdtp->dynticks) & 0x1); 385 return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR);
351} 386}
352 387
353/* 388/*
@@ -358,7 +393,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
358{ 393{
359 int snap = atomic_add_return(0, &rdtp->dynticks); 394 int snap = atomic_add_return(0, &rdtp->dynticks);
360 395
361 return snap; 396 return snap & ~RCU_DYNTICK_CTRL_MASK;
362} 397}
363 398
364/* 399/*
@@ -367,7 +402,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
367 */ 402 */
368static bool rcu_dynticks_in_eqs(int snap) 403static bool rcu_dynticks_in_eqs(int snap)
369{ 404{
370 return !(snap & 0x1); 405 return !(snap & RCU_DYNTICK_CTRL_CTR);
371} 406}
372 407
373/* 408/*
@@ -387,14 +422,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
387static void rcu_dynticks_momentary_idle(void) 422static void rcu_dynticks_momentary_idle(void)
388{ 423{
389 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 424 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
390 int special = atomic_add_return(2, &rdtp->dynticks); 425 int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
426 &rdtp->dynticks);
391 427
392 /* It is illegal to call this from idle state. */ 428 /* It is illegal to call this from idle state. */
393 WARN_ON_ONCE(!(special & 0x1)); 429 WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
394} 430}
395 431
396DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); 432/*
397EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); 433 * Set the special (bottom) bit of the specified CPU so that it
434 * will take special action (such as flushing its TLB) on the
435 * next exit from an extended quiescent state. Returns true if
436 * the bit was successfully set, or false if the CPU was not in
437 * an extended quiescent state.
438 */
439bool rcu_eqs_special_set(int cpu)
440{
441 int old;
442 int new;
443 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
444
445 do {
446 old = atomic_read(&rdtp->dynticks);
447 if (old & RCU_DYNTICK_CTRL_CTR)
448 return false;
449 new = old | RCU_DYNTICK_CTRL_MASK;
450 } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old);
451 return true;
452}
398 453
399/* 454/*
400 * Let the RCU core know that this CPU has gone through the scheduler, 455 * Let the RCU core know that this CPU has gone through the scheduler,
@@ -403,44 +458,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
403 * memory barriers to let the RCU core know about it, regardless of what 458 * memory barriers to let the RCU core know about it, regardless of what
404 * this CPU might (or might not) do in the near future. 459 * this CPU might (or might not) do in the near future.
405 * 460 *
406 * We inform the RCU core by emulating a zero-duration dyntick-idle 461 * We inform the RCU core by emulating a zero-duration dyntick-idle period.
407 * period, which we in turn do by incrementing the ->dynticks counter
408 * by two.
409 * 462 *
410 * The caller must have disabled interrupts. 463 * The caller must have disabled interrupts.
411 */ 464 */
412static void rcu_momentary_dyntick_idle(void) 465static void rcu_momentary_dyntick_idle(void)
413{ 466{
414 struct rcu_data *rdp; 467 raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false);
415 int resched_mask; 468 rcu_dynticks_momentary_idle();
416 struct rcu_state *rsp;
417
418 /*
419 * Yes, we can lose flag-setting operations. This is OK, because
420 * the flag will be set again after some delay.
421 */
422 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
423 raw_cpu_write(rcu_sched_qs_mask, 0);
424
425 /* Find the flavor that needs a quiescent state. */
426 for_each_rcu_flavor(rsp) {
427 rdp = raw_cpu_ptr(rsp->rda);
428 if (!(resched_mask & rsp->flavor_mask))
429 continue;
430 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
431 if (READ_ONCE(rdp->mynode->completed) !=
432 READ_ONCE(rdp->cond_resched_completed))
433 continue;
434
435 /*
436 * Pretend to be momentarily idle for the quiescent state.
437 * This allows the grace-period kthread to record the
438 * quiescent state, with no need for this CPU to do anything
439 * further.
440 */
441 rcu_dynticks_momentary_idle();
442 break;
443 }
444} 469}
445 470
446/* 471/*
@@ -448,14 +473,22 @@ static void rcu_momentary_dyntick_idle(void)
448 * and requires special handling for preemptible RCU. 473 * and requires special handling for preemptible RCU.
449 * The caller must have disabled interrupts. 474 * The caller must have disabled interrupts.
450 */ 475 */
451void rcu_note_context_switch(void) 476void rcu_note_context_switch(bool preempt)
452{ 477{
453 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 478 barrier(); /* Avoid RCU read-side critical sections leaking down. */
454 trace_rcu_utilization(TPS("Start context switch")); 479 trace_rcu_utilization(TPS("Start context switch"));
455 rcu_sched_qs(); 480 rcu_sched_qs();
456 rcu_preempt_note_context_switch(); 481 rcu_preempt_note_context_switch();
457 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 482 /* Load rcu_urgent_qs before other flags. */
483 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
484 goto out;
485 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
486 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
458 rcu_momentary_dyntick_idle(); 487 rcu_momentary_dyntick_idle();
488 this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
489 if (!preempt)
490 rcu_note_voluntary_context_switch_lite(current);
491out:
459 trace_rcu_utilization(TPS("End context switch")); 492 trace_rcu_utilization(TPS("End context switch"));
460 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 493 barrier(); /* Avoid RCU read-side critical sections leaking up. */
461} 494}
@@ -478,29 +511,26 @@ void rcu_all_qs(void)
478{ 511{
479 unsigned long flags; 512 unsigned long flags;
480 513
514 if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs))
515 return;
516 preempt_disable();
517 /* Load rcu_urgent_qs before other flags. */
518 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) {
519 preempt_enable();
520 return;
521 }
522 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
481 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 523 barrier(); /* Avoid RCU read-side critical sections leaking down. */
482 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { 524 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) {
483 local_irq_save(flags); 525 local_irq_save(flags);
484 rcu_momentary_dyntick_idle(); 526 rcu_momentary_dyntick_idle();
485 local_irq_restore(flags); 527 local_irq_restore(flags);
486 } 528 }
487 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { 529 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)))
488 /*
489 * Yes, we just checked a per-CPU variable with preemption
490 * enabled, so we might be migrated to some other CPU at
491 * this point. That is OK because in that case, the
492 * migration will supply the needed quiescent state.
493 * We might end up needlessly disabling preemption and
494 * invoking rcu_sched_qs() on the destination CPU, but
495 * the probability and cost are both quite low, so this
496 * should not be a problem in practice.
497 */
498 preempt_disable();
499 rcu_sched_qs(); 530 rcu_sched_qs();
500 preempt_enable(); 531 this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
501 }
502 this_cpu_inc(rcu_qs_ctr);
503 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 532 barrier(); /* Avoid RCU read-side critical sections leaking up. */
533 preempt_enable();
504} 534}
505EXPORT_SYMBOL_GPL(rcu_all_qs); 535EXPORT_SYMBOL_GPL(rcu_all_qs);
506 536
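
The rewritten fast paths in rcu_note_context_switch() and rcu_all_qs() above hinge on one ordering rule: rcu_need_heavy_qs is written first and rcu_urgent_qs is then published with a release store, while readers load rcu_urgent_qs with acquire semantics before consulting anything else. Below is a minimal userspace sketch of that publish/consume pairing, using C11 atomics instead of the kernel's smp_store_release()/smp_load_acquire() and with hypothetical names; it is an illustration of the idea, not the kernel code.

	#include <stdatomic.h>
	#include <stdbool.h>

	struct qs_flags {
		_Atomic bool need_heavy_qs;	/* payload flag, relaxed accesses */
		_Atomic bool urgent_qs;		/* publication flag */
	};

	/* Requester side: ask a CPU for a heavy quiescent state. */
	static void request_heavy_qs(struct qs_flags *f)
	{
		atomic_store_explicit(&f->need_heavy_qs, true, memory_order_relaxed);
		/* Release: the need_heavy_qs store above is ordered before this. */
		atomic_store_explicit(&f->urgent_qs, true, memory_order_release);
	}

	/* Context-switch side: cheap check first, then consume the flags. */
	static bool note_context_switch(struct qs_flags *f)
	{
		/* Acquire pairs with the release store in request_heavy_qs(). */
		if (!atomic_load_explicit(&f->urgent_qs, memory_order_acquire))
			return false;		/* fast path: nothing urgent */
		atomic_store_explicit(&f->urgent_qs, false, memory_order_relaxed);
		if (atomic_load_explicit(&f->need_heavy_qs, memory_order_relaxed)) {
			atomic_store_explicit(&f->need_heavy_qs, false,
					      memory_order_relaxed);
			/* the kernel would do rcu_momentary_dyntick_idle() here */
		}
		return true;
	}
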
@@ -689,15 +719,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
689 default: 719 default:
690 break; 720 break;
691 } 721 }
692 if (rsp != NULL) { 722 if (rsp == NULL)
693 *flags = READ_ONCE(rsp->gp_flags);
694 *gpnum = READ_ONCE(rsp->gpnum);
695 *completed = READ_ONCE(rsp->completed);
696 return; 723 return;
697 } 724 *flags = READ_ONCE(rsp->gp_flags);
698 *flags = 0; 725 *gpnum = READ_ONCE(rsp->gpnum);
699 *gpnum = 0; 726 *completed = READ_ONCE(rsp->completed);
700 *completed = 0;
701} 727}
702EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); 728EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
703 729
@@ -713,16 +739,6 @@ void rcutorture_record_progress(unsigned long vernum)
713EXPORT_SYMBOL_GPL(rcutorture_record_progress); 739EXPORT_SYMBOL_GPL(rcutorture_record_progress);
714 740
715/* 741/*
716 * Does the CPU have callbacks ready to be invoked?
717 */
718static int
719cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
720{
721 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
722 rdp->nxttail[RCU_NEXT_TAIL] != NULL;
723}
724
725/*
726 * Return the root node of the specified rcu_state structure. 742 * Return the root node of the specified rcu_state structure.
727 */ 743 */
728static struct rcu_node *rcu_get_root(struct rcu_state *rsp) 744static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
@@ -752,44 +768,39 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
752static bool 768static bool
753cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 769cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
754{ 770{
755 int i;
756
757 if (rcu_gp_in_progress(rsp)) 771 if (rcu_gp_in_progress(rsp))
758 return false; /* No, a grace period is already in progress. */ 772 return false; /* No, a grace period is already in progress. */
759 if (rcu_future_needs_gp(rsp)) 773 if (rcu_future_needs_gp(rsp))
760 return true; /* Yes, a no-CBs CPU needs one. */ 774 return true; /* Yes, a no-CBs CPU needs one. */
761 if (!rdp->nxttail[RCU_NEXT_TAIL]) 775 if (!rcu_segcblist_is_enabled(&rdp->cblist))
762 return false; /* No, this is a no-CBs (or offline) CPU. */ 776 return false; /* No, this is a no-CBs (or offline) CPU. */
763 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 777 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
764 return true; /* Yes, CPU has newly registered callbacks. */ 778 return true; /* Yes, CPU has newly registered callbacks. */
765 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) 779 if (rcu_segcblist_future_gp_needed(&rdp->cblist,
766 if (rdp->nxttail[i - 1] != rdp->nxttail[i] && 780 READ_ONCE(rsp->completed)))
767 ULONG_CMP_LT(READ_ONCE(rsp->completed), 781 return true; /* Yes, CBs for future grace period. */
768 rdp->nxtcompleted[i]))
769 return true; /* Yes, CBs for future grace period. */
770 return false; /* No grace period needed. */ 782 return false; /* No grace period needed. */
771} 783}
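
cpu_needs_another_gp() now reads its answers off the segmented callback list type pulled in by tree.h below (rcu_segcblist.h). The following is a heavily simplified model of that structure's conventions, with made-up names: a single linked list carved into four segments by tail pointers, a NULL final tail meaning the list is disabled, and "restempty" meaning nothing follows a given segment's tail.

	#include <stdbool.h>
	#include <stddef.h>

	enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, SEG_COUNT };

	struct cb { struct cb *next; };

	/* Toy segmented callback list: tails[i] points at the ->next field of
	 * the last callback in segment i (or at an earlier tail when empty). */
	struct seg_cblist {
		struct cb *head;
		struct cb **tails[SEG_COUNT];
		unsigned long gp_seq[SEG_COUNT]; /* grace period each segment waits on */
	};

	/* Offloaded/offline CPUs disable their list by nulling the last tail. */
	static bool seg_is_enabled(const struct seg_cblist *l)
	{
		return l->tails[SEG_NEXT] != NULL;
	}

	/* Are all segments *after* @seg empty, i.e. does nothing follow its tail? */
	static bool seg_restempty(const struct seg_cblist *l, int seg)
	{
		return *l->tails[seg] == NULL;
	}
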
772 784
773/* 785/*
774 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state 786 * rcu_eqs_enter_common - current CPU is entering an extended quiescent state
775 * 787 *
776 * If the new value of the ->dynticks_nesting counter now is zero, 788 * Enter idle, doing appropriate accounting. The caller must have
777 * we really have entered idle, and must do the appropriate accounting. 789 * disabled interrupts.
778 * The caller must have disabled interrupts.
779 */ 790 */
780static void rcu_eqs_enter_common(long long oldval, bool user) 791static void rcu_eqs_enter_common(bool user)
781{ 792{
782 struct rcu_state *rsp; 793 struct rcu_state *rsp;
783 struct rcu_data *rdp; 794 struct rcu_data *rdp;
784 RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) 795 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
785 796
786 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 797 trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
787 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 798 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
788 !user && !is_idle_task(current)) { 799 !user && !is_idle_task(current)) {
789 struct task_struct *idle __maybe_unused = 800 struct task_struct *idle __maybe_unused =
790 idle_task(smp_processor_id()); 801 idle_task(smp_processor_id());
791 802
792 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); 803 trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0);
793 rcu_ftrace_dump(DUMP_ORIG); 804 rcu_ftrace_dump(DUMP_ORIG);
794 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 805 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
795 current->pid, current->comm, 806 current->pid, current->comm,
@@ -800,7 +811,10 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
800 do_nocb_deferred_wakeup(rdp); 811 do_nocb_deferred_wakeup(rdp);
801 } 812 }
802 rcu_prepare_for_idle(); 813 rcu_prepare_for_idle();
803 rcu_dynticks_eqs_enter(); 814 __this_cpu_inc(disable_rcu_irq_enter);
815 rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */
816 rcu_dynticks_eqs_enter(); /* After this, tracing works again. */
817 __this_cpu_dec(disable_rcu_irq_enter);
804 rcu_dynticks_task_enter(); 818 rcu_dynticks_task_enter();
805 819
806 /* 820 /*
@@ -821,19 +835,15 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
821 */ 835 */
822static void rcu_eqs_enter(bool user) 836static void rcu_eqs_enter(bool user)
823{ 837{
824 long long oldval;
825 struct rcu_dynticks *rdtp; 838 struct rcu_dynticks *rdtp;
826 839
827 rdtp = this_cpu_ptr(&rcu_dynticks); 840 rdtp = this_cpu_ptr(&rcu_dynticks);
828 oldval = rdtp->dynticks_nesting;
829 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 841 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
830 (oldval & DYNTICK_TASK_NEST_MASK) == 0); 842 (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
831 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { 843 if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
832 rdtp->dynticks_nesting = 0; 844 rcu_eqs_enter_common(user);
833 rcu_eqs_enter_common(oldval, user); 845 else
834 } else {
835 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 846 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
836 }
837} 847}
838 848
839/** 849/**
@@ -892,19 +902,18 @@ void rcu_user_enter(void)
892 */ 902 */
893void rcu_irq_exit(void) 903void rcu_irq_exit(void)
894{ 904{
895 long long oldval;
896 struct rcu_dynticks *rdtp; 905 struct rcu_dynticks *rdtp;
897 906
898 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); 907 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
899 rdtp = this_cpu_ptr(&rcu_dynticks); 908 rdtp = this_cpu_ptr(&rcu_dynticks);
900 oldval = rdtp->dynticks_nesting;
901 rdtp->dynticks_nesting--;
902 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 909 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
903 rdtp->dynticks_nesting < 0); 910 rdtp->dynticks_nesting < 1);
904 if (rdtp->dynticks_nesting) 911 if (rdtp->dynticks_nesting <= 1) {
905 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); 912 rcu_eqs_enter_common(true);
906 else 913 } else {
907 rcu_eqs_enter_common(oldval, true); 914 trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
915 rdtp->dynticks_nesting--;
916 }
908 rcu_sysidle_enter(1); 917 rcu_sysidle_enter(1);
909} 918}
910 919
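
The reworked rcu_irq_exit() drops the separate oldval snapshot: when the nesting count is about to reach zero it calls rcu_eqs_enter_common() (which itself zeroes ->dynticks_nesting), otherwise it only decrements. A toy userspace model of that rule, with invented names and none of the real tracing or dyntick bookkeeping:

	#include <stdbool.h>
	#include <stdio.h>

	static long nesting = 1;	/* process context counts as one level */
	static bool in_eqs;

	static void toy_irq_enter(void)
	{
		if (in_eqs) {		/* first irq while idle: leave EQS */
			in_eqs = false;
			nesting = 1;
		} else {
			nesting++;
		}
	}

	static void toy_irq_exit(void)
	{
		if (nesting <= 1) {	/* outermost exit: really go idle */
			nesting = 0;
			in_eqs = true;
		} else {
			nesting--;
		}
	}

	int main(void)
	{
		toy_irq_enter();	/* nested: nesting == 2 */
		toy_irq_exit();		/* back to 1, still not idle */
		printf("nesting=%ld in_eqs=%d\n", nesting, in_eqs);
		toy_irq_exit();		/* outermost exit: enter EQS */
		printf("nesting=%ld in_eqs=%d\n", nesting, in_eqs);
		return 0;
	}
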
@@ -1150,6 +1159,24 @@ bool notrace rcu_is_watching(void)
1150} 1159}
1151EXPORT_SYMBOL_GPL(rcu_is_watching); 1160EXPORT_SYMBOL_GPL(rcu_is_watching);
1152 1161
1162/*
1163 * If a holdout task is actually running, request an urgent quiescent
1164 * state from its CPU. This is unsynchronized, so migrations can cause
1165 * the request to go to the wrong CPU. Which is OK, all that will happen
1166 * is that the CPU's next context switch will be a bit slower and next
1167 * time around this task will generate another request.
1168 */
1169void rcu_request_urgent_qs_task(struct task_struct *t)
1170{
1171 int cpu;
1172
1173 barrier();
1174 cpu = task_cpu(t);
1175 if (!task_curr(t))
1176 return; /* This task is not running on that CPU. */
1177 smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true);
1178}
1179
1153#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 1180#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
1154 1181
1155/* 1182/*
@@ -1235,7 +1262,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1235 bool *isidle, unsigned long *maxj) 1262 bool *isidle, unsigned long *maxj)
1236{ 1263{
1237 unsigned long jtsq; 1264 unsigned long jtsq;
1238 int *rcrmp; 1265 bool *rnhqp;
1266 bool *ruqp;
1239 unsigned long rjtsc; 1267 unsigned long rjtsc;
1240 struct rcu_node *rnp; 1268 struct rcu_node *rnp;
1241 1269
@@ -1271,11 +1299,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1271 * might not be the case for nohz_full CPUs looping in the kernel. 1299 * might not be the case for nohz_full CPUs looping in the kernel.
1272 */ 1300 */
1273 rnp = rdp->mynode; 1301 rnp = rdp->mynode;
1302 ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
1274 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && 1303 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
1275 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && 1304 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
1276 READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { 1305 READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
1277 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); 1306 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
1278 return 1; 1307 return 1;
1308 } else {
1309 /* Load rcu_qs_ctr before store to rcu_urgent_qs. */
1310 smp_store_release(ruqp, true);
1279 } 1311 }
1280 1312
1281 /* Check for the CPU being offline. */ 1313 /* Check for the CPU being offline. */
@@ -1292,7 +1324,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1292 * in-kernel CPU-bound tasks cannot advance grace periods. 1324 * in-kernel CPU-bound tasks cannot advance grace periods.
1293 * So if the grace period is old enough, make the CPU pay attention. 1325 * So if the grace period is old enough, make the CPU pay attention.
1294 * Note that the unsynchronized assignments to the per-CPU 1326 * Note that the unsynchronized assignments to the per-CPU
1295 * rcu_sched_qs_mask variable are safe. Yes, setting of 1327 * rcu_need_heavy_qs variable are safe. Yes, setting of
1296 * bits can be lost, but they will be set again on the next 1328 * bits can be lost, but they will be set again on the next
1297 * force-quiescent-state pass. So lost bit sets do not result 1329 * force-quiescent-state pass. So lost bit sets do not result
1298 * in incorrect behavior, merely in a grace period lasting 1330 * in incorrect behavior, merely in a grace period lasting
@@ -1306,16 +1338,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1306 * is set too high, we override with half of the RCU CPU stall 1338 * is set too high, we override with half of the RCU CPU stall
1307 * warning delay. 1339 * warning delay.
1308 */ 1340 */
1309 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); 1341 rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
1310 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || 1342 if (!READ_ONCE(*rnhqp) &&
1311 time_after(jiffies, rdp->rsp->jiffies_resched)) { 1343 (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
1312 if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { 1344 time_after(jiffies, rdp->rsp->jiffies_resched))) {
1313 WRITE_ONCE(rdp->cond_resched_completed, 1345 WRITE_ONCE(*rnhqp, true);
1314 READ_ONCE(rdp->mynode->completed)); 1346 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
1315 smp_mb(); /* ->cond_resched_completed before *rcrmp. */ 1347 smp_store_release(ruqp, true);
1316 WRITE_ONCE(*rcrmp,
1317 READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
1318 }
1319 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ 1348 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
1320 } 1349 }
1321 1350
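
Both "is the grace period old enough?" checks above rely on time_after(), which stays correct across jiffies wraparound because it subtracts first and only then does a signed comparison. A standalone illustration of the idiom follows; the kernel's macro lives in include/linux/jiffies.h, and this copy exists only to show the arithmetic.

	#include <stdio.h>

	/* time_after(a, b): true when a is later than b, wraparound-safe. */
	#define sketch_time_after(a, b)  ((long)((b) - (a)) < 0)

	int main(void)
	{
		unsigned long gp_start = (unsigned long)-10; /* 10 ticks before wrap */
		unsigned long jtsq = 20;
		unsigned long now = 15;			/* 25 ticks later, post-wrap */

		/* gp_start + jtsq wraps to 10; 25 ticks > 20, so this prints 1. */
		printf("%d\n", sketch_time_after(now, gp_start + jtsq));
		return 0;
	}
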
@@ -1475,7 +1504,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1475 1504
1476 print_cpu_stall_info_end(); 1505 print_cpu_stall_info_end();
1477 for_each_possible_cpu(cpu) 1506 for_each_possible_cpu(cpu)
1478 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1507 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
1508 cpu)->cblist);
1479 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1509 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
1480 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1510 smp_processor_id(), (long)(jiffies - rsp->gp_start),
1481 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1511 (long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1529,7 +1559,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
1529 print_cpu_stall_info(rsp, smp_processor_id()); 1559 print_cpu_stall_info(rsp, smp_processor_id());
1530 print_cpu_stall_info_end(); 1560 print_cpu_stall_info_end();
1531 for_each_possible_cpu(cpu) 1561 for_each_possible_cpu(cpu)
1532 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1562 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
1563 cpu)->cblist);
1533 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1564 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1534 jiffies - rsp->gp_start, 1565 jiffies - rsp->gp_start,
1535 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1566 (long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1632,30 +1663,6 @@ void rcu_cpu_stall_reset(void)
1632} 1663}
1633 1664
1634/* 1665/*
1635 * Initialize the specified rcu_data structure's default callback list
1636 * to empty. The default callback list is the one that is not used by
1637 * no-callbacks CPUs.
1638 */
1639static void init_default_callback_list(struct rcu_data *rdp)
1640{
1641 int i;
1642
1643 rdp->nxtlist = NULL;
1644 for (i = 0; i < RCU_NEXT_SIZE; i++)
1645 rdp->nxttail[i] = &rdp->nxtlist;
1646}
1647
1648/*
1649 * Initialize the specified rcu_data structure's callback list to empty.
1650 */
1651static void init_callback_list(struct rcu_data *rdp)
1652{
1653 if (init_nocb_callback_list(rdp))
1654 return;
1655 init_default_callback_list(rdp);
1656}
1657
1658/*
1659 * Determine the value that ->completed will have at the end of the 1666 * Determine the value that ->completed will have at the end of the
1660 * next subsequent grace period. This is used to tag callbacks so that 1667 * next subsequent grace period. This is used to tag callbacks so that
1661 * a CPU can invoke callbacks in a timely fashion even if that CPU has 1668 * a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1709,7 +1716,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1709 unsigned long *c_out) 1716 unsigned long *c_out)
1710{ 1717{
1711 unsigned long c; 1718 unsigned long c;
1712 int i;
1713 bool ret = false; 1719 bool ret = false;
1714 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1720 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1715 1721
@@ -1755,13 +1761,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1755 /* 1761 /*
1756 * Get a new grace-period number. If there really is no grace 1762 * Get a new grace-period number. If there really is no grace
1757 * period in progress, it will be smaller than the one we obtained 1763 * period in progress, it will be smaller than the one we obtained
1758 * earlier. Adjust callbacks as needed. Note that even no-CBs 1764 * earlier. Adjust callbacks as needed.
1759 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1760 */ 1765 */
1761 c = rcu_cbs_completed(rdp->rsp, rnp_root); 1766 c = rcu_cbs_completed(rdp->rsp, rnp_root);
1762 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) 1767 if (!rcu_is_nocb_cpu(rdp->cpu))
1763 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) 1768 (void)rcu_segcblist_accelerate(&rdp->cblist, c);
1764 rdp->nxtcompleted[i] = c;
1765 1769
1766 /* 1770 /*
1767 * If the needed for the required grace period is already 1771 * If the needed for the required grace period is already
@@ -1793,9 +1797,7 @@ out:
1793 1797
1794/* 1798/*
1795 * Clean up any old requests for the just-ended grace period. Also return 1799 * Clean up any old requests for the just-ended grace period. Also return
1796 * whether any additional grace periods have been requested. Also invoke 1800 * whether any additional grace periods have been requested.
1797 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1798 * waiting for this grace period to complete.
1799 */ 1801 */
1800static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 1802static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1801{ 1803{
@@ -1841,57 +1843,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1841static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1843static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1842 struct rcu_data *rdp) 1844 struct rcu_data *rdp)
1843{ 1845{
1844 unsigned long c; 1846 bool ret = false;
1845 int i;
1846 bool ret;
1847
1848 /* If the CPU has no callbacks, nothing to do. */
1849 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1850 return false;
1851
1852 /*
1853 * Starting from the sublist containing the callbacks most
1854 * recently assigned a ->completed number and working down, find the
1855 * first sublist that is not assignable to an upcoming grace period.
1856 * Such a sublist has something in it (first two tests) and has
1857 * a ->completed number assigned that will complete sooner than
1858 * the ->completed number for newly arrived callbacks (last test).
1859 *
1860 * The key point is that any later sublist can be assigned the
1861 * same ->completed number as the newly arrived callbacks, which
1862 * means that the callbacks in any of these later sublist can be
1863 * grouped into a single sublist, whether or not they have already
1864 * been assigned a ->completed number.
1865 */
1866 c = rcu_cbs_completed(rsp, rnp);
1867 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1868 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1869 !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1870 break;
1871 1847
1872 /* 1848 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1873 * If there are no sublist for unassigned callbacks, leave. 1849 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1874 * At the same time, advance "i" one sublist, so that "i" will
1875 * index into the sublist where all the remaining callbacks should
1876 * be grouped into.
1877 */
1878 if (++i >= RCU_NEXT_TAIL)
1879 return false; 1850 return false;
1880 1851
1881 /* 1852 /*
1882 * Assign all subsequent callbacks' ->completed number to the next 1853 * Callbacks are often registered with incomplete grace-period
1883 * full grace period and group them all in the sublist initially 1854 * information. Something about the fact that getting exact
1884 * indexed by "i". 1855 * information requires acquiring a global lock... RCU therefore
1856 * makes a conservative estimate of the grace period number at which
1857 * a given callback will become ready to invoke. The following
1858 * code checks this estimate and improves it when possible, thus
1859 * accelerating callback invocation to an earlier grace-period
1860 * number.
1885 */ 1861 */
1886 for (; i <= RCU_NEXT_TAIL; i++) { 1862 if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
1887 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1863 ret = rcu_start_future_gp(rnp, rdp, NULL);
1888 rdp->nxtcompleted[i] = c;
1889 }
1890 /* Record any needed additional grace periods. */
1891 ret = rcu_start_future_gp(rnp, rdp, NULL);
1892 1864
1893 /* Trace depending on how much we were able to accelerate. */ 1865 /* Trace depending on how much we were able to accelerate. */
1894 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1866 if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
1895 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1867 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1896 else 1868 else
1897 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1869 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
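
rcu_segcblist_accelerate() replaces the open-coded sublist walk above: callbacks not yet tagged with a grace-period number receive the conservative estimate from rcu_cbs_completed(), and the return value tells the caller whether a future grace period must now be requested. A toy flat-array model of that idea (the real helper slides segment tail pointers rather than walking an array):

	#include <stdbool.h>

	struct toy_cb { unsigned long completes_at; }; /* 0 == not yet assigned */

	/* Tag every unassigned callback with the conservative estimate
	 * @gp_next; report whether anything was newly tagged. */
	static bool toy_accelerate(struct toy_cb *cbs, int n, unsigned long gp_next)
	{
		bool accelerated = false;
		int i;

		for (i = 0; i < n; i++) {
			if (cbs[i].completes_at == 0) {
				cbs[i].completes_at = gp_next;
				accelerated = true;
			}
		}
		return accelerated;
	}
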
@@ -1911,32 +1883,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1911static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1883static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1912 struct rcu_data *rdp) 1884 struct rcu_data *rdp)
1913{ 1885{
1914 int i, j; 1886 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1915 1887 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1916 /* If the CPU has no callbacks, nothing to do. */
1917 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1918 return false; 1888 return false;
1919 1889
1920 /* 1890 /*
1921 * Find all callbacks whose ->completed numbers indicate that they 1891 * Find all callbacks whose ->completed numbers indicate that they
1922 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. 1892 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1923 */ 1893 */
1924 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { 1894 rcu_segcblist_advance(&rdp->cblist, rnp->completed);
1925 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1926 break;
1927 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1928 }
1929 /* Clean up any sublist tail pointers that were misordered above. */
1930 for (j = RCU_WAIT_TAIL; j < i; j++)
1931 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1932
1933 /* Copy down callbacks to fill in empty sublists. */
1934 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1935 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1936 break;
1937 rdp->nxttail[j] = rdp->nxttail[i];
1938 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1939 }
1940 1895
1941 /* Classify any remaining callbacks. */ 1896 /* Classify any remaining callbacks. */
1942 return rcu_accelerate_cbs(rsp, rnp, rdp); 1897 return rcu_accelerate_cbs(rsp, rnp, rdp);
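
rcu_segcblist_advance() is the matching operation on the other side of a grace period: every callback whose assigned number is now covered by rnp->completed becomes ready to invoke, and whatever remains is re-classified by rcu_accelerate_cbs(). Continuing the toy flat-array model (repeated here so the sketch stands alone; wraparound-safe comparisons are omitted):

	#include <stddef.h>

	struct toy_cb { unsigned long completes_at; }; /* 0 == not yet assigned */

	/* Count how many callbacks the completed grace period @completed makes
	 * invocable; the real helper instead moves them to the DONE segment. */
	static size_t toy_advance(const struct toy_cb *cbs, size_t n,
				  unsigned long completed)
	{
		size_t ready = 0, i;

		for (i = 0; i < n; i++)
			if (cbs[i].completes_at && cbs[i].completes_at <= completed)
				ready++;
		return ready;
	}
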
@@ -1981,7 +1936,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1981 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1936 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1982 need_gp = !!(rnp->qsmask & rdp->grpmask); 1937 need_gp = !!(rnp->qsmask & rdp->grpmask);
1983 rdp->cpu_no_qs.b.norm = need_gp; 1938 rdp->cpu_no_qs.b.norm = need_gp;
1984 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1939 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
1985 rdp->core_needs_qs = need_gp; 1940 rdp->core_needs_qs = need_gp;
1986 zero_cpu_stall_ticks(rdp); 1941 zero_cpu_stall_ticks(rdp);
1987 WRITE_ONCE(rdp->gpwrap, false); 1942 WRITE_ONCE(rdp->gpwrap, false);
@@ -2579,7 +2534,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2579 * within the current grace period. 2534 * within the current grace period.
2580 */ 2535 */
2581 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ 2536 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
2582 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2537 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
2583 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2538 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2584 return; 2539 return;
2585 } 2540 }
@@ -2653,13 +2608,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2653 * because _rcu_barrier() excludes CPU-hotplug operations, so it 2608 * because _rcu_barrier() excludes CPU-hotplug operations, so it
2654 * cannot be running now. Thus no memory barrier is required. 2609 * cannot be running now. Thus no memory barrier is required.
2655 */ 2610 */
2656 if (rdp->nxtlist != NULL) { 2611 rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
2657 rsp->qlen_lazy += rdp->qlen_lazy; 2612 rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
2658 rsp->qlen += rdp->qlen;
2659 rdp->n_cbs_orphaned += rdp->qlen;
2660 rdp->qlen_lazy = 0;
2661 WRITE_ONCE(rdp->qlen, 0);
2662 }
2663 2613
2664 /* 2614 /*
2665 * Next, move those callbacks still needing a grace period to 2615 * Next, move those callbacks still needing a grace period to
@@ -2667,31 +2617,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2667 * Some of the callbacks might have gone partway through a grace 2617 * Some of the callbacks might have gone partway through a grace
2668 * period, but that is too bad. They get to start over because we 2618 * period, but that is too bad. They get to start over because we
2669 * cannot assume that grace periods are synchronized across CPUs. 2619 * cannot assume that grace periods are synchronized across CPUs.
2670 * We don't bother updating the ->nxttail[] array yet, instead
2671 * we just reset the whole thing later on.
2672 */ 2620 */
2673 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { 2621 rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
2674 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
2675 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
2676 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
2677 }
2678 2622
2679 /* 2623 /*
2680 * Then move the ready-to-invoke callbacks to the orphanage, 2624 * Then move the ready-to-invoke callbacks to the orphanage,
2681 * where some other CPU will pick them up. These will not be 2625 * where some other CPU will pick them up. These will not be
 2682 * required to pass through another grace period: They are done. 2626 * required to pass through another grace period: They are done.
 2682 * required to pass through another grace period: They are done. 2626 * required to pass through another grace period: They are done.
2683 */ 2627 */
2684 if (rdp->nxtlist != NULL) { 2628 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
2685 *rsp->orphan_donetail = rdp->nxtlist;
2686 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
2687 }
2688 2629
2689 /* 2630 /* Finally, disallow further callbacks on this CPU. */
2690 * Finally, initialize the rcu_data structure's list to empty and 2631 rcu_segcblist_disable(&rdp->cblist);
2691 * disallow further callbacks on this CPU.
2692 */
2693 init_callback_list(rdp);
2694 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2695} 2632}
2696 2633
2697/* 2634/*
@@ -2700,7 +2637,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2700 */ 2637 */
2701static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2638static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2702{ 2639{
2703 int i;
2704 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2640 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2705 2641
2706 /* No-CBs CPUs are handled specially. */ 2642 /* No-CBs CPUs are handled specially. */
@@ -2709,13 +2645,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2709 return; 2645 return;
2710 2646
2711 /* Do the accounting first. */ 2647 /* Do the accounting first. */
2712 rdp->qlen_lazy += rsp->qlen_lazy; 2648 rdp->n_cbs_adopted += rsp->orphan_done.len;
2713 rdp->qlen += rsp->qlen; 2649 if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
2714 rdp->n_cbs_adopted += rsp->qlen;
2715 if (rsp->qlen_lazy != rsp->qlen)
2716 rcu_idle_count_callbacks_posted(); 2650 rcu_idle_count_callbacks_posted();
2717 rsp->qlen_lazy = 0; 2651 rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
2718 rsp->qlen = 0;
2719 2652
2720 /* 2653 /*
2721 * We do not need a memory barrier here because the only way we 2654 * We do not need a memory barrier here because the only way we
@@ -2723,24 +2656,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2723 * we are the task doing the rcu_barrier(). 2656 * we are the task doing the rcu_barrier().
2724 */ 2657 */
2725 2658
2726 /* First adopt the ready-to-invoke callbacks. */ 2659 /* First adopt the ready-to-invoke callbacks, then the done ones. */
2727 if (rsp->orphan_donelist != NULL) { 2660 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
2728 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; 2661 WARN_ON_ONCE(rsp->orphan_done.head);
2729 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; 2662 rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
2730 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) 2663 WARN_ON_ONCE(rsp->orphan_pend.head);
2731 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2664 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
2732 rdp->nxttail[i] = rsp->orphan_donetail; 2665 !rcu_segcblist_n_cbs(&rdp->cblist));
2733 rsp->orphan_donelist = NULL;
2734 rsp->orphan_donetail = &rsp->orphan_donelist;
2735 }
2736
2737 /* And then adopt the callbacks that still need a grace period. */
2738 if (rsp->orphan_nxtlist != NULL) {
2739 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
2740 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
2741 rsp->orphan_nxtlist = NULL;
2742 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2743 }
2744} 2666}
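
The orphanage path now moves whole callback-list pieces instead of splicing ->nxttail[] by hand, but the bookkeeping invariant is unchanged: the counts travel with the callbacks, pulled out of the dying CPU's list into rsp->orphan_done and later folded into the adopter, so nothing is ever dropped from the totals. A toy sketch of that count hand-off, with hypothetical names:

	/* Toy count hand-off: the dying CPU's totals migrate to the global
	 * orphan structure first and are folded into the adopting CPU later,
	 * so the sum over all lists is preserved at every step. */
	struct toy_count { long len; long len_lazy; };

	static void toy_extract_count(struct toy_count *donor, struct toy_count *orphan)
	{
		orphan->len += donor->len;
		orphan->len_lazy += donor->len_lazy;
		donor->len = 0;
		donor->len_lazy = 0;
	}

	static void toy_insert_count(struct toy_count *adopter, struct toy_count *orphan)
	{
		adopter->len += orphan->len;
		adopter->len_lazy += orphan->len_lazy;
		orphan->len = 0;
		orphan->len_lazy = 0;
	}
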
2745 2667
2746/* 2668/*
@@ -2748,14 +2670,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2748 */ 2670 */
2749static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 2671static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2750{ 2672{
2751 RCU_TRACE(unsigned long mask); 2673 RCU_TRACE(unsigned long mask;)
2752 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); 2674 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);)
2753 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); 2675 RCU_TRACE(struct rcu_node *rnp = rdp->mynode;)
2754 2676
2755 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2677 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2756 return; 2678 return;
2757 2679
2758 RCU_TRACE(mask = rdp->grpmask); 2680 RCU_TRACE(mask = rdp->grpmask;)
2759 trace_rcu_grace_period(rsp->name, 2681 trace_rcu_grace_period(rsp->name,
2760 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 2682 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
2761 TPS("cpuofl")); 2683 TPS("cpuofl"));
@@ -2828,9 +2750,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2828 rcu_adopt_orphan_cbs(rsp, flags); 2750 rcu_adopt_orphan_cbs(rsp, flags);
2829 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 2751 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2830 2752
2831 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2753 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
2832 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2754 !rcu_segcblist_empty(&rdp->cblist),
2833 cpu, rdp->qlen, rdp->nxtlist); 2755 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
2756 cpu, rcu_segcblist_n_cbs(&rdp->cblist),
2757 rcu_segcblist_first_cb(&rdp->cblist));
2834} 2758}
2835 2759
2836/* 2760/*
@@ -2840,14 +2764,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2840static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) 2764static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2841{ 2765{
2842 unsigned long flags; 2766 unsigned long flags;
2843 struct rcu_head *next, *list, **tail; 2767 struct rcu_head *rhp;
2844 long bl, count, count_lazy; 2768 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2845 int i; 2769 long bl, count;
2846 2770
2847 /* If no callbacks are ready, just return. */ 2771 /* If no callbacks are ready, just return. */
2848 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 2772 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
2849 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 2773 trace_rcu_batch_start(rsp->name,
2850 trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), 2774 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2775 rcu_segcblist_n_cbs(&rdp->cblist), 0);
2776 trace_rcu_batch_end(rsp->name, 0,
2777 !rcu_segcblist_empty(&rdp->cblist),
2851 need_resched(), is_idle_task(current), 2778 need_resched(), is_idle_task(current),
2852 rcu_is_callbacks_kthread()); 2779 rcu_is_callbacks_kthread());
2853 return; 2780 return;
@@ -2855,73 +2782,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2855 2782
2856 /* 2783 /*
2857 * Extract the list of ready callbacks, disabling to prevent 2784 * Extract the list of ready callbacks, disabling to prevent
2858 * races with call_rcu() from interrupt handlers. 2785 * races with call_rcu() from interrupt handlers. Leave the
2786 * callback counts, as rcu_barrier() needs to be conservative.
2859 */ 2787 */
2860 local_irq_save(flags); 2788 local_irq_save(flags);
2861 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2789 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2862 bl = rdp->blimit; 2790 bl = rdp->blimit;
2863 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); 2791 trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2864 list = rdp->nxtlist; 2792 rcu_segcblist_n_cbs(&rdp->cblist), bl);
2865 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 2793 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
2866 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
2867 tail = rdp->nxttail[RCU_DONE_TAIL];
2868 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
2869 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
2870 rdp->nxttail[i] = &rdp->nxtlist;
2871 local_irq_restore(flags); 2794 local_irq_restore(flags);
2872 2795
2873 /* Invoke callbacks. */ 2796 /* Invoke callbacks. */
2874 count = count_lazy = 0; 2797 rhp = rcu_cblist_dequeue(&rcl);
2875 while (list) { 2798 for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
2876 next = list->next; 2799 debug_rcu_head_unqueue(rhp);
2877 prefetch(next); 2800 if (__rcu_reclaim(rsp->name, rhp))
2878 debug_rcu_head_unqueue(list); 2801 rcu_cblist_dequeued_lazy(&rcl);
2879 if (__rcu_reclaim(rsp->name, list)) 2802 /*
2880 count_lazy++; 2803 * Stop only if limit reached and CPU has something to do.
2881 list = next; 2804 * Note: The rcl structure counts down from zero.
2882 /* Stop only if limit reached and CPU has something to do. */ 2805 */
2883 if (++count >= bl && 2806 if (-rcl.len >= bl &&
2884 (need_resched() || 2807 (need_resched() ||
2885 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2808 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2886 break; 2809 break;
2887 } 2810 }
2888 2811
2889 local_irq_save(flags); 2812 local_irq_save(flags);
2890 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), 2813 count = -rcl.len;
2891 is_idle_task(current), 2814 trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(),
2892 rcu_is_callbacks_kthread()); 2815 is_idle_task(current), rcu_is_callbacks_kthread());
2893 2816
2894 /* Update count, and requeue any remaining callbacks. */ 2817 /* Update counts and requeue any remaining callbacks. */
2895 if (list != NULL) { 2818 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
2896 *tail = rdp->nxtlist;
2897 rdp->nxtlist = list;
2898 for (i = 0; i < RCU_NEXT_SIZE; i++)
2899 if (&rdp->nxtlist == rdp->nxttail[i])
2900 rdp->nxttail[i] = tail;
2901 else
2902 break;
2903 }
2904 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2819 smp_mb(); /* List handling before counting for rcu_barrier(). */
2905 rdp->qlen_lazy -= count_lazy;
2906 WRITE_ONCE(rdp->qlen, rdp->qlen - count);
2907 rdp->n_cbs_invoked += count; 2820 rdp->n_cbs_invoked += count;
2821 rcu_segcblist_insert_count(&rdp->cblist, &rcl);
2908 2822
2909 /* Reinstate batch limit if we have worked down the excess. */ 2823 /* Reinstate batch limit if we have worked down the excess. */
2910 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 2824 count = rcu_segcblist_n_cbs(&rdp->cblist);
2825 if (rdp->blimit == LONG_MAX && count <= qlowmark)
2911 rdp->blimit = blimit; 2826 rdp->blimit = blimit;
2912 2827
2913 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2828 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2914 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { 2829 if (count == 0 && rdp->qlen_last_fqs_check != 0) {
2915 rdp->qlen_last_fqs_check = 0; 2830 rdp->qlen_last_fqs_check = 0;
2916 rdp->n_force_qs_snap = rsp->n_force_qs; 2831 rdp->n_force_qs_snap = rsp->n_force_qs;
2917 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 2832 } else if (count < rdp->qlen_last_fqs_check - qhimark)
2918 rdp->qlen_last_fqs_check = rdp->qlen; 2833 rdp->qlen_last_fqs_check = count;
2919 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); 2834 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
2920 2835
2921 local_irq_restore(flags); 2836 local_irq_restore(flags);
2922 2837
2923 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2838 /* Re-invoke RCU core processing if there are callbacks remaining. */
2924 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2839 if (rcu_segcblist_ready_cbs(&rdp->cblist))
2925 invoke_rcu_core(); 2840 invoke_rcu_core();
2926} 2841}
2927 2842
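
The on-stack rcu_cblist used while invoking callbacks above starts with len == 0 and is only ever decremented by rcu_cblist_dequeue(), so -rcl.len is the number invoked so far, and adding the (negative) remainder back into the per-CPU list via rcu_segcblist_insert_count() settles the totals only after the batch finishes. A toy demonstration of the count-down-from-zero convention:

	#include <stdio.h>

	struct toy_cblist { long len; };

	static void toy_dequeue(struct toy_cblist *l)
	{
		l->len--;		/* goes negative: one more callback invoked */
	}

	int main(void)
	{
		struct toy_cblist rcl = { .len = 0 };
		long percpu_count = 5;		/* five callbacks were ready */
		int i;

		for (i = 0; i < 3; i++)
			toy_dequeue(&rcl);	/* invoke three of them */

		printf("invoked=%ld\n", -rcl.len);	/* 3 */
		percpu_count += rcl.len;		/* 5 + (-3) = 2 still queued */
		printf("remaining=%ld\n", percpu_count);
		return 0;
	}
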
@@ -3087,7 +3002,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
3087 bool needwake; 3002 bool needwake;
3088 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3003 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3089 3004
3090 WARN_ON_ONCE(rdp->beenonline == 0); 3005 WARN_ON_ONCE(!rdp->beenonline);
3091 3006
3092 /* Update RCU state based on any recent quiescent states. */ 3007 /* Update RCU state based on any recent quiescent states. */
3093 rcu_check_quiescent_state(rsp, rdp); 3008 rcu_check_quiescent_state(rsp, rdp);
@@ -3105,7 +3020,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
3105 } 3020 }
3106 3021
3107 /* If there are callbacks ready, invoke them. */ 3022 /* If there are callbacks ready, invoke them. */
3108 if (cpu_has_callbacks_ready_to_invoke(rdp)) 3023 if (rcu_segcblist_ready_cbs(&rdp->cblist))
3109 invoke_rcu_callbacks(rsp, rdp); 3024 invoke_rcu_callbacks(rsp, rdp);
3110 3025
3111 /* Do any needed deferred wakeups of rcuo kthreads. */ 3026 /* Do any needed deferred wakeups of rcuo kthreads. */
@@ -3177,7 +3092,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3177 * invoking force_quiescent_state() if the newly enqueued callback 3092 * invoking force_quiescent_state() if the newly enqueued callback
3178 * is the only one waiting for a grace period to complete. 3093 * is the only one waiting for a grace period to complete.
3179 */ 3094 */
3180 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 3095 if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
3096 rdp->qlen_last_fqs_check + qhimark)) {
3181 3097
3182 /* Are we ignoring a completed grace period? */ 3098 /* Are we ignoring a completed grace period? */
3183 note_gp_changes(rsp, rdp); 3099 note_gp_changes(rsp, rdp);
@@ -3195,10 +3111,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3195 /* Give the grace period a kick. */ 3111 /* Give the grace period a kick. */
3196 rdp->blimit = LONG_MAX; 3112 rdp->blimit = LONG_MAX;
3197 if (rsp->n_force_qs == rdp->n_force_qs_snap && 3113 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
3198 *rdp->nxttail[RCU_DONE_TAIL] != head) 3114 rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
3199 force_quiescent_state(rsp); 3115 force_quiescent_state(rsp);
3200 rdp->n_force_qs_snap = rsp->n_force_qs; 3116 rdp->n_force_qs_snap = rsp->n_force_qs;
3201 rdp->qlen_last_fqs_check = rdp->qlen; 3117 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
3202 } 3118 }
3203 } 3119 }
3204} 3120}
@@ -3238,7 +3154,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3238 rdp = this_cpu_ptr(rsp->rda); 3154 rdp = this_cpu_ptr(rsp->rda);
3239 3155
3240 /* Add the callback to our list. */ 3156 /* Add the callback to our list. */
3241 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { 3157 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
3242 int offline; 3158 int offline;
3243 3159
3244 if (cpu != -1) 3160 if (cpu != -1)
@@ -3257,23 +3173,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3257 */ 3173 */
3258 BUG_ON(cpu != -1); 3174 BUG_ON(cpu != -1);
3259 WARN_ON_ONCE(!rcu_is_watching()); 3175 WARN_ON_ONCE(!rcu_is_watching());
3260 if (!likely(rdp->nxtlist)) 3176 if (rcu_segcblist_empty(&rdp->cblist))
3261 init_default_callback_list(rdp); 3177 rcu_segcblist_init(&rdp->cblist);
3262 } 3178 }
3263 WRITE_ONCE(rdp->qlen, rdp->qlen + 1); 3179 rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
3264 if (lazy) 3180 if (!lazy)
3265 rdp->qlen_lazy++;
3266 else
3267 rcu_idle_count_callbacks_posted(); 3181 rcu_idle_count_callbacks_posted();
3268 smp_mb(); /* Count before adding callback for rcu_barrier(). */
3269 *rdp->nxttail[RCU_NEXT_TAIL] = head;
3270 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
3271 3182
3272 if (__is_kfree_rcu_offset((unsigned long)func)) 3183 if (__is_kfree_rcu_offset((unsigned long)func))
3273 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 3184 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
3274 rdp->qlen_lazy, rdp->qlen); 3185 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
3186 rcu_segcblist_n_cbs(&rdp->cblist));
3275 else 3187 else
3276 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 3188 trace_rcu_callback(rsp->name, head,
3189 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
3190 rcu_segcblist_n_cbs(&rdp->cblist));
3277 3191
3278 /* Go handle any RCU core processing required. */ 3192 /* Go handle any RCU core processing required. */
3279 __call_rcu_core(rsp, rdp, head, flags); 3193 __call_rcu_core(rsp, rdp, head, flags);
@@ -3519,41 +3433,6 @@ void cond_synchronize_sched(unsigned long oldstate)
3519} 3433}
3520EXPORT_SYMBOL_GPL(cond_synchronize_sched); 3434EXPORT_SYMBOL_GPL(cond_synchronize_sched);
3521 3435
3522/* Adjust sequence number for start of update-side operation. */
3523static void rcu_seq_start(unsigned long *sp)
3524{
3525 WRITE_ONCE(*sp, *sp + 1);
3526 smp_mb(); /* Ensure update-side operation after counter increment. */
3527 WARN_ON_ONCE(!(*sp & 0x1));
3528}
3529
3530/* Adjust sequence number for end of update-side operation. */
3531static void rcu_seq_end(unsigned long *sp)
3532{
3533 smp_mb(); /* Ensure update-side operation before counter increment. */
3534 WRITE_ONCE(*sp, *sp + 1);
3535 WARN_ON_ONCE(*sp & 0x1);
3536}
3537
3538/* Take a snapshot of the update side's sequence number. */
3539static unsigned long rcu_seq_snap(unsigned long *sp)
3540{
3541 unsigned long s;
3542
3543 s = (READ_ONCE(*sp) + 3) & ~0x1;
3544 smp_mb(); /* Above access must not bleed into critical section. */
3545 return s;
3546}
3547
3548/*
3549 * Given a snapshot from rcu_seq_snap(), determine whether or not a
3550 * full update-side operation has occurred.
3551 */
3552static bool rcu_seq_done(unsigned long *sp, unsigned long s)
3553{
3554 return ULONG_CMP_GE(READ_ONCE(*sp), s);
3555}
3556
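
The rcu_seq_*() helpers removed from tree.c here implement a simple odd/even sequence counter: the low bit is set while an update is in flight, rcu_seq_snap() picks the first value that guarantees a complete update after the snapshot, and rcu_seq_done() polls for it. A compressed userspace sketch of the same formulas, with the memory barriers and wraparound-safe comparison omitted:

	#include <stdbool.h>
	#include <stdio.h>

	static unsigned long seq;		/* even: idle, odd: update running */

	static void seq_start(void) { seq++; }	/* becomes odd */
	static void seq_end(void)   { seq++; }	/* becomes even again */

	/* First value that guarantees a full update completes after now. */
	static unsigned long seq_snap(void) { return (seq + 3) & ~0x1UL; }

	static bool seq_done(unsigned long snap) { return seq >= snap; }

	int main(void)
	{
		unsigned long snap = seq_snap();	/* seq == 0 -> snap == 2 */

		printf("done before update? %d\n", seq_done(snap));	/* 0 */
		seq_start();
		seq_end();					/* seq == 2 */
		printf("done after update?  %d\n", seq_done(snap));	/* 1 */
		return 0;
	}
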
3557/* 3436/*
3558 * Check to see if there is any immediate RCU-related work to be done 3437 * Check to see if there is any immediate RCU-related work to be done
3559 * by the current CPU, for the specified type of RCU, returning 1 if so. 3438 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -3577,7 +3456,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3577 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3456 /* Is the RCU core waiting for a quiescent state from this CPU? */
3578 if (rcu_scheduler_fully_active && 3457 if (rcu_scheduler_fully_active &&
3579 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && 3458 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
3580 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { 3459 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
3581 rdp->n_rp_core_needs_qs++; 3460 rdp->n_rp_core_needs_qs++;
3582 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { 3461 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
3583 rdp->n_rp_report_qs++; 3462 rdp->n_rp_report_qs++;
@@ -3585,7 +3464,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3585 } 3464 }
3586 3465
3587 /* Does this CPU have callbacks ready to invoke? */ 3466 /* Does this CPU have callbacks ready to invoke? */
3588 if (cpu_has_callbacks_ready_to_invoke(rdp)) { 3467 if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
3589 rdp->n_rp_cb_ready++; 3468 rdp->n_rp_cb_ready++;
3590 return 1; 3469 return 1;
3591 } 3470 }
@@ -3649,10 +3528,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
3649 3528
3650 for_each_rcu_flavor(rsp) { 3529 for_each_rcu_flavor(rsp) {
3651 rdp = this_cpu_ptr(rsp->rda); 3530 rdp = this_cpu_ptr(rsp->rda);
3652 if (!rdp->nxtlist) 3531 if (rcu_segcblist_empty(&rdp->cblist))
3653 continue; 3532 continue;
3654 hc = true; 3533 hc = true;
3655 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { 3534 if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) {
3656 al = false; 3535 al = false;
3657 break; 3536 break;
3658 } 3537 }
@@ -3761,7 +3640,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3761 __call_rcu(&rdp->barrier_head, 3640 __call_rcu(&rdp->barrier_head,
3762 rcu_barrier_callback, rsp, cpu, 0); 3641 rcu_barrier_callback, rsp, cpu, 0);
3763 } 3642 }
3764 } else if (READ_ONCE(rdp->qlen)) { 3643 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
3765 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3644 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3766 rsp->barrier_sequence); 3645 rsp->barrier_sequence);
3767 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3646 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3870,8 +3749,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3870 rdp->qlen_last_fqs_check = 0; 3749 rdp->qlen_last_fqs_check = 0;
3871 rdp->n_force_qs_snap = rsp->n_force_qs; 3750 rdp->n_force_qs_snap = rsp->n_force_qs;
3872 rdp->blimit = blimit; 3751 rdp->blimit = blimit;
3873 if (!rdp->nxtlist) 3752 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
3874 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3753 !init_nocb_callback_list(rdp))
3754 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
3875 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3755 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3876 rcu_sysidle_init_percpu_data(rdp->dynticks); 3756 rcu_sysidle_init_percpu_data(rdp->dynticks);
3877 rcu_dynticks_eqs_online(); 3757 rcu_dynticks_eqs_online();
@@ -3890,12 +3770,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3890 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 3770 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3891 rdp->completed = rnp->completed; 3771 rdp->completed = rnp->completed;
3892 rdp->cpu_no_qs.b.norm = true; 3772 rdp->cpu_no_qs.b.norm = true;
3893 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); 3773 rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
3894 rdp->core_needs_qs = false; 3774 rdp->core_needs_qs = false;
3895 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3775 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3896 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3776 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3897} 3777}
3898 3778
3779/*
3780 * Invoked early in the CPU-online process, when pretty much all
3781 * services are available. The incoming CPU is not present.
3782 */
3899int rcutree_prepare_cpu(unsigned int cpu) 3783int rcutree_prepare_cpu(unsigned int cpu)
3900{ 3784{
3901 struct rcu_state *rsp; 3785 struct rcu_state *rsp;
@@ -3909,6 +3793,9 @@ int rcutree_prepare_cpu(unsigned int cpu)
3909 return 0; 3793 return 0;
3910} 3794}
3911 3795
3796/*
3797 * Update RCU priority boot kthread affinity for CPU-hotplug changes.
3798 */
3912static void rcutree_affinity_setting(unsigned int cpu, int outgoing) 3799static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3913{ 3800{
3914 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 3801 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
@@ -3916,20 +3803,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3916 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); 3803 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
3917} 3804}
3918 3805
3806/*
3807 * Near the end of the CPU-online process. Pretty much all services
3808 * enabled, and the CPU is now very much alive.
3809 */
3919int rcutree_online_cpu(unsigned int cpu) 3810int rcutree_online_cpu(unsigned int cpu)
3920{ 3811{
3921 sync_sched_exp_online_cleanup(cpu); 3812 sync_sched_exp_online_cleanup(cpu);
3922 rcutree_affinity_setting(cpu, -1); 3813 rcutree_affinity_setting(cpu, -1);
3814 if (IS_ENABLED(CONFIG_TREE_SRCU))
3815 srcu_online_cpu(cpu);
3923 return 0; 3816 return 0;
3924} 3817}
3925 3818
3819/*
3820 * Near the beginning of the process. The CPU is still very much alive
3821 * with pretty much all services enabled.
3822 */
3926int rcutree_offline_cpu(unsigned int cpu) 3823int rcutree_offline_cpu(unsigned int cpu)
3927{ 3824{
3928 rcutree_affinity_setting(cpu, cpu); 3825 rcutree_affinity_setting(cpu, cpu);
3826 if (IS_ENABLED(CONFIG_TREE_SRCU))
3827 srcu_offline_cpu(cpu);
3929 return 0; 3828 return 0;
3930} 3829}
3931 3830
3932 3831/*
3832 * Near the end of the offline process. We do only tracing here.
3833 */
3933int rcutree_dying_cpu(unsigned int cpu) 3834int rcutree_dying_cpu(unsigned int cpu)
3934{ 3835{
3935 struct rcu_state *rsp; 3836 struct rcu_state *rsp;
@@ -3939,6 +3840,9 @@ int rcutree_dying_cpu(unsigned int cpu)
3939 return 0; 3840 return 0;
3940} 3841}
3941 3842
3843/*
3844 * The outgoing CPU is gone and we are running elsewhere.
3845 */
3942int rcutree_dead_cpu(unsigned int cpu) 3846int rcutree_dead_cpu(unsigned int cpu)
3943{ 3847{
3944 struct rcu_state *rsp; 3848 struct rcu_state *rsp;
@@ -3956,6 +3860,10 @@ int rcutree_dead_cpu(unsigned int cpu)
3956 * incoming CPUs are not allowed to use RCU read-side critical sections 3860 * incoming CPUs are not allowed to use RCU read-side critical sections
3957 * until this function is called. Failing to observe this restriction 3861 * until this function is called. Failing to observe this restriction
3958 * will result in lockdep splats. 3862 * will result in lockdep splats.
3863 *
3864 * Note that this function is special in that it is invoked directly
3865 * from the incoming CPU rather than from the cpuhp_step mechanism.
3866 * This is because this function must be invoked at a precise location.
3959 */ 3867 */
3960void rcu_cpu_starting(unsigned int cpu) 3868void rcu_cpu_starting(unsigned int cpu)
3961{ 3869{
@@ -3981,9 +3889,6 @@ void rcu_cpu_starting(unsigned int cpu)
3981 * The CPU is exiting the idle loop into the arch_cpu_idle_dead() 3889 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
3982 * function. We now remove it from the rcu_node tree's ->qsmaskinit 3890 * function. We now remove it from the rcu_node tree's ->qsmaskinit
3983 * bit masks. 3891 * bit masks.
3984 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
3985 * function. We now remove it from the rcu_node tree's ->qsmaskinit
3986 * bit masks.
3987 */ 3892 */
3988static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) 3893static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
3989{ 3894{
@@ -3999,6 +3904,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
3999 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3904 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4000} 3905}
4001 3906
3907/*
 3908 * The outgoing CPU has no further need of RCU, so remove it from
3909 * the list of CPUs that RCU must track.
3910 *
3911 * Note that this function is special in that it is invoked directly
3912 * from the outgoing CPU rather than from the cpuhp_step mechanism.
3913 * This is because this function must be invoked at a precise location.
3914 */
4002void rcu_report_dead(unsigned int cpu) 3915void rcu_report_dead(unsigned int cpu)
4003{ 3916{
4004 struct rcu_state *rsp; 3917 struct rcu_state *rsp;
@@ -4013,6 +3926,10 @@ void rcu_report_dead(unsigned int cpu)
4013} 3926}
4014#endif 3927#endif
4015 3928
3929/*
3930 * On non-huge systems, use expedited RCU grace periods to make suspend
3931 * and hibernation run faster.
3932 */
4016static int rcu_pm_notify(struct notifier_block *self, 3933static int rcu_pm_notify(struct notifier_block *self,
4017 unsigned long action, void *hcpu) 3934 unsigned long action, void *hcpu)
4018{ 3935{
@@ -4083,7 +4000,7 @@ early_initcall(rcu_spawn_gp_kthread);
4083 * task is booting the system, and such primitives are no-ops). After this 4000 * task is booting the system, and such primitives are no-ops). After this
4084 * function is called, any synchronous grace-period primitives are run as 4001 * function is called, any synchronous grace-period primitives are run as
4085 * expedited, with the requesting task driving the grace period forward. 4002 * expedited, with the requesting task driving the grace period forward.
4086 * A later core_initcall() rcu_exp_runtime_mode() will switch to full 4003 * A later core_initcall() rcu_set_runtime_mode() will switch to full
4087 * runtime RCU functionality. 4004 * runtime RCU functionality.
4088 */ 4005 */
4089void rcu_scheduler_starting(void) 4006void rcu_scheduler_starting(void)
@@ -4096,31 +4013,6 @@ void rcu_scheduler_starting(void)
4096} 4013}
4097 4014
4098/* 4015/*
4099 * Compute the per-level fanout, either using the exact fanout specified
4100 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
4101 */
4102static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
4103{
4104 int i;
4105
4106 if (rcu_fanout_exact) {
4107 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
4108 for (i = rcu_num_lvls - 2; i >= 0; i--)
4109 levelspread[i] = RCU_FANOUT;
4110 } else {
4111 int ccur;
4112 int cprv;
4113
4114 cprv = nr_cpu_ids;
4115 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4116 ccur = levelcnt[i];
4117 levelspread[i] = (cprv + ccur - 1) / ccur;
4118 cprv = ccur;
4119 }
4120 }
4121}
4122
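
rcu_init_levelspread(), removed from tree.c here, computes the per-level fanout from the leaves upward: at each level the balanced branch takes the ceiling of "CPUs or children to cover" divided by "nodes at this level". A worked example with a hypothetical two-level tree of 96 CPUs and six leaf rcu_node structures:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical shape: 96 CPUs, two levels, one root, six leaves. */
		int levelcnt[2] = { 1, 6 };
		int levelspread[2];
		int cprv = 96, i;

		for (i = 1; i >= 0; i--) {
			int ccur = levelcnt[i];

			levelspread[i] = (cprv + ccur - 1) / ccur; /* ceiling division */
			cprv = ccur;
		}
		printf("%d CPUs per leaf, %d leaves under the root\n",
		       levelspread[1], levelspread[0]);	/* 16 and 6 */
		return 0;
	}
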
4123/*
4124 * Helper function for rcu_init() that initializes one rcu_state structure. 4016 * Helper function for rcu_init() that initializes one rcu_state structure.
4125 */ 4017 */
4126static void __init rcu_init_one(struct rcu_state *rsp) 4018static void __init rcu_init_one(struct rcu_state *rsp)
@@ -4129,9 +4021,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4129 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4021 static const char * const fqs[] = RCU_FQS_NAME_INIT;
4130 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 4022 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
4131 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 4023 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
4132 static u8 fl_mask = 0x1;
4133 4024
4134 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
4135 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 4025 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
4136 int cpustride = 1; 4026 int cpustride = 1;
4137 int i; 4027 int i;
@@ -4146,20 +4036,16 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4146 4036
4147 /* Initialize the level-tracking arrays. */ 4037 /* Initialize the level-tracking arrays. */
4148 4038
4149 for (i = 0; i < rcu_num_lvls; i++)
4150 levelcnt[i] = num_rcu_lvl[i];
4151 for (i = 1; i < rcu_num_lvls; i++) 4039 for (i = 1; i < rcu_num_lvls; i++)
4152 rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; 4040 rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1];
4153 rcu_init_levelspread(levelspread, levelcnt); 4041 rcu_init_levelspread(levelspread, num_rcu_lvl);
4154 rsp->flavor_mask = fl_mask;
4155 fl_mask <<= 1;
4156 4042
4157 /* Initialize the elements themselves, starting from the leaves. */ 4043 /* Initialize the elements themselves, starting from the leaves. */
4158 4044
4159 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4045 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4160 cpustride *= levelspread[i]; 4046 cpustride *= levelspread[i];
4161 rnp = rsp->level[i]; 4047 rnp = rsp->level[i];
4162 for (j = 0; j < levelcnt[i]; j++, rnp++) { 4048 for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
4163 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); 4049 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
4164 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), 4050 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
4165 &rcu_node_class[i], buf[i]); 4051 &rcu_node_class[i], buf[i]);
@@ -4332,6 +4218,8 @@ void __init rcu_init(void)
4332 for_each_online_cpu(cpu) { 4218 for_each_online_cpu(cpu) {
4333 rcutree_prepare_cpu(cpu); 4219 rcutree_prepare_cpu(cpu);
4334 rcu_cpu_starting(cpu); 4220 rcu_cpu_starting(cpu);
4221 if (IS_ENABLED(CONFIG_TREE_SRCU))
4222 srcu_online_cpu(cpu);
4335 } 4223 }
4336} 4224}
4337 4225
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ec62a05bfdb3..ba38262c3554 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -30,80 +30,9 @@
30#include <linux/seqlock.h> 30#include <linux/seqlock.h>
31#include <linux/swait.h> 31#include <linux/swait.h>
32#include <linux/stop_machine.h> 32#include <linux/stop_machine.h>
33#include <linux/rcu_node_tree.h>
33 34
34/* 35#include "rcu_segcblist.h"
35 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
36 * CONFIG_RCU_FANOUT_LEAF.
37 * In theory, it should be possible to add more levels straightforwardly.
38 * In practice, this did work well going from three levels to four.
39 * Of course, your mileage may vary.
40 */
41
42#ifdef CONFIG_RCU_FANOUT
43#define RCU_FANOUT CONFIG_RCU_FANOUT
44#else /* #ifdef CONFIG_RCU_FANOUT */
45# ifdef CONFIG_64BIT
46# define RCU_FANOUT 64
47# else
48# define RCU_FANOUT 32
49# endif
50#endif /* #else #ifdef CONFIG_RCU_FANOUT */
51
52#ifdef CONFIG_RCU_FANOUT_LEAF
53#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
54#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
55# ifdef CONFIG_64BIT
56# define RCU_FANOUT_LEAF 64
57# else
58# define RCU_FANOUT_LEAF 32
59# endif
60#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
61
62#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
63#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
64#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
65#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
66
67#if NR_CPUS <= RCU_FANOUT_1
68# define RCU_NUM_LVLS 1
69# define NUM_RCU_LVL_0 1
70# define NUM_RCU_NODES NUM_RCU_LVL_0
71# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
72# define RCU_NODE_NAME_INIT { "rcu_node_0" }
73# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
74#elif NR_CPUS <= RCU_FANOUT_2
75# define RCU_NUM_LVLS 2
76# define NUM_RCU_LVL_0 1
77# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
78# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
79# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
80# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
81# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
82#elif NR_CPUS <= RCU_FANOUT_3
83# define RCU_NUM_LVLS 3
84# define NUM_RCU_LVL_0 1
85# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
86# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
87# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
88# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
89# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
90# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
91#elif NR_CPUS <= RCU_FANOUT_4
92# define RCU_NUM_LVLS 4
93# define NUM_RCU_LVL_0 1
94# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
95# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
96# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
97# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
98# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
99# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
100# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
101#else
102# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
103#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
104
105extern int rcu_num_lvls;
106extern int rcu_num_nodes;
107 36
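
The NUM_RCU_LVL_* block removed above fixes the tree geometry at compile time: given NR_CPUS, the leaf fanout, and the interior fanout, it selects how many rcu_node levels are needed and how many nodes sit on each level. A rough runtime sketch of the same arithmetic, with invented example values (4096 CPUs, leaf fanout 16, interior fanout 64) standing in for NR_CPUS/CONFIG_RCU_FANOUT*:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpus = 4096;	/* hypothetical NR_CPUS */
	int fanout_leaf = 16;	/* hypothetical RCU_FANOUT_LEAF */
	int fanout = 64;	/* hypothetical RCU_FANOUT */
	int capacity = fanout_leaf;
	int levels = 1;

	/* Add levels until a single tree can cover every CPU. */
	while (capacity < nr_cpus) {
		capacity *= fanout;
		levels++;
	}
	printf("%d CPUs need %d level(s)\n", nr_cpus, levels);

	/* Node count per level, root first, mirroring NUM_RCU_LVL_INIT. */
	for (int l = 0; l < levels; l++) {
		int covered = fanout_leaf;	/* CPUs under one node at level l */

		for (int i = 0; i < levels - 1 - l; i++)
			covered *= fanout;
		printf("level %d: %d node(s)\n", l,
		       l ? DIV_ROUND_UP(nr_cpus, covered) : 1);
	}
	return 0;
}
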
108/* 37/*
109 * Dynticks per-CPU state. 38 * Dynticks per-CPU state.
@@ -113,6 +42,9 @@ struct rcu_dynticks {
113 /* Process level is worth LLONG_MAX/2. */ 42 /* Process level is worth LLONG_MAX/2. */
114 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 43 int dynticks_nmi_nesting; /* Track NMI nesting level. */
115 atomic_t dynticks; /* Even value for idle, else odd. */ 44 atomic_t dynticks; /* Even value for idle, else odd. */
45 bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
46 unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
47 bool rcu_urgent_qs; /* GP old need light quiescent state. */
116#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 48#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
117 long long dynticks_idle_nesting; 49 long long dynticks_idle_nesting;
118 /* irq/process nesting level from idle. */ 50 /* irq/process nesting level from idle. */
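
The retained ->dynticks comment ("Even value for idle, else odd") is what lets a grace period judge a remote CPU without interrupting it: the counter moves on every idle transition, so a CPU whose snapshot is even, or whose counter has moved since the snapshot, cannot still be inside an old RCU read-side critical section. A purely illustrative userspace sketch of that convention; none of these helpers are the kernel's:

#include <stdio.h>
#include <stdbool.h>

static unsigned int dynticks;			/* even: idle, odd: running */

static void idle_exit(void)  { dynticks++; }	/* even -> odd */
static void idle_enter(void) { dynticks++; }	/* odd -> even */

static bool idle_at(unsigned int snap) { return !(snap & 1); }

/* Idle at the snapshot, or has been idle at least once since it. */
static bool quiesced_since(unsigned int snap, unsigned int cur)
{
	return idle_at(snap) || cur != snap;
}

int main(void)
{
	unsigned int snap;

	idle_exit();				/* CPU starts running: odd */
	snap = dynticks;
	printf("idle at snapshot? %d\n", idle_at(snap));	/* 0 */

	idle_enter();				/* dozes off ... */
	idle_exit();				/* ... and wakes up again */
	printf("quiesced since snapshot? %d\n",
	       quiesced_since(snap, dynticks));			/* 1 */
	return 0;
}
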
@@ -262,41 +194,6 @@ struct rcu_node {
262#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) 194#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
263 195
264/* 196/*
265 * Do a full breadth-first scan of the rcu_node structures for the
266 * specified rcu_state structure.
267 */
268#define rcu_for_each_node_breadth_first(rsp, rnp) \
269 for ((rnp) = &(rsp)->node[0]; \
270 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
271
272/*
273 * Do a breadth-first scan of the non-leaf rcu_node structures for the
274 * specified rcu_state structure. Note that if there is a singleton
275 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
276 */
277#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
278 for ((rnp) = &(rsp)->node[0]; \
279 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
280
281/*
282 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
283 * structure. Note that if there is a singleton rcu_node tree with but
284 * one rcu_node structure, this loop -will- visit the rcu_node structure.
285 * It is still a leaf node, even if it is also the root node.
286 */
287#define rcu_for_each_leaf_node(rsp, rnp) \
288 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
289 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
290
291/*
292 * Iterate over all possible CPUs in a leaf RCU node.
293 */
294#define for_each_leaf_node_possible_cpu(rnp, cpu) \
295 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
296 cpu <= rnp->grphi; \
297 cpu = cpumask_next((cpu), cpu_possible_mask))
298
299/*
300 * Union to allow "aggregate OR" operation on the need for a quiescent 197 * Union to allow "aggregate OR" operation on the need for a quiescent
301 * state by the normal and expedited grace periods. 198 * state by the normal and expedited grace periods.
302 */ 199 */
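
The rcu_for_each_*() macros removed above lean on the layout of the rcu_node hierarchy: all nodes live in one contiguous breadth-first array, ->level[] points at the first node of each level, and the leaves are the tail of the array, so every traversal is plain pointer arithmetic. A small stand-in illustration of the three traversals (the struct, level count, and node count are invented, not the kernel's):

#include <stdio.h>

struct node { int id; };

#define NUM_LVLS	2
#define NUM_NODES	5	/* 1 root + 4 leaves, example shape */

int main(void)
{
	struct node nodes[NUM_NODES];
	struct node *level[NUM_LVLS] = { &nodes[0], &nodes[1] };
	struct node *np;

	for (int i = 0; i < NUM_NODES; i++)
		nodes[i].id = i;

	/* Breadth-first over everything: node[0] .. node[NUM_NODES - 1]. */
	for (np = &nodes[0]; np < &nodes[NUM_NODES]; np++)
		printf("all: %d\n", np->id);

	/* Non-leaf nodes only: stop at the first node of the last level. */
	for (np = &nodes[0]; np < level[NUM_LVLS - 1]; np++)
		printf("non-leaf: %d\n", np->id);

	/* Leaves only: start at the first node of the last level. */
	for (np = level[NUM_LVLS - 1]; np < &nodes[NUM_NODES]; np++)
		printf("leaf: %d\n", np->id);
	return 0;
}
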
@@ -336,34 +233,9 @@ struct rcu_data {
336 /* period it is aware of. */ 233 /* period it is aware of. */
337 234
338 /* 2) batch handling */ 235 /* 2) batch handling */
339 /* 236 struct rcu_segcblist cblist; /* Segmented callback list, with */
340 * If nxtlist is not NULL, it is partitioned as follows. 237 /* different callbacks waiting for */
341 * Any of the partitions might be empty, in which case the 238 /* different grace periods. */
342 * pointer to that partition will be equal to the pointer for
343 * the following partition. When the list is empty, all of
344 * the nxttail elements point to the ->nxtlist pointer itself,
345 * which in that case is NULL.
346 *
347 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
348 * Entries that batch # <= ->completed
349 * The grace period for these entries has completed, and
350 * the other grace-period-completed entries may be moved
351 * here temporarily in rcu_process_callbacks().
352 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
353 * Entries that batch # <= ->completed - 1: waiting for current GP
354 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
355 * Entries known to have arrived before current GP ended
356 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
357 * Entries that might have arrived after current GP ended
358 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
359 * always be NULL, as this is the end of the list.
360 */
361 struct rcu_head *nxtlist;
362 struct rcu_head **nxttail[RCU_NEXT_SIZE];
363 unsigned long nxtcompleted[RCU_NEXT_SIZE];
364 /* grace periods for sublists. */
365 long qlen_lazy; /* # of lazy queued callbacks */
366 long qlen; /* # of queued callbacks, incl lazy */
367 long qlen_last_fqs_check; 239 long qlen_last_fqs_check;
368 /* qlen at last check for QS forcing */ 240 /* qlen at last check for QS forcing */
369 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 241 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
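
The long ->nxtlist/->nxttail[] comment removed above describes a single singly linked list partitioned by an array of tail pointers, and struct rcu_segcblist packages that same idea behind accessors. A userspace sketch of the "one list, several tail pointers" layout; the segment names follow the old comment, everything else is illustrative rather than kernel code:

#include <stdio.h>
#include <stddef.h>

struct cb { struct cb *next; };

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, SEG_COUNT };

struct seglist {
	struct cb *head;
	struct cb **tails[SEG_COUNT];	/* each points into the chain */
};

static void seglist_init(struct seglist *sl)
{
	sl->head = NULL;
	for (int i = 0; i < SEG_COUNT; i++)
		sl->tails[i] = &sl->head;	/* every segment empty */
}

/* New callbacks always land in the final (SEG_NEXT) segment. */
static void seglist_enqueue(struct seglist *sl, struct cb *c)
{
	c->next = NULL;
	*sl->tails[SEG_NEXT] = c;
	sl->tails[SEG_NEXT] = &c->next;
}

/* A segment is empty when its tail equals the previous segment's tail. */
static int seg_empty(struct seglist *sl, int seg)
{
	return sl->tails[seg] == (seg ? sl->tails[seg - 1] : &sl->head);
}

int main(void)
{
	struct seglist sl;
	struct cb a, b;

	seglist_init(&sl);
	seglist_enqueue(&sl, &a);
	seglist_enqueue(&sl, &b);
	printf("DONE empty: %d, NEXT empty: %d\n",
	       seg_empty(&sl, SEG_DONE), seg_empty(&sl, SEG_NEXT));
	return 0;
}
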
@@ -482,7 +354,6 @@ struct rcu_state {
482 struct rcu_node *level[RCU_NUM_LVLS + 1]; 354 struct rcu_node *level[RCU_NUM_LVLS + 1];
483 /* Hierarchy levels (+1 to */ 355 /* Hierarchy levels (+1 to */
484 /* shut bogus gcc warning) */ 356 /* shut bogus gcc warning) */
485 u8 flavor_mask; /* bit in flavor mask. */
486 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 357 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
486 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 357 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
487 call_rcu_func_t call; /* call_rcu() flavor. */ 358 call_rcu_func_t call; /* call_rcu() flavor. */
488 int ncpus; /* # CPUs seen so far. */ 359 int ncpus; /* # CPUs seen so far. */
@@ -502,14 +373,11 @@ struct rcu_state {
502 373
503 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; 374 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
504 /* Protect following fields. */ 375 /* Protect following fields. */
505 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 376 struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
506 /* need a grace period. */ 377 /* need a grace period. */
507 struct rcu_head **orphan_nxttail; /* Tail of above. */ 378 struct rcu_cblist orphan_done; /* Orphaned callbacks that */
508 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
509 /* are ready to invoke. */ 379 /* are ready to invoke. */
510 struct rcu_head **orphan_donetail; /* Tail of above. */ 380 /* (Contains counts.) */
511 long qlen_lazy; /* Number of lazy callbacks. */
512 long qlen; /* Total number of callbacks. */
513 /* End of fields guarded by orphan_lock. */ 381 /* End of fields guarded by orphan_lock. */
514 382
515 struct mutex barrier_mutex; /* Guards barrier fields. */ 383 struct mutex barrier_mutex; /* Guards barrier fields. */
@@ -596,6 +464,7 @@ extern struct rcu_state rcu_preempt_state;
596#endif /* #ifdef CONFIG_PREEMPT_RCU */ 464#endif /* #ifdef CONFIG_PREEMPT_RCU */
597 465
598int rcu_dynticks_snap(struct rcu_dynticks *rdtp); 466int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
467bool rcu_eqs_special_set(int cpu);
599 468
600#ifdef CONFIG_RCU_BOOST 469#ifdef CONFIG_RCU_BOOST
601DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 470DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -673,6 +542,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
673static void rcu_dynticks_task_enter(void); 542static void rcu_dynticks_task_enter(void);
674static void rcu_dynticks_task_exit(void); 543static void rcu_dynticks_task_exit(void);
675 544
545#ifdef CONFIG_SRCU
546void srcu_online_cpu(unsigned int cpu);
547void srcu_offline_cpu(unsigned int cpu);
548#else /* #ifdef CONFIG_SRCU */
549void srcu_online_cpu(unsigned int cpu) { }
550void srcu_offline_cpu(unsigned int cpu) { }
551#endif /* #else #ifdef CONFIG_SRCU */
552
676#endif /* #ifndef RCU_TREE_NONCORE */ 553#endif /* #ifndef RCU_TREE_NONCORE */
677 554
678#ifdef CONFIG_RCU_TRACE 555#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index a7b639ccd46e..e513b4ab1197 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
292 trace_rcu_exp_funnel_lock(rsp->name, rnp->level, 292 trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
293 rnp->grplo, rnp->grphi, 293 rnp->grplo, rnp->grphi,
294 TPS("wait")); 294 TPS("wait"));
295 wait_event(rnp->exp_wq[(s >> 1) & 0x3], 295 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
296 sync_exp_work_done(rsp, 296 sync_exp_work_done(rsp,
297 &rdp->exp_workdone2, s)); 297 &rdp->exp_workdone2, s));
298 return true; 298 return true;
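
Replacing the open-coded "(s >> 1) & 0x3" with "rcu_seq_ctr(s) & 0x3" spells out how the expedited sequence number is encoded: the low bit marks a grace period in progress while the remaining bits count completed grace periods, so the counter is the value shifted right by one, and its two low bits pick one of the four per-node wait queues. A small illustrative sketch of that encoding; the helper names below are stand-ins:

#include <stdio.h>

static unsigned long seq_start(unsigned long s) { return s + 1; }	/* odd: in progress */
static unsigned long seq_end(unsigned long s)   { return (s | 1) + 1; }	/* even: idle */
static unsigned long seq_ctr(unsigned long s)   { return s >> 1; }	/* completed GPs */

int main(void)
{
	unsigned long s = 0;

	for (int gp = 0; gp < 5; gp++) {
		s = seq_start(s);
		s = seq_end(s);
	}
	/* The four wait queues are used round-robin by the counter. */
	printf("counter = %lu, wait-queue index = %lu\n",
	       seq_ctr(s), seq_ctr(s) & 0x3);
	return 0;
}
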
@@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data)
331 return; 331 return;
332 } 332 }
333 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); 333 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
334 /* Store .exp before .rcu_urgent_qs. */
335 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
334 resched_cpu(smp_processor_id()); 336 resched_cpu(smp_processor_id());
335} 337}
336 338
@@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
531 rnp->exp_seq_rq = s; 533 rnp->exp_seq_rq = s;
532 spin_unlock(&rnp->exp_lock); 534 spin_unlock(&rnp->exp_lock);
533 } 535 }
534 wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); 536 smp_mb(); /* All above changes before wakeup. */
537 wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]);
535 } 538 }
536 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); 539 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
537 mutex_unlock(&rsp->exp_wake_mutex); 540 mutex_unlock(&rsp->exp_wake_mutex);
@@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
609 /* Wait for expedited grace period to complete. */ 612 /* Wait for expedited grace period to complete. */
610 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); 613 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
611 rnp = rcu_get_root(rsp); 614 rnp = rcu_get_root(rsp);
612 wait_event(rnp->exp_wq[(s >> 1) & 0x3], 615 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
613 sync_exp_work_done(rsp, 616 sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
614 &rdp->exp_workdone0, s)); 617 smp_mb(); /* Workqueue actions happen before return. */
615 618
616 /* Let the next expedited grace period start. */ 619 /* Let the next expedited grace period start. */
617 mutex_unlock(&rsp->exp_mutex); 620 mutex_unlock(&rsp->exp_mutex);
@@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void)
735EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 738EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
736 739
737#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 740#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
738
739/*
740 * Switch to run-time mode once Tree RCU has fully initialized.
741 */
742static int __init rcu_exp_runtime_mode(void)
743{
744 rcu_test_sync_prims();
745 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
746 rcu_test_sync_prims();
747 return 0;
748}
749core_initcall(rcu_exp_runtime_mode);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a62a8f1caac..c9a48657512a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1350 */ 1350 */
1351 if ((rdp->completed != rnp->completed || 1351 if ((rdp->completed != rnp->completed ||
1352 unlikely(READ_ONCE(rdp->gpwrap))) && 1352 unlikely(READ_ONCE(rdp->gpwrap))) &&
1353 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1353 rcu_segcblist_pend_cbs(&rdp->cblist))
1354 note_gp_changes(rsp, rdp); 1354 note_gp_changes(rsp, rdp);
1355 1355
1356 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1356 if (rcu_segcblist_ready_cbs(&rdp->cblist))
1357 cbs_ready = true; 1357 cbs_ready = true;
1358 } 1358 }
1359 return cbs_ready; 1359 return cbs_ready;
@@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void)
1461 rdtp->last_accelerate = jiffies; 1461 rdtp->last_accelerate = jiffies;
1462 for_each_rcu_flavor(rsp) { 1462 for_each_rcu_flavor(rsp) {
1463 rdp = this_cpu_ptr(rsp->rda); 1463 rdp = this_cpu_ptr(rsp->rda);
1464 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1464 if (rcu_segcblist_pend_cbs(&rdp->cblist))
1465 continue; 1465 continue;
1466 rnp = rdp->mynode; 1466 rnp = rdp->mynode;
1467 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1467 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused)
1529 1529
1530 for_each_rcu_flavor(rsp) { 1530 for_each_rcu_flavor(rsp) {
1531 rdp = raw_cpu_ptr(rsp->rda); 1531 rdp = raw_cpu_ptr(rsp->rda);
1532 if (rdp->qlen_lazy != 0) { 1532 if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) {
1533 atomic_inc(&oom_callback_count); 1533 atomic_inc(&oom_callback_count);
1534 rsp->call(&rdp->oom_head, rcu_oom_callback); 1534 rsp->call(&rdp->oom_head, rcu_oom_callback);
1535 } 1535 }
@@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup);
1709 1709
1710static int __init parse_rcu_nocb_poll(char *arg) 1710static int __init parse_rcu_nocb_poll(char *arg)
1711{ 1711{
1712 rcu_nocb_poll = 1; 1712 rcu_nocb_poll = true;
1713 return 0; 1713 return 0;
1714} 1714}
1715early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 1715early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
@@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1860 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1860 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1861 TPS("WakeEmpty")); 1861 TPS("WakeEmpty"));
1862 } else { 1862 } else {
1863 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; 1863 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
1864 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1865 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1864 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1866 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1865 TPS("WakeEmptyIsDeferred")); 1867 TPS("WakeEmptyIsDeferred"));
1866 } 1868 }
@@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1872 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1874 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1873 TPS("WakeOvf")); 1875 TPS("WakeOvf"));
1874 } else { 1876 } else {
1875 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; 1877 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
1878 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1879 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1876 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1880 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1877 TPS("WakeOvfIsDeferred")); 1881 TPS("WakeOvfIsDeferred"));
1878 } 1882 }
@@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
1930 struct rcu_data *rdp, 1934 struct rcu_data *rdp,
1931 unsigned long flags) 1935 unsigned long flags)
1932{ 1936{
1933 long ql = rsp->qlen; 1937 long ql = rsp->orphan_done.len;
1934 long qll = rsp->qlen_lazy; 1938 long qll = rsp->orphan_done.len_lazy;
1935 1939
1936 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 1940 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
1937 if (!rcu_is_nocb_cpu(smp_processor_id())) 1941 if (!rcu_is_nocb_cpu(smp_processor_id()))
1938 return false; 1942 return false;
1939 rsp->qlen = 0;
1940 rsp->qlen_lazy = 0;
1941 1943
1942 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 1944 /* First, enqueue the donelist, if any. This preserves CB ordering. */
1943 if (rsp->orphan_donelist != NULL) { 1945 if (rsp->orphan_done.head) {
1944 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, 1946 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
1945 rsp->orphan_donetail, ql, qll, flags); 1947 rcu_cblist_tail(&rsp->orphan_done),
1946 ql = qll = 0; 1948 ql, qll, flags);
1947 rsp->orphan_donelist = NULL;
1948 rsp->orphan_donetail = &rsp->orphan_donelist;
1949 } 1949 }
1950 if (rsp->orphan_nxtlist != NULL) { 1950 if (rsp->orphan_pend.head) {
1951 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, 1951 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
1952 rsp->orphan_nxttail, ql, qll, flags); 1952 rcu_cblist_tail(&rsp->orphan_pend),
1953 ql = qll = 0; 1953 ql, qll, flags);
1954 rsp->orphan_nxtlist = NULL;
1955 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1956 } 1954 }
1955 rcu_cblist_init(&rsp->orphan_done);
1956 rcu_cblist_init(&rsp->orphan_pend);
1957 return true; 1957 return true;
1958} 1958}
1959 1959
@@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2395 return false; 2395 return false;
2396 2396
2397 /* If there are early-boot callbacks, move them to nocb lists. */ 2397 /* If there are early-boot callbacks, move them to nocb lists. */
2398 if (rdp->nxtlist) { 2398 if (!rcu_segcblist_empty(&rdp->cblist)) {
2399 rdp->nocb_head = rdp->nxtlist; 2399 rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
2400 rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; 2400 rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
2401 atomic_long_set(&rdp->nocb_q_count, rdp->qlen); 2401 atomic_long_set(&rdp->nocb_q_count,
2402 atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); 2402 rcu_segcblist_n_cbs(&rdp->cblist));
2403 rdp->nxtlist = NULL; 2403 atomic_long_set(&rdp->nocb_q_count_lazy,
2404 rdp->qlen = 0; 2404 rcu_segcblist_n_lazy_cbs(&rdp->cblist));
2405 rdp->qlen_lazy = 0; 2405 rcu_segcblist_init(&rdp->cblist);
2406 } 2406 }
2407 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2407 rcu_segcblist_disable(&rdp->cblist);
2408 return true; 2408 return true;
2409} 2409}
2410 2410
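
The smp_store_release() calls added in this file (and the matching one in tree_exp.h) publish ->rcu_urgent_qs only after the store that precedes them (->nocb_defer_wakeup here, the .exp flag there) is visible, so any reader that observes the urgent-QS flag with an acquire-ordered load is guaranteed to also see that earlier store. A userspace C11 analogue of the release/acquire pairing; the kernel primitives are smp_store_release()/smp_load_acquire(), and everything else below is a stand-in:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static int payload;			/* stands in for the earlier plain store */
static atomic_bool urgent;		/* stands in for ->rcu_urgent_qs */

static void *producer(void *arg)
{
	(void)arg;
	payload = 42;						/* 1: write data */
	atomic_store_explicit(&urgent, true,
			      memory_order_release);		/* 2: publish */
	return NULL;
}

static void *consumer(void *arg)
{
	(void)arg;
	while (!atomic_load_explicit(&urgent, memory_order_acquire))
		;				/* spin until published */
	printf("payload = %d\n", payload);	/* guaranteed to print 42 */
	return NULL;
}

int main(void)
{
	pthread_t p, c;

	pthread_create(&c, NULL, consumer, NULL);
	pthread_create(&p, NULL, producer, NULL);
	pthread_join(p, NULL);
	pthread_join(c, NULL);
	return 0;
}
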
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 8751a748499a..6cea17a1ea30 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -41,11 +41,11 @@
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/debugfs.h> 42#include <linux/debugfs.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/prefetch.h>
44 45
45#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
46#include "tree.h" 47#include "tree.h"
47 48#include "rcu.h"
48DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
49 49
50static int r_open(struct inode *inode, struct file *file, 50static int r_open(struct inode *inode, struct file *file,
51 const struct seq_operations *op) 51 const struct seq_operations *op)
@@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
121 cpu_is_offline(rdp->cpu) ? '!' : ' ', 121 cpu_is_offline(rdp->cpu) ? '!' : ' ',
122 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 122 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
123 rdp->cpu_no_qs.b.norm, 123 rdp->cpu_no_qs.b.norm,
124 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), 124 rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
125 rdp->core_needs_qs); 125 rdp->core_needs_qs);
126 seq_printf(m, " dt=%d/%llx/%d df=%lu", 126 seq_printf(m, " dt=%d/%llx/%d df=%lu",
127 rcu_dynticks_snap(rdp->dynticks), 127 rcu_dynticks_snap(rdp->dynticks),
@@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
130 rdp->dynticks_fqs); 130 rdp->dynticks_fqs);
131 seq_printf(m, " of=%lu", rdp->offline_fqs); 131 seq_printf(m, " of=%lu", rdp->offline_fqs);
132 rcu_nocb_q_lengths(rdp, &ql, &qll); 132 rcu_nocb_q_lengths(rdp, &ql, &qll);
133 qll += rdp->qlen_lazy; 133 qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
134 ql += rdp->qlen; 134 ql += rcu_segcblist_n_cbs(&rdp->cblist);
135 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 135 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
136 qll, ql, 136 qll, ql,
137 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 137 ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
138 rdp->nxttail[RCU_NEXT_TAIL]], 138 ".R"[!rcu_segcblist_segempty(&rdp->cblist,
139 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 139 RCU_NEXT_READY_TAIL)],
140 rdp->nxttail[RCU_NEXT_READY_TAIL]], 140 ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
141 ".W"[rdp->nxttail[RCU_DONE_TAIL] != 141 ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
142 rdp->nxttail[RCU_WAIT_TAIL]],
143 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
144#ifdef CONFIG_RCU_BOOST 142#ifdef CONFIG_RCU_BOOST
145 seq_printf(m, " kt=%d/%c ktl=%x", 143 seq_printf(m, " kt=%d/%c ktl=%x",
146 per_cpu(rcu_cpu_has_work, rdp->cpu), 144 per_cpu(rcu_cpu_has_work, rdp->cpu),
@@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 276 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
279 rsp->n_force_qs, rsp->n_force_qs_ngp, 277 rsp->n_force_qs, rsp->n_force_qs_ngp,
280 rsp->n_force_qs - rsp->n_force_qs_ngp, 278 rsp->n_force_qs - rsp->n_force_qs_ngp,
281 READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); 279 READ_ONCE(rsp->n_force_qs_lh),
280 rsp->orphan_done.len_lazy,
281 rsp->orphan_done.len);
282 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { 282 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
283 if (rnp->level != level) { 283 if (rnp->level != level) {
284 seq_puts(m, "\n"); 284 seq_puts(m, "\n");
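
The rewritten qs=%c%c%c%c output keeps the ".N"[cond] construct: indexing a two-character string literal with a 0-or-1 value yields '.' when the condition is false and the flag letter when it is true. A minimal demonstration with made-up flag values:

#include <stdio.h>

int main(void)
{
	int next = 1, ready = 0, wait = 1, done = 0;

	printf("qs=%c%c%c%c\n",
	       ".N"[!!next], ".R"[!!ready], ".W"[!!wait], ".D"[!!done]);
	/* prints: qs=N.W. */
	return 0;
}
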
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 55c8530316c7..273e869ca21d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
124 * non-expedited counterparts? Intended for use within RCU. Note 124 * non-expedited counterparts? Intended for use within RCU. Note
125 * that if the user specifies both rcu_expedited and rcu_normal, then 125 * that if the user specifies both rcu_expedited and rcu_normal, then
126 * rcu_normal wins. (Except during the time period during boot from 126 * rcu_normal wins. (Except during the time period during boot from
127 * when the first task is spawned until the rcu_exp_runtime_mode() 127 * when the first task is spawned until the rcu_set_runtime_mode()
128 * core_initcall() is invoked, at which point everything is expedited.) 128 * core_initcall() is invoked, at which point everything is expedited.)
129 */ 129 */
130bool rcu_gp_is_normal(void) 130bool rcu_gp_is_normal(void)
@@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void)
190 190
191#endif /* #ifndef CONFIG_TINY_RCU */ 191#endif /* #ifndef CONFIG_TINY_RCU */
192 192
193/*
194 * Test each non-SRCU synchronous grace-period wait API. This is
195 * useful just after a change in mode for these primitives, and
196 * during early boot.
197 */
198void rcu_test_sync_prims(void)
199{
200 if (!IS_ENABLED(CONFIG_PROVE_RCU))
201 return;
202 synchronize_rcu();
203 synchronize_rcu_bh();
204 synchronize_sched();
205 synchronize_rcu_expedited();
206 synchronize_rcu_bh_expedited();
207 synchronize_sched_expedited();
208}
209
210#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
211
212/*
213 * Switch to run-time mode once RCU has fully initialized.
214 */
215static int __init rcu_set_runtime_mode(void)
216{
217 rcu_test_sync_prims();
218 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
219 rcu_test_sync_prims();
220 return 0;
221}
222core_initcall(rcu_set_runtime_mode);
223
224#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
225
193#ifdef CONFIG_PREEMPT_RCU 226#ifdef CONFIG_PREEMPT_RCU
194 227
195/* 228/*
@@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t,
632 put_task_struct(t); 665 put_task_struct(t);
633 return; 666 return;
634 } 667 }
668 rcu_request_urgent_qs_task(t);
635 if (!needreport) 669 if (!needreport)
636 return; 670 return;
637 if (*firstreport) { 671 if (*firstreport) {
@@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void)
817 851
818#endif /* #ifdef CONFIG_TASKS_RCU */ 852#endif /* #ifdef CONFIG_TASKS_RCU */
819 853
820/*
821 * Test each non-SRCU synchronous grace-period wait API. This is
822 * useful just after a change in mode for these primitives, and
823 * during early boot.
824 */
825void rcu_test_sync_prims(void)
826{
827 if (!IS_ENABLED(CONFIG_PROVE_RCU))
828 return;
829 synchronize_rcu();
830 synchronize_rcu_bh();
831 synchronize_sched();
832 synchronize_rcu_expedited();
833 synchronize_rcu_bh_expedited();
834 synchronize_sched_expedited();
835}
836
837#ifdef CONFIG_PROVE_RCU 854#ifdef CONFIG_PROVE_RCU
838 855
839/* 856/*
diff --git a/kernel/relay.c b/kernel/relay.c
index 0e413d9eec8a..39a9dfc69486 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1212,7 +1212,6 @@ static ssize_t subbuf_splice_actor(struct file *in,
1212 .nr_pages = 0, 1212 .nr_pages = 0,
1213 .nr_pages_max = PIPE_DEF_BUFFERS, 1213 .nr_pages_max = PIPE_DEF_BUFFERS,
1214 .partial = partial, 1214 .partial = partial,
1215 .flags = flags,
1216 .ops = &relay_pipe_buf_ops, 1215 .ops = &relay_pipe_buf_ops,
1217 .spd_release = relay_page_release, 1216 .spd_release = relay_page_release,
1218 }; 1217 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..803c3bc274c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000;
86cpumask_var_t cpu_isolated_map; 86cpumask_var_t cpu_isolated_map;
87 87
88/* 88/*
89 * this_rq_lock - lock this runqueue and disable interrupts.
90 */
91static struct rq *this_rq_lock(void)
92 __acquires(rq->lock)
93{
94 struct rq *rq;
95
96 local_irq_disable();
97 rq = this_rq();
98 raw_spin_lock(&rq->lock);
99
100 return rq;
101}
102
103/*
104 * __task_rq_lock - lock the rq @p resides on. 89 * __task_rq_lock - lock the rq @p resides on.
105 */ 90 */
106struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) 91struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq)
233 return; 218 return;
234 219
235#ifdef CONFIG_SCHED_DEBUG 220#ifdef CONFIG_SCHED_DEBUG
221 if (sched_feat(WARN_DOUBLE_CLOCK))
222 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
236 rq->clock_update_flags |= RQCF_UPDATED; 223 rq->clock_update_flags |= RQCF_UPDATED;
237#endif 224#endif
225
238 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 226 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
239 if (delta < 0) 227 if (delta < 0)
240 return; 228 return;
@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq)
261static enum hrtimer_restart hrtick(struct hrtimer *timer) 249static enum hrtimer_restart hrtick(struct hrtimer *timer)
262{ 250{
263 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 251 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
252 struct rq_flags rf;
264 253
265 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 254 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
266 255
267 raw_spin_lock(&rq->lock); 256 rq_lock(rq, &rf);
268 update_rq_clock(rq); 257 update_rq_clock(rq);
269 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 258 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
270 raw_spin_unlock(&rq->lock); 259 rq_unlock(rq, &rf);
271 260
272 return HRTIMER_NORESTART; 261 return HRTIMER_NORESTART;
273} 262}
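
The conversions in this file replace matched raw_spin_lock(&rq->lock)/rq_pin_lock() and rq_unpin_lock()/raw_spin_unlock() pairs with single rq_lock()/rq_unlock() calls (plus _irq/_irqsave variants). Judging only from the pairs being replaced in this diff, such a wrapper amounts to lock-then-pin on entry and unpin-then-unlock on exit; the sketch below is a userspace schematic of that pairing, not the kernel's actual sched.h definitions:

#include <pthread.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; int clock; };
struct rq_flags { int pinned; };

static void rq_lock(struct rq *rq, struct rq_flags *rf)
{
	pthread_mutex_lock(&rq->lock);	/* raw_spin_lock(&rq->lock)   */
	rf->pinned = 1;			/* rq_pin_lock(rq, rf)        */
}

static void rq_unlock(struct rq *rq, struct rq_flags *rf)
{
	rf->pinned = 0;			/* rq_unpin_lock(rq, rf)      */
	pthread_mutex_unlock(&rq->lock);/* raw_spin_unlock(&rq->lock) */
}

int main(void)
{
	struct rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct rq_flags rf;

	rq_lock(&rq, &rf);
	rq.clock++;			/* e.g. update_rq_clock(rq)   */
	rq_unlock(&rq, &rf);
	printf("clock=%d pinned=%d\n", rq.clock, rf.pinned);
	return 0;
}
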
@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq)
287static void __hrtick_start(void *arg) 276static void __hrtick_start(void *arg)
288{ 277{
289 struct rq *rq = arg; 278 struct rq *rq = arg;
279 struct rq_flags rf;
290 280
291 raw_spin_lock(&rq->lock); 281 rq_lock(rq, &rf);
292 __hrtick_restart(rq); 282 __hrtick_restart(rq);
293 rq->hrtick_csd_pending = 0; 283 rq->hrtick_csd_pending = 0;
294 raw_spin_unlock(&rq->lock); 284 rq_unlock(rq, &rf);
295} 285}
296 286
297/* 287/*
@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p)
762 752
763static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 753static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
764{ 754{
765 update_rq_clock(rq); 755 if (!(flags & ENQUEUE_NOCLOCK))
756 update_rq_clock(rq);
757
766 if (!(flags & ENQUEUE_RESTORE)) 758 if (!(flags & ENQUEUE_RESTORE))
767 sched_info_queued(rq, p); 759 sched_info_queued(rq, p);
760
768 p->sched_class->enqueue_task(rq, p, flags); 761 p->sched_class->enqueue_task(rq, p, flags);
769} 762}
770 763
771static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 764static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
772{ 765{
773 update_rq_clock(rq); 766 if (!(flags & DEQUEUE_NOCLOCK))
767 update_rq_clock(rq);
768
774 if (!(flags & DEQUEUE_SAVE)) 769 if (!(flags & DEQUEUE_SAVE))
775 sched_info_dequeued(rq, p); 770 sched_info_dequeued(rq, p);
771
776 p->sched_class->dequeue_task(rq, p, flags); 772 p->sched_class->dequeue_task(rq, p, flags);
777} 773}
778 774
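
With the new NOCLOCK flags, the contract is: either enqueue_task()/dequeue_task() refreshes the runqueue clock itself, or the caller passes *_NOCLOCK because it already called update_rq_clock() once under the same lock hold and wants to avoid a redundant (and, with WARN_DOUBLE_CLOCK, warnable) update. A schematic of that caller pattern; only the flag-gating shape is taken from the diff, the rest is stand-in code:

#include <stdio.h>

#define DEQUEUE_NOCLOCK	0x01
#define ENQUEUE_NOCLOCK	0x01

static int clock_updates;

static void update_rq_clock(void) { clock_updates++; }

static void dequeue_task(int flags)
{
	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock();
	/* ... the sched class dequeue would run here ... */
}

static void enqueue_task(int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock();
	/* ... the sched class enqueue would run here ... */
}

int main(void)
{
	/* Caller refreshes the clock once, then passes NOCLOCK twice. */
	update_rq_clock();
	dequeue_task(DEQUEUE_NOCLOCK);
	enqueue_task(ENQUEUE_NOCLOCK);
	printf("clock updated %d time(s)\n", clock_updates);	/* 1 */
	return 0;
}
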
@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
946 * 942 *
947 * Returns (locked) new rq. Old rq's lock is released. 943 * Returns (locked) new rq. Old rq's lock is released.
948 */ 944 */
949static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) 945static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
946 struct task_struct *p, int new_cpu)
950{ 947{
951 lockdep_assert_held(&rq->lock); 948 lockdep_assert_held(&rq->lock);
952 949
953 p->on_rq = TASK_ON_RQ_MIGRATING; 950 p->on_rq = TASK_ON_RQ_MIGRATING;
954 dequeue_task(rq, p, 0); 951 dequeue_task(rq, p, DEQUEUE_NOCLOCK);
955 set_task_cpu(p, new_cpu); 952 set_task_cpu(p, new_cpu);
956 raw_spin_unlock(&rq->lock); 953 rq_unlock(rq, rf);
957 954
958 rq = cpu_rq(new_cpu); 955 rq = cpu_rq(new_cpu);
959 956
960 raw_spin_lock(&rq->lock); 957 rq_lock(rq, rf);
961 BUG_ON(task_cpu(p) != new_cpu); 958 BUG_ON(task_cpu(p) != new_cpu);
962 enqueue_task(rq, p, 0); 959 enqueue_task(rq, p, 0);
963 p->on_rq = TASK_ON_RQ_QUEUED; 960 p->on_rq = TASK_ON_RQ_QUEUED;
@@ -980,7 +977,8 @@ struct migration_arg {
980 * So we race with normal scheduler movements, but that's OK, as long 977 * So we race with normal scheduler movements, but that's OK, as long
981 * as the task is no longer on this CPU. 978 * as the task is no longer on this CPU.
982 */ 979 */
983static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) 980static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
981 struct task_struct *p, int dest_cpu)
984{ 982{
985 if (unlikely(!cpu_active(dest_cpu))) 983 if (unlikely(!cpu_active(dest_cpu)))
986 return rq; 984 return rq;
@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
989 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 987 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
990 return rq; 988 return rq;
991 989
992 rq = move_queued_task(rq, p, dest_cpu); 990 update_rq_clock(rq);
991 rq = move_queued_task(rq, rf, p, dest_cpu);
993 992
994 return rq; 993 return rq;
995} 994}
@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data)
1004 struct migration_arg *arg = data; 1003 struct migration_arg *arg = data;
1005 struct task_struct *p = arg->task; 1004 struct task_struct *p = arg->task;
1006 struct rq *rq = this_rq(); 1005 struct rq *rq = this_rq();
1006 struct rq_flags rf;
1007 1007
1008 /* 1008 /*
1009 * The original target CPU might have gone down and we might 1009 * The original target CPU might have gone down and we might
@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data)
1018 sched_ttwu_pending(); 1018 sched_ttwu_pending();
1019 1019
1020 raw_spin_lock(&p->pi_lock); 1020 raw_spin_lock(&p->pi_lock);
1021 raw_spin_lock(&rq->lock); 1021 rq_lock(rq, &rf);
1022 /* 1022 /*
1023 * If task_rq(p) != rq, it cannot be migrated here, because we're 1023 * If task_rq(p) != rq, it cannot be migrated here, because we're
1024 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 1024 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data)
1026 */ 1026 */
1027 if (task_rq(p) == rq) { 1027 if (task_rq(p) == rq) {
1028 if (task_on_rq_queued(p)) 1028 if (task_on_rq_queued(p))
1029 rq = __migrate_task(rq, p, arg->dest_cpu); 1029 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
1030 else 1030 else
1031 p->wake_cpu = arg->dest_cpu; 1031 p->wake_cpu = arg->dest_cpu;
1032 } 1032 }
1033 raw_spin_unlock(&rq->lock); 1033 rq_unlock(rq, &rf);
1034 raw_spin_unlock(&p->pi_lock); 1034 raw_spin_unlock(&p->pi_lock);
1035 1035
1036 local_irq_enable(); 1036 local_irq_enable();
@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1063 * holding rq->lock. 1063 * holding rq->lock.
1064 */ 1064 */
1065 lockdep_assert_held(&rq->lock); 1065 lockdep_assert_held(&rq->lock);
1066 dequeue_task(rq, p, DEQUEUE_SAVE); 1066 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
1067 } 1067 }
1068 if (running) 1068 if (running)
1069 put_prev_task(rq, p); 1069 put_prev_task(rq, p);
@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1071 p->sched_class->set_cpus_allowed(p, new_mask); 1071 p->sched_class->set_cpus_allowed(p, new_mask);
1072 1072
1073 if (queued) 1073 if (queued)
1074 enqueue_task(rq, p, ENQUEUE_RESTORE); 1074 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
1075 if (running) 1075 if (running)
1076 set_curr_task(rq, p); 1076 set_curr_task(rq, p);
1077} 1077}
@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
1150 * OK, since we're going to drop the lock immediately 1150 * OK, since we're going to drop the lock immediately
1151 * afterwards anyway. 1151 * afterwards anyway.
1152 */ 1152 */
1153 rq_unpin_lock(rq, &rf); 1153 rq = move_queued_task(rq, &rf, p, dest_cpu);
1154 rq = move_queued_task(rq, p, dest_cpu);
1155 rq_repin_lock(rq, &rf);
1156 } 1154 }
1157out: 1155out:
1158 task_rq_unlock(rq, p, &rf); 1156 task_rq_unlock(rq, p, &rf);
@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
1217{ 1215{
1218 if (task_on_rq_queued(p)) { 1216 if (task_on_rq_queued(p)) {
1219 struct rq *src_rq, *dst_rq; 1217 struct rq *src_rq, *dst_rq;
1218 struct rq_flags srf, drf;
1220 1219
1221 src_rq = task_rq(p); 1220 src_rq = task_rq(p);
1222 dst_rq = cpu_rq(cpu); 1221 dst_rq = cpu_rq(cpu);
1223 1222
1223 rq_pin_lock(src_rq, &srf);
1224 rq_pin_lock(dst_rq, &drf);
1225
1224 p->on_rq = TASK_ON_RQ_MIGRATING; 1226 p->on_rq = TASK_ON_RQ_MIGRATING;
1225 deactivate_task(src_rq, p, 0); 1227 deactivate_task(src_rq, p, 0);
1226 set_task_cpu(p, cpu); 1228 set_task_cpu(p, cpu);
1227 activate_task(dst_rq, p, 0); 1229 activate_task(dst_rq, p, 0);
1228 p->on_rq = TASK_ON_RQ_QUEUED; 1230 p->on_rq = TASK_ON_RQ_QUEUED;
1229 check_preempt_curr(dst_rq, p, 0); 1231 check_preempt_curr(dst_rq, p, 0);
1232
1233 rq_unpin_lock(dst_rq, &drf);
1234 rq_unpin_lock(src_rq, &srf);
1235
1230 } else { 1236 } else {
1231 /* 1237 /*
1232 * Task isn't running anymore; make it appear like we migrated 1238 * Task isn't running anymore; make it appear like we migrated
@@ -1680,7 +1686,7 @@ static void
1680ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, 1686ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1681 struct rq_flags *rf) 1687 struct rq_flags *rf)
1682{ 1688{
1683 int en_flags = ENQUEUE_WAKEUP; 1689 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
1684 1690
1685 lockdep_assert_held(&rq->lock); 1691 lockdep_assert_held(&rq->lock);
1686 1692
@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void)
1726 struct rq *rq = this_rq(); 1732 struct rq *rq = this_rq();
1727 struct llist_node *llist = llist_del_all(&rq->wake_list); 1733 struct llist_node *llist = llist_del_all(&rq->wake_list);
1728 struct task_struct *p; 1734 struct task_struct *p;
1729 unsigned long flags;
1730 struct rq_flags rf; 1735 struct rq_flags rf;
1731 1736
1732 if (!llist) 1737 if (!llist)
1733 return; 1738 return;
1734 1739
1735 raw_spin_lock_irqsave(&rq->lock, flags); 1740 rq_lock_irqsave(rq, &rf);
1736 rq_pin_lock(rq, &rf); 1741 update_rq_clock(rq);
1737 1742
1738 while (llist) { 1743 while (llist) {
1739 int wake_flags = 0; 1744 int wake_flags = 0;
@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void)
1747 ttwu_do_activate(rq, p, wake_flags, &rf); 1752 ttwu_do_activate(rq, p, wake_flags, &rf);
1748 } 1753 }
1749 1754
1750 rq_unpin_lock(rq, &rf); 1755 rq_unlock_irqrestore(rq, &rf);
1751 raw_spin_unlock_irqrestore(&rq->lock, flags);
1752} 1756}
1753 1757
1754void scheduler_ipi(void) 1758void scheduler_ipi(void)
@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1806void wake_up_if_idle(int cpu) 1810void wake_up_if_idle(int cpu)
1807{ 1811{
1808 struct rq *rq = cpu_rq(cpu); 1812 struct rq *rq = cpu_rq(cpu);
1809 unsigned long flags; 1813 struct rq_flags rf;
1810 1814
1811 rcu_read_lock(); 1815 rcu_read_lock();
1812 1816
@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu)
1816 if (set_nr_if_polling(rq->idle)) { 1820 if (set_nr_if_polling(rq->idle)) {
1817 trace_sched_wake_idle_without_ipi(cpu); 1821 trace_sched_wake_idle_without_ipi(cpu);
1818 } else { 1822 } else {
1819 raw_spin_lock_irqsave(&rq->lock, flags); 1823 rq_lock_irqsave(rq, &rf);
1820 if (is_idle_task(rq->curr)) 1824 if (is_idle_task(rq->curr))
1821 smp_send_reschedule(cpu); 1825 smp_send_reschedule(cpu);
1822 /* Else CPU is not idle, do nothing here: */ 1826 /* Else CPU is not idle, do nothing here: */
1823 raw_spin_unlock_irqrestore(&rq->lock, flags); 1827 rq_unlock_irqrestore(rq, &rf);
1824 } 1828 }
1825 1829
1826out: 1830out:
@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1846 } 1850 }
1847#endif 1851#endif
1848 1852
1849 raw_spin_lock(&rq->lock); 1853 rq_lock(rq, &rf);
1850 rq_pin_lock(rq, &rf); 1854 update_rq_clock(rq);
1851 ttwu_do_activate(rq, p, wake_flags, &rf); 1855 ttwu_do_activate(rq, p, wake_flags, &rf);
1852 rq_unpin_lock(rq, &rf); 1856 rq_unlock(rq, &rf);
1853 raw_spin_unlock(&rq->lock);
1854} 1857}
1855 1858
1856/* 1859/*
@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2097 * disabled avoiding further scheduler activity on it and we've 2100 * disabled avoiding further scheduler activity on it and we've
2098 * not yet picked a replacement task. 2101 * not yet picked a replacement task.
2099 */ 2102 */
2100 rq_unpin_lock(rq, rf); 2103 rq_unlock(rq, rf);
2101 raw_spin_unlock(&rq->lock);
2102 raw_spin_lock(&p->pi_lock); 2104 raw_spin_lock(&p->pi_lock);
2103 raw_spin_lock(&rq->lock); 2105 rq_relock(rq, rf);
2104 rq_repin_lock(rq, rf);
2105 } 2106 }
2106 2107
2107 if (!(p->state & TASK_NORMAL)) 2108 if (!(p->state & TASK_NORMAL))
@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2114 delayacct_blkio_end(); 2115 delayacct_blkio_end();
2115 atomic_dec(&rq->nr_iowait); 2116 atomic_dec(&rq->nr_iowait);
2116 } 2117 }
2117 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2118 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
2118 } 2119 }
2119 2120
2120 ttwu_do_wakeup(rq, p, 0, rf); 2121 ttwu_do_wakeup(rq, p, 0, rf);
@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p)
2555 update_rq_clock(rq); 2556 update_rq_clock(rq);
2556 post_init_entity_util_avg(&p->se); 2557 post_init_entity_util_avg(&p->se);
2557 2558
2558 activate_task(rq, p, 0); 2559 activate_task(rq, p, ENQUEUE_NOCLOCK);
2559 p->on_rq = TASK_ON_RQ_QUEUED; 2560 p->on_rq = TASK_ON_RQ_QUEUED;
2560 trace_sched_wakeup_new(p); 2561 trace_sched_wakeup_new(p);
2561 check_preempt_curr(rq, p, WF_FORK); 2562 check_preempt_curr(rq, p, WF_FORK);
@@ -3093,15 +3094,18 @@ void scheduler_tick(void)
3093 int cpu = smp_processor_id(); 3094 int cpu = smp_processor_id();
3094 struct rq *rq = cpu_rq(cpu); 3095 struct rq *rq = cpu_rq(cpu);
3095 struct task_struct *curr = rq->curr; 3096 struct task_struct *curr = rq->curr;
3097 struct rq_flags rf;
3096 3098
3097 sched_clock_tick(); 3099 sched_clock_tick();
3098 3100
3099 raw_spin_lock(&rq->lock); 3101 rq_lock(rq, &rf);
3102
3100 update_rq_clock(rq); 3103 update_rq_clock(rq);
3101 curr->sched_class->task_tick(rq, curr, 0); 3104 curr->sched_class->task_tick(rq, curr, 0);
3102 cpu_load_update_active(rq); 3105 cpu_load_update_active(rq);
3103 calc_global_load_tick(rq); 3106 calc_global_load_tick(rq);
3104 raw_spin_unlock(&rq->lock); 3107
3108 rq_unlock(rq, &rf);
3105 3109
3106 perf_event_task_tick(); 3110 perf_event_task_tick();
3107 3111
@@ -3378,7 +3382,7 @@ static void __sched notrace __schedule(bool preempt)
3378 hrtick_clear(rq); 3382 hrtick_clear(rq);
3379 3383
3380 local_irq_disable(); 3384 local_irq_disable();
3381 rcu_note_context_switch(); 3385 rcu_note_context_switch(preempt);
3382 3386
3383 /* 3387 /*
3384 * Make sure that signal_pending_state()->signal_pending() below 3388 * Make sure that signal_pending_state()->signal_pending() below
@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt)
3386 * done by the caller to avoid the race with signal_wake_up(). 3390 * done by the caller to avoid the race with signal_wake_up().
3387 */ 3391 */
3388 smp_mb__before_spinlock(); 3392 smp_mb__before_spinlock();
3389 raw_spin_lock(&rq->lock); 3393 rq_lock(rq, &rf);
3390 rq_pin_lock(rq, &rf);
3391 3394
3392 /* Promote REQ to ACT */ 3395 /* Promote REQ to ACT */
3393 rq->clock_update_flags <<= 1; 3396 rq->clock_update_flags <<= 1;
3397 update_rq_clock(rq);
3394 3398
3395 switch_count = &prev->nivcsw; 3399 switch_count = &prev->nivcsw;
3396 if (!preempt && prev->state) { 3400 if (!preempt && prev->state) {
3397 if (unlikely(signal_pending_state(prev->state, prev))) { 3401 if (unlikely(signal_pending_state(prev->state, prev))) {
3398 prev->state = TASK_RUNNING; 3402 prev->state = TASK_RUNNING;
3399 } else { 3403 } else {
3400 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3404 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3401 prev->on_rq = 0; 3405 prev->on_rq = 0;
3402 3406
3403 if (prev->in_iowait) { 3407 if (prev->in_iowait) {
@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt)
3421 switch_count = &prev->nvcsw; 3425 switch_count = &prev->nvcsw;
3422 } 3426 }
3423 3427
3424 if (task_on_rq_queued(prev))
3425 update_rq_clock(rq);
3426
3427 next = pick_next_task(rq, prev, &rf); 3428 next = pick_next_task(rq, prev, &rf);
3428 clear_tsk_need_resched(prev); 3429 clear_tsk_need_resched(prev);
3429 clear_preempt_need_resched(); 3430 clear_preempt_need_resched();
@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt)
3439 rq = context_switch(rq, prev, next, &rf); 3440 rq = context_switch(rq, prev, next, &rf);
3440 } else { 3441 } else {
3441 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 3442 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3442 rq_unpin_lock(rq, &rf); 3443 rq_unlock_irq(rq, &rf);
3443 raw_spin_unlock_irq(&rq->lock);
3444 } 3444 }
3445 3445
3446 balance_callback(rq); 3446 balance_callback(rq);
@@ -3502,6 +3502,31 @@ asmlinkage __visible void __sched schedule(void)
3502} 3502}
3503EXPORT_SYMBOL(schedule); 3503EXPORT_SYMBOL(schedule);
3504 3504
3505/*
3506 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
3507 * state (have scheduled out non-voluntarily) by making sure that all
3508 * tasks have either left the run queue or have gone into user space.
3509 * As idle tasks do not do either, they must not ever be preempted
3510 * (schedule out non-voluntarily).
3511 *
3512 * schedule_idle() is similar to schedule_preempt_disabled() except that it
3513 * never enables preemption because it does not call sched_submit_work().
3514 */
3515void __sched schedule_idle(void)
3516{
3517 /*
3518 * As this skips calling sched_submit_work(), which the idle task does
3519 * regardless because that function is a nop when the task is in a
3520 * TASK_RUNNING state, make sure this isn't used someplace that the
3521 * current task can be in any other state. Note, idle is always in the
3522 * TASK_RUNNING state.
3523 */
3524 WARN_ON_ONCE(current->state);
3525 do {
3526 __schedule(false);
3527 } while (need_resched());
3528}
3529
3505#ifdef CONFIG_CONTEXT_TRACKING 3530#ifdef CONFIG_CONTEXT_TRACKING
3506asmlinkage __visible void __sched schedule_user(void) 3531asmlinkage __visible void __sched schedule_user(void)
3507{ 3532{
@@ -3671,10 +3696,25 @@ EXPORT_SYMBOL(default_wake_function);
3671 3696
3672#ifdef CONFIG_RT_MUTEXES 3697#ifdef CONFIG_RT_MUTEXES
3673 3698
3699static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
3700{
3701 if (pi_task)
3702 prio = min(prio, pi_task->prio);
3703
3704 return prio;
3705}
3706
3707static inline int rt_effective_prio(struct task_struct *p, int prio)
3708{
3709 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3710
3711 return __rt_effective_prio(pi_task, prio);
3712}
3713
3674/* 3714/*
3675 * rt_mutex_setprio - set the current priority of a task 3715 * rt_mutex_setprio - set the current priority of a task
3676 * @p: task 3716 * @p: task to boost
3677 * @prio: prio value (kernel-internal form) 3717 * @pi_task: donor task
3678 * 3718 *
3679 * This function changes the 'effective' priority of a task. It does 3719 * This function changes the 'effective' priority of a task. It does
3680 * not touch ->normal_prio like __setscheduler(). 3720 * not touch ->normal_prio like __setscheduler().
@@ -3682,17 +3722,42 @@ EXPORT_SYMBOL(default_wake_function);
3682 * Used by the rt_mutex code to implement priority inheritance 3722 * Used by the rt_mutex code to implement priority inheritance
3683 * logic. Call site only calls if the priority of the task changed. 3723 * logic. Call site only calls if the priority of the task changed.
3684 */ 3724 */
3685void rt_mutex_setprio(struct task_struct *p, int prio) 3725void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
3686{ 3726{
3687 int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; 3727 int prio, oldprio, queued, running, queue_flag =
3728 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
3688 const struct sched_class *prev_class; 3729 const struct sched_class *prev_class;
3689 struct rq_flags rf; 3730 struct rq_flags rf;
3690 struct rq *rq; 3731 struct rq *rq;
3691 3732
3692 BUG_ON(prio > MAX_PRIO); 3733 /* XXX used to be waiter->prio, not waiter->task->prio */
3734 prio = __rt_effective_prio(pi_task, p->normal_prio);
3735
3736 /*
3737 * If nothing changed; bail early.
3738 */
3739 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
3740 return;
3693 3741
3694 rq = __task_rq_lock(p, &rf); 3742 rq = __task_rq_lock(p, &rf);
3695 update_rq_clock(rq); 3743 update_rq_clock(rq);
3744 /*
3745 * Set under pi_lock && rq->lock, such that the value can be used under
3746 * either lock.
3747 *
3748 * Note that it takes loads of trickiness to make this pointer cache work
3749 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
3750 * ensure a task is de-boosted (pi_task is set to NULL) before the
3751 * task is allowed to run again (and can exit). This ensures the pointer
3752 * points to a blocked task -- which guarantees the task is present.
3753 */
3754 p->pi_top_task = pi_task;
3755
3756 /*
3757 * For FIFO/RR we only need to set prio, if that matches we're done.
3758 */
3759 if (prio == p->prio && !dl_prio(prio))
3760 goto out_unlock;
3696 3761
3697 /* 3762 /*
3698 * Idle task boosting is a nono in general. There is one 3763 * Idle task boosting is a nono in general. There is one
@@ -3712,7 +3777,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3712 goto out_unlock; 3777 goto out_unlock;
3713 } 3778 }
3714 3779
3715 trace_sched_pi_setprio(p, prio); 3780 trace_sched_pi_setprio(p, pi_task);
3716 oldprio = p->prio; 3781 oldprio = p->prio;
3717 3782
3718 if (oldprio == prio) 3783 if (oldprio == prio)
@@ -3736,7 +3801,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3736 * running task 3801 * running task
3737 */ 3802 */
3738 if (dl_prio(prio)) { 3803 if (dl_prio(prio)) {
3739 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3740 if (!dl_prio(p->normal_prio) || 3804 if (!dl_prio(p->normal_prio) ||
3741 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3805 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3742 p->dl.dl_boosted = 1; 3806 p->dl.dl_boosted = 1;
@@ -3774,6 +3838,11 @@ out_unlock:
3774 balance_callback(rq); 3838 balance_callback(rq);
3775 preempt_enable(); 3839 preempt_enable();
3776} 3840}
3841#else
3842static inline int rt_effective_prio(struct task_struct *p, int prio)
3843{
3844 return prio;
3845}
3777#endif 3846#endif
3778 3847
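
The __rt_effective_prio()/rt_effective_prio() helpers added above reduce the priority-inheritance rule to one line: a lock holder runs at the minimum of its own priority and its top donor's priority (lower numbers mean higher priority), and with no donor it keeps its own. A tiny worked example with arbitrary kernel-internal-style priority values:

#include <stdio.h>

static int effective_prio(int own_prio, int top_waiter_prio)
{
	/* A negative waiter priority stands for "no donor" (pi_task == NULL). */
	if (top_waiter_prio < 0)
		return own_prio;
	return top_waiter_prio < own_prio ? top_waiter_prio : own_prio;
}

int main(void)
{
	printf("%d\n", effective_prio(120, 98));	/* boosted to 98   */
	printf("%d\n", effective_prio(98, 120));	/* stays at 98     */
	printf("%d\n", effective_prio(120, -1));	/* no donor: 120   */
	return 0;
}
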
3779void set_user_nice(struct task_struct *p, long nice) 3848void set_user_nice(struct task_struct *p, long nice)
@@ -3805,7 +3874,7 @@ void set_user_nice(struct task_struct *p, long nice)
3805 queued = task_on_rq_queued(p); 3874 queued = task_on_rq_queued(p);
3806 running = task_current(rq, p); 3875 running = task_current(rq, p);
3807 if (queued) 3876 if (queued)
3808 dequeue_task(rq, p, DEQUEUE_SAVE); 3877 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
3809 if (running) 3878 if (running)
3810 put_prev_task(rq, p); 3879 put_prev_task(rq, p);
3811 3880
@@ -3816,7 +3885,7 @@ void set_user_nice(struct task_struct *p, long nice)
3816 delta = p->prio - old_prio; 3885 delta = p->prio - old_prio;
3817 3886
3818 if (queued) { 3887 if (queued) {
3819 enqueue_task(rq, p, ENQUEUE_RESTORE); 3888 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
3820 /* 3889 /*
3821 * If the task increased its priority or is running and 3890 * If the task increased its priority or is running and
3822 * lowered its priority, then reschedule its CPU: 3891 * lowered its priority, then reschedule its CPU:
@@ -4020,10 +4089,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
4020 * Keep a potential priority boosting if called from 4089 * Keep a potential priority boosting if called from
4021 * sched_setscheduler(). 4090 * sched_setscheduler().
4022 */ 4091 */
4092 p->prio = normal_prio(p);
4023 if (keep_boost) 4093 if (keep_boost)
4024 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); 4094 p->prio = rt_effective_prio(p, p->prio);
4025 else
4026 p->prio = normal_prio(p);
4027 4095
4028 if (dl_prio(p->prio)) 4096 if (dl_prio(p->prio))
4029 p->sched_class = &dl_sched_class; 4097 p->sched_class = &dl_sched_class;
@@ -4126,7 +4194,7 @@ static int __sched_setscheduler(struct task_struct *p,
4126 const struct sched_class *prev_class; 4194 const struct sched_class *prev_class;
4127 struct rq_flags rf; 4195 struct rq_flags rf;
4128 int reset_on_fork; 4196 int reset_on_fork;
4129 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 4197 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4130 struct rq *rq; 4198 struct rq *rq;
4131 4199
4132 /* May grab non-irq protected spin_locks: */ 4200 /* May grab non-irq protected spin_locks: */
@@ -4310,7 +4378,7 @@ change:
4310 * the runqueue. This will be done when the task deboost 4378 * the runqueue. This will be done when the task deboost
4311 * itself. 4379 * itself.
4312 */ 4380 */
4313 new_effective_prio = rt_mutex_get_effective_prio(p, newprio); 4381 new_effective_prio = rt_effective_prio(p, newprio);
4314 if (new_effective_prio == oldprio) 4382 if (new_effective_prio == oldprio)
4315 queue_flags &= ~DEQUEUE_MOVE; 4383 queue_flags &= ~DEQUEUE_MOVE;
4316 } 4384 }
@@ -4923,7 +4991,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4923 */ 4991 */
4924SYSCALL_DEFINE0(sched_yield) 4992SYSCALL_DEFINE0(sched_yield)
4925{ 4993{
4926 struct rq *rq = this_rq_lock(); 4994 struct rq_flags rf;
4995 struct rq *rq;
4996
4997 local_irq_disable();
4998 rq = this_rq();
4999 rq_lock(rq, &rf);
4927 5000
4928 schedstat_inc(rq->yld_count); 5001 schedstat_inc(rq->yld_count);
4929 current->sched_class->yield_task(rq); 5002 current->sched_class->yield_task(rq);
@@ -4932,9 +5005,8 @@ SYSCALL_DEFINE0(sched_yield)
4932 * Since we are going to call schedule() anyway, there's 5005 * Since we are going to call schedule() anyway, there's
4933 * no need to preempt or enable interrupts: 5006 * no need to preempt or enable interrupts:
4934 */ 5007 */
4935 __release(rq->lock); 5008 preempt_disable();
4936 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 5009 rq_unlock(rq, &rf);
4937 do_raw_spin_unlock(&rq->lock);
4938 sched_preempt_enable_no_resched(); 5010 sched_preempt_enable_no_resched();
4939 5011
4940 schedule(); 5012 schedule();
@@ -5514,7 +5586,7 @@ void sched_setnuma(struct task_struct *p, int nid)
5514 p->numa_preferred_nid = nid; 5586 p->numa_preferred_nid = nid;
5515 5587
5516 if (queued) 5588 if (queued)
5517 enqueue_task(rq, p, ENQUEUE_RESTORE); 5589 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5518 if (running) 5590 if (running)
5519 set_curr_task(rq, p); 5591 set_curr_task(rq, p);
5520 task_rq_unlock(rq, p, &rf); 5592 task_rq_unlock(rq, p, &rf);
@@ -5579,11 +5651,11 @@ static struct task_struct fake_task = {
5579 * there's no concurrency possible, we hold the required locks anyway 5651 * there's no concurrency possible, we hold the required locks anyway
5580 * because of lock validation efforts. 5652 * because of lock validation efforts.
5581 */ 5653 */
5582static void migrate_tasks(struct rq *dead_rq) 5654static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5583{ 5655{
5584 struct rq *rq = dead_rq; 5656 struct rq *rq = dead_rq;
5585 struct task_struct *next, *stop = rq->stop; 5657 struct task_struct *next, *stop = rq->stop;
5586 struct rq_flags rf; 5658 struct rq_flags orf = *rf;
5587 int dest_cpu; 5659 int dest_cpu;
5588 5660
5589 /* 5661 /*
@@ -5602,9 +5674,7 @@ static void migrate_tasks(struct rq *dead_rq)
5602 * class method both need to have an up-to-date 5674 * class method both need to have an up-to-date
5603 * value of rq->clock[_task] 5675 * value of rq->clock[_task]
5604 */ 5676 */
5605 rq_pin_lock(rq, &rf);
5606 update_rq_clock(rq); 5677 update_rq_clock(rq);
5607 rq_unpin_lock(rq, &rf);
5608 5678
5609 for (;;) { 5679 for (;;) {
5610 /* 5680 /*
@@ -5617,8 +5687,7 @@ static void migrate_tasks(struct rq *dead_rq)
5617 /* 5687 /*
5618 * pick_next_task() assumes pinned rq->lock: 5688 * pick_next_task() assumes pinned rq->lock:
5619 */ 5689 */
5620 rq_repin_lock(rq, &rf); 5690 next = pick_next_task(rq, &fake_task, rf);
5621 next = pick_next_task(rq, &fake_task, &rf);
5622 BUG_ON(!next); 5691 BUG_ON(!next);
5623 next->sched_class->put_prev_task(rq, next); 5692 next->sched_class->put_prev_task(rq, next);
5624 5693
@@ -5631,10 +5700,9 @@ static void migrate_tasks(struct rq *dead_rq)
5631 * because !cpu_active at this point, which means load-balance 5700 * because !cpu_active at this point, which means load-balance
5632 * will not interfere. Also, stop-machine. 5701 * will not interfere. Also, stop-machine.
5633 */ 5702 */
5634 rq_unpin_lock(rq, &rf); 5703 rq_unlock(rq, rf);
5635 raw_spin_unlock(&rq->lock);
5636 raw_spin_lock(&next->pi_lock); 5704 raw_spin_lock(&next->pi_lock);
5637 raw_spin_lock(&rq->lock); 5705 rq_relock(rq, rf);
5638 5706
5639 /* 5707 /*
5640 * Since we're inside stop-machine, _nothing_ should have 5708 * Since we're inside stop-machine, _nothing_ should have
@@ -5648,12 +5716,12 @@ static void migrate_tasks(struct rq *dead_rq)
5648 5716
5649 /* Find suitable destination for @next, with force if needed. */ 5717 /* Find suitable destination for @next, with force if needed. */
5650 dest_cpu = select_fallback_rq(dead_rq->cpu, next); 5718 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5651 5719 rq = __migrate_task(rq, rf, next, dest_cpu);
5652 rq = __migrate_task(rq, next, dest_cpu);
5653 if (rq != dead_rq) { 5720 if (rq != dead_rq) {
5654 raw_spin_unlock(&rq->lock); 5721 rq_unlock(rq, rf);
5655 rq = dead_rq; 5722 rq = dead_rq;
5656 raw_spin_lock(&rq->lock); 5723 *rf = orf;
5724 rq_relock(rq, rf);
5657 } 5725 }
5658 raw_spin_unlock(&next->pi_lock); 5726 raw_spin_unlock(&next->pi_lock);
5659 } 5727 }
@@ -5732,7 +5800,7 @@ static void cpuset_cpu_active(void)
5732 * cpuset configurations. 5800 * cpuset configurations.
5733 */ 5801 */
5734 } 5802 }
5735 cpuset_update_active_cpus(true); 5803 cpuset_update_active_cpus();
5736} 5804}
5737 5805
5738static int cpuset_cpu_inactive(unsigned int cpu) 5806static int cpuset_cpu_inactive(unsigned int cpu)
@@ -5755,7 +5823,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
5755 5823
5756 if (overflow) 5824 if (overflow)
5757 return -EBUSY; 5825 return -EBUSY;
5758 cpuset_update_active_cpus(false); 5826 cpuset_update_active_cpus();
5759 } else { 5827 } else {
5760 num_cpus_frozen++; 5828 num_cpus_frozen++;
5761 partition_sched_domains(1, NULL, NULL); 5829 partition_sched_domains(1, NULL, NULL);
@@ -5766,7 +5834,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
5766int sched_cpu_activate(unsigned int cpu) 5834int sched_cpu_activate(unsigned int cpu)
5767{ 5835{
5768 struct rq *rq = cpu_rq(cpu); 5836 struct rq *rq = cpu_rq(cpu);
5769 unsigned long flags; 5837 struct rq_flags rf;
5770 5838
5771 set_cpu_active(cpu, true); 5839 set_cpu_active(cpu, true);
5772 5840
@@ -5784,12 +5852,12 @@ int sched_cpu_activate(unsigned int cpu)
5784 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the 5852 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
5785 * domains. 5853 * domains.
5786 */ 5854 */
5787 raw_spin_lock_irqsave(&rq->lock, flags); 5855 rq_lock_irqsave(rq, &rf);
5788 if (rq->rd) { 5856 if (rq->rd) {
5789 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5857 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5790 set_rq_online(rq); 5858 set_rq_online(rq);
5791 } 5859 }
5792 raw_spin_unlock_irqrestore(&rq->lock, flags); 5860 rq_unlock_irqrestore(rq, &rf);
5793 5861
5794 update_max_interval(); 5862 update_max_interval();
5795 5863
@@ -5847,18 +5915,20 @@ int sched_cpu_starting(unsigned int cpu)
5847int sched_cpu_dying(unsigned int cpu) 5915int sched_cpu_dying(unsigned int cpu)
5848{ 5916{
5849 struct rq *rq = cpu_rq(cpu); 5917 struct rq *rq = cpu_rq(cpu);
5850 unsigned long flags; 5918 struct rq_flags rf;
5851 5919
5852 /* Handle pending wakeups and then migrate everything off */ 5920 /* Handle pending wakeups and then migrate everything off */
5853 sched_ttwu_pending(); 5921 sched_ttwu_pending();
5854 raw_spin_lock_irqsave(&rq->lock, flags); 5922
5923 rq_lock_irqsave(rq, &rf);
5855 if (rq->rd) { 5924 if (rq->rd) {
5856 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5925 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5857 set_rq_offline(rq); 5926 set_rq_offline(rq);
5858 } 5927 }
5859 migrate_tasks(rq); 5928 migrate_tasks(rq, &rf);
5860 BUG_ON(rq->nr_running != 1); 5929 BUG_ON(rq->nr_running != 1);
5861 raw_spin_unlock_irqrestore(&rq->lock, flags); 5930 rq_unlock_irqrestore(rq, &rf);
5931
5862 calc_load_migrate(rq); 5932 calc_load_migrate(rq);
5863 update_max_interval(); 5933 update_max_interval();
5864 nohz_balance_exit_idle(cpu); 5934 nohz_balance_exit_idle(cpu);
@@ -6412,7 +6482,8 @@ static void sched_change_group(struct task_struct *tsk, int type)
6412 */ 6482 */
6413void sched_move_task(struct task_struct *tsk) 6483void sched_move_task(struct task_struct *tsk)
6414{ 6484{
6415 int queued, running; 6485 int queued, running, queue_flags =
6486 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6416 struct rq_flags rf; 6487 struct rq_flags rf;
6417 struct rq *rq; 6488 struct rq *rq;
6418 6489
@@ -6423,14 +6494,14 @@ void sched_move_task(struct task_struct *tsk)
6423 queued = task_on_rq_queued(tsk); 6494 queued = task_on_rq_queued(tsk);
6424 6495
6425 if (queued) 6496 if (queued)
6426 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); 6497 dequeue_task(rq, tsk, queue_flags);
6427 if (running) 6498 if (running)
6428 put_prev_task(rq, tsk); 6499 put_prev_task(rq, tsk);
6429 6500
6430 sched_change_group(tsk, TASK_MOVE_GROUP); 6501 sched_change_group(tsk, TASK_MOVE_GROUP);
6431 6502
6432 if (queued) 6503 if (queued)
6433 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); 6504 enqueue_task(rq, tsk, queue_flags);
6434 if (running) 6505 if (running)
6435 set_curr_task(rq, tsk); 6506 set_curr_task(rq, tsk);
6436 6507
@@ -7008,14 +7079,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7008 for_each_online_cpu(i) { 7079 for_each_online_cpu(i) {
7009 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7080 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7010 struct rq *rq = cfs_rq->rq; 7081 struct rq *rq = cfs_rq->rq;
7082 struct rq_flags rf;
7011 7083
7012 raw_spin_lock_irq(&rq->lock); 7084 rq_lock_irq(rq, &rf);
7013 cfs_rq->runtime_enabled = runtime_enabled; 7085 cfs_rq->runtime_enabled = runtime_enabled;
7014 cfs_rq->runtime_remaining = 0; 7086 cfs_rq->runtime_remaining = 0;
7015 7087
7016 if (cfs_rq->throttled) 7088 if (cfs_rq->throttled)
7017 unthrottle_cfs_rq(cfs_rq); 7089 unthrottle_cfs_rq(cfs_rq);
7018 raw_spin_unlock_irq(&rq->lock); 7090 rq_unlock_irq(rq, &rf);
7019 } 7091 }
7020 if (runtime_was_enabled && !runtime_enabled) 7092 if (runtime_was_enabled && !runtime_enabled)
7021 cfs_bandwidth_usage_dec(); 7093 cfs_bandwidth_usage_dec();
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 54c577578da6..622eed1b7658 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -61,6 +61,11 @@ struct sugov_cpu {
61 unsigned long util; 61 unsigned long util;
62 unsigned long max; 62 unsigned long max;
63 unsigned int flags; 63 unsigned int flags;
64
65 /* The field below is for single-CPU policies only. */
66#ifdef CONFIG_NO_HZ_COMMON
67 unsigned long saved_idle_calls;
68#endif
64}; 69};
65 70
66static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); 71static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -93,22 +98,23 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
93{ 98{
94 struct cpufreq_policy *policy = sg_policy->policy; 99 struct cpufreq_policy *policy = sg_policy->policy;
95 100
101 if (sg_policy->next_freq == next_freq)
102 return;
103
104 if (sg_policy->next_freq > next_freq)
105 next_freq = (sg_policy->next_freq + next_freq) >> 1;
106
107 sg_policy->next_freq = next_freq;
96 sg_policy->last_freq_update_time = time; 108 sg_policy->last_freq_update_time = time;
97 109
98 if (policy->fast_switch_enabled) { 110 if (policy->fast_switch_enabled) {
99 if (sg_policy->next_freq == next_freq) {
100 trace_cpu_frequency(policy->cur, smp_processor_id());
101 return;
102 }
103 sg_policy->next_freq = next_freq;
104 next_freq = cpufreq_driver_fast_switch(policy, next_freq); 111 next_freq = cpufreq_driver_fast_switch(policy, next_freq);
105 if (next_freq == CPUFREQ_ENTRY_INVALID) 112 if (next_freq == CPUFREQ_ENTRY_INVALID)
106 return; 113 return;
107 114
108 policy->cur = next_freq; 115 policy->cur = next_freq;
109 trace_cpu_frequency(next_freq, smp_processor_id()); 116 trace_cpu_frequency(next_freq, smp_processor_id());
110 } else if (sg_policy->next_freq != next_freq) { 117 } else {
111 sg_policy->next_freq = next_freq;
112 sg_policy->work_in_progress = true; 118 sg_policy->work_in_progress = true;
113 irq_work_queue(&sg_policy->irq_work); 119 irq_work_queue(&sg_policy->irq_work);
114 } 120 }
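
The rewritten sugov_update_commit() above returns early when the requested frequency equals the cached next_freq and, when the request is lower, only goes half-way down. A small stand-alone sketch of that ramp-down behaviour (the kHz values are made up):

#include <stdio.h>

static unsigned int cached_freq = 2000000;   /* kHz, hypothetical */

static unsigned int commit(unsigned int next_freq)
{
    if (cached_freq == next_freq)
        return cached_freq;                  /* nothing to do */

    if (cached_freq > next_freq)             /* reducing: only go half-way */
        next_freq = (cached_freq + next_freq) >> 1;

    cached_freq = next_freq;
    return cached_freq;
}

int main(void)
{
    unsigned int target = 800000;

    for (int i = 0; i < 6; i++)
        printf("step %d -> %u kHz\n", i, commit(target));
    /* prints 1400000, 1100000, 950000, 875000, 837500, 818750 */
    return 0;
}

Each repeated low request halves the remaining gap, so one noisy sample can no longer drop the frequency all the way to the target in a single step.
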
@@ -192,6 +198,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
192 sg_cpu->iowait_boost >>= 1; 198 sg_cpu->iowait_boost >>= 1;
193} 199}
194 200
201#ifdef CONFIG_NO_HZ_COMMON
202static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
203{
204 unsigned long idle_calls = tick_nohz_get_idle_calls();
205 bool ret = idle_calls == sg_cpu->saved_idle_calls;
206
207 sg_cpu->saved_idle_calls = idle_calls;
208 return ret;
209}
210#else
211static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
212#endif /* CONFIG_NO_HZ_COMMON */
213
195static void sugov_update_single(struct update_util_data *hook, u64 time, 214static void sugov_update_single(struct update_util_data *hook, u64 time,
196 unsigned int flags) 215 unsigned int flags)
197{ 216{
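
sugov_cpu_is_busy() above decides "busy" purely from whether the CPU's idle-loop entry count moved since the previous sample. A user-space model of the same comparison (the counter values are invented; the kernel obtains them via tick_nohz_get_idle_calls()):

#include <stdio.h>
#include <stdbool.h>

static unsigned long saved_idle_calls;

static bool cpu_is_busy(unsigned long idle_calls)
{
    bool busy = (idle_calls == saved_idle_calls);  /* no new idle entries -> busy */

    saved_idle_calls = idle_calls;                 /* remember for the next sample */
    return busy;
}

int main(void)
{
    unsigned long samples[] = { 10, 10, 11, 11, 11, 12 };

    for (int i = 0; i < 6; i++)
        printf("sample %lu -> %s\n", samples[i],
               cpu_is_busy(samples[i]) ? "busy" : "was idle");
    return 0;
}

In the hunk that follows, a busy CPU simply refuses to pick a frequency below the cached next_freq.
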
@@ -200,6 +219,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
200 struct cpufreq_policy *policy = sg_policy->policy; 219 struct cpufreq_policy *policy = sg_policy->policy;
201 unsigned long util, max; 220 unsigned long util, max;
202 unsigned int next_f; 221 unsigned int next_f;
222 bool busy;
203 223
204 sugov_set_iowait_boost(sg_cpu, time, flags); 224 sugov_set_iowait_boost(sg_cpu, time, flags);
205 sg_cpu->last_update = time; 225 sg_cpu->last_update = time;
@@ -207,40 +227,36 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
207 if (!sugov_should_update_freq(sg_policy, time)) 227 if (!sugov_should_update_freq(sg_policy, time))
208 return; 228 return;
209 229
230 busy = sugov_cpu_is_busy(sg_cpu);
231
210 if (flags & SCHED_CPUFREQ_RT_DL) { 232 if (flags & SCHED_CPUFREQ_RT_DL) {
211 next_f = policy->cpuinfo.max_freq; 233 next_f = policy->cpuinfo.max_freq;
212 } else { 234 } else {
213 sugov_get_util(&util, &max); 235 sugov_get_util(&util, &max);
214 sugov_iowait_boost(sg_cpu, &util, &max); 236 sugov_iowait_boost(sg_cpu, &util, &max);
215 next_f = get_next_freq(sg_policy, util, max); 237 next_f = get_next_freq(sg_policy, util, max);
238 /*
239 * Do not reduce the frequency if the CPU has not been idle
240 * recently, as the reduction is likely to be premature then.
241 */
242 if (busy && next_f < sg_policy->next_freq)
243 next_f = sg_policy->next_freq;
216 } 244 }
217 sugov_update_commit(sg_policy, time, next_f); 245 sugov_update_commit(sg_policy, time, next_f);
218} 246}
219 247
220static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, 248static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
221 unsigned long util, unsigned long max,
222 unsigned int flags)
223{ 249{
224 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 250 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
225 struct cpufreq_policy *policy = sg_policy->policy; 251 struct cpufreq_policy *policy = sg_policy->policy;
226 unsigned int max_f = policy->cpuinfo.max_freq; 252 unsigned long util = 0, max = 1;
227 u64 last_freq_update_time = sg_policy->last_freq_update_time;
228 unsigned int j; 253 unsigned int j;
229 254
230 if (flags & SCHED_CPUFREQ_RT_DL)
231 return max_f;
232
233 sugov_iowait_boost(sg_cpu, &util, &max);
234
235 for_each_cpu(j, policy->cpus) { 255 for_each_cpu(j, policy->cpus) {
236 struct sugov_cpu *j_sg_cpu; 256 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
237 unsigned long j_util, j_max; 257 unsigned long j_util, j_max;
238 s64 delta_ns; 258 s64 delta_ns;
239 259
240 if (j == smp_processor_id())
241 continue;
242
243 j_sg_cpu = &per_cpu(sugov_cpu, j);
244 /* 260 /*
245 * If the CPU utilization was last updated before the previous 261 * If the CPU utilization was last updated before the previous
246 * frequency update and the time elapsed between the last update 262 * frequency update and the time elapsed between the last update
@@ -248,13 +264,13 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
248 * enough, don't take the CPU into account as it probably is 264 * enough, don't take the CPU into account as it probably is
249 * idle now (and clear iowait_boost for it). 265 * idle now (and clear iowait_boost for it).
250 */ 266 */
251 delta_ns = last_freq_update_time - j_sg_cpu->last_update; 267 delta_ns = time - j_sg_cpu->last_update;
252 if (delta_ns > TICK_NSEC) { 268 if (delta_ns > TICK_NSEC) {
253 j_sg_cpu->iowait_boost = 0; 269 j_sg_cpu->iowait_boost = 0;
254 continue; 270 continue;
255 } 271 }
256 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) 272 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
257 return max_f; 273 return policy->cpuinfo.max_freq;
258 274
259 j_util = j_sg_cpu->util; 275 j_util = j_sg_cpu->util;
260 j_max = j_sg_cpu->max; 276 j_max = j_sg_cpu->max;
@@ -289,7 +305,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
289 sg_cpu->last_update = time; 305 sg_cpu->last_update = time;
290 306
291 if (sugov_should_update_freq(sg_policy, time)) { 307 if (sugov_should_update_freq(sg_policy, time)) {
292 next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); 308 if (flags & SCHED_CPUFREQ_RT_DL)
309 next_f = sg_policy->policy->cpuinfo.max_freq;
310 else
311 next_f = sugov_next_freq_shared(sg_cpu, time);
312
293 sugov_update_commit(sg_policy, time, next_f); 313 sugov_update_commit(sg_policy, time, next_f);
294 } 314 }
295 315
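
The reworked shared-policy path above walks every CPU of the policy, drops samples older than a tick, returns the maximum frequency as soon as one CPU carries RT/DL load, and otherwise keeps the CPU whose utilization is largest relative to its capacity. A rough stand-alone sketch under those assumptions; the final frequency formula here is a plain proportion rather than the kernel's get_next_freq(), and all constants are made up.

#include <stdio.h>

#define TICK_NS    4000000ULL    /* 250 Hz tick, for the sketch */
#define MAX_FREQ   2000000U      /* kHz */
#define FLAG_RT_DL 0x1

struct cpu_sample {
    unsigned long util, max;             /* last reported utilization and capacity */
    unsigned long long last_update;
    unsigned int flags;
};

static unsigned int next_freq_shared(struct cpu_sample *s, int nr,
                                     unsigned long long now)
{
    unsigned long util = 0, max = 1;

    for (int j = 0; j < nr; j++) {
        if (now - s[j].last_update > TICK_NS)
            continue;                    /* stale: the CPU is probably idle */
        if (s[j].flags & FLAG_RT_DL)
            return MAX_FREQ;             /* RT/DL always gets the maximum */
        if (s[j].util * max > util * s[j].max) {  /* compare util/max ratios */
            util = s[j].util;
            max = s[j].max;
        }
    }
    return (unsigned int)(MAX_FREQ * util / max); /* proportional stand-in */
}

int main(void)
{
    struct cpu_sample s[] = {
        { 300, 1024, 8000000ULL, 0 },    /* stale: older than a tick */
        { 512, 1024, 12800000ULL, 0 },   /* 50% busy */
        { 256, 1024, 12900000ULL, 0 },   /* 25% busy */
    };

    printf("next freq: %u kHz\n", next_freq_shared(s, 3, 13000000ULL));
    return 0;
}
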
@@ -473,7 +493,6 @@ static int sugov_init(struct cpufreq_policy *policy)
473{ 493{
474 struct sugov_policy *sg_policy; 494 struct sugov_policy *sg_policy;
475 struct sugov_tunables *tunables; 495 struct sugov_tunables *tunables;
476 unsigned int lat;
477 int ret = 0; 496 int ret = 0;
478 497
479 /* State should be equivalent to EXIT */ 498 /* State should be equivalent to EXIT */
@@ -512,10 +531,16 @@ static int sugov_init(struct cpufreq_policy *policy)
512 goto stop_kthread; 531 goto stop_kthread;
513 } 532 }
514 533
515 tunables->rate_limit_us = LATENCY_MULTIPLIER; 534 if (policy->transition_delay_us) {
516 lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; 535 tunables->rate_limit_us = policy->transition_delay_us;
517 if (lat) 536 } else {
518 tunables->rate_limit_us *= lat; 537 unsigned int lat;
538
539 tunables->rate_limit_us = LATENCY_MULTIPLIER;
540 lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
541 if (lat)
542 tunables->rate_limit_us *= lat;
543 }
519 544
520 policy->governor_data = sg_policy; 545 policy->governor_data = sg_policy;
521 sg_policy->tunables = tunables; 546 sg_policy->tunables = tunables;
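
The sugov_init() hunk above now prefers a driver-supplied transition_delay_us and only falls back to LATENCY_MULTIPLIER times the transition latency in microseconds. A tiny model of that selection, using 1000 for LATENCY_MULTIPLIER (an assumption for the sketch):

#include <stdio.h>

#define LATENCY_MULTIPLIER 1000
#define NSEC_PER_USEC      1000

static unsigned int pick_rate_limit_us(unsigned int transition_delay_us,
                                       unsigned int transition_latency_ns)
{
    if (transition_delay_us)
        return transition_delay_us;          /* driver gave an explicit delay */

    unsigned int lat = transition_latency_ns / NSEC_PER_USEC;
    return lat ? LATENCY_MULTIPLIER * lat : LATENCY_MULTIPLIER;
}

int main(void)
{
    printf("%u\n", pick_rate_limit_us(500, 0));     /* driver says 500 us */
    printf("%u\n", pick_rate_limit_us(0, 20000));   /* 20 us latency -> 20000 */
    printf("%u\n", pick_rate_limit_us(0, 0));       /* fallback: 1000 */
    return 0;
}
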
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b46c8..aea3135c5d90 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
34 sched_clock_irqtime = 0; 34 sched_clock_irqtime = 0;
35} 35}
36 36
37static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
38 enum cpu_usage_stat idx)
39{
40 u64 *cpustat = kcpustat_this_cpu->cpustat;
41
42 u64_stats_update_begin(&irqtime->sync);
43 cpustat[idx] += delta;
44 irqtime->total += delta;
45 irqtime->tick_delta += delta;
46 u64_stats_update_end(&irqtime->sync);
47}
48
37/* 49/*
38 * Called before incrementing preempt_count on {soft,}irq_enter 50 * Called before incrementing preempt_count on {soft,}irq_enter
39 * and before decrementing preempt_count on {soft,}irq_exit. 51 * and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
41void irqtime_account_irq(struct task_struct *curr) 53void irqtime_account_irq(struct task_struct *curr)
42{ 54{
43 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 55 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
44 u64 *cpustat = kcpustat_this_cpu->cpustat;
45 s64 delta; 56 s64 delta;
46 int cpu; 57 int cpu;
47 58
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
52 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; 63 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
53 irqtime->irq_start_time += delta; 64 irqtime->irq_start_time += delta;
54 65
55 u64_stats_update_begin(&irqtime->sync);
56 /* 66 /*
57 * We do not account for softirq time from ksoftirqd here. 67 * We do not account for softirq time from ksoftirqd here.
58 * We want to continue accounting softirq time to ksoftirqd thread 68 * We want to continue accounting softirq time to ksoftirqd thread
59 * in that case, so as not to confuse scheduler with a special task 69 * in that case, so as not to confuse scheduler with a special task
60 * that do not consume any time, but still wants to run. 70 * that do not consume any time, but still wants to run.
61 */ 71 */
62 if (hardirq_count()) { 72 if (hardirq_count())
63 cpustat[CPUTIME_IRQ] += delta; 73 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
64 irqtime->tick_delta += delta; 74 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
65 } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { 75 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
66 cpustat[CPUTIME_SOFTIRQ] += delta;
67 irqtime->tick_delta += delta;
68 }
69
70 u64_stats_update_end(&irqtime->sync);
71} 76}
72EXPORT_SYMBOL_GPL(irqtime_account_irq); 77EXPORT_SYMBOL_GPL(irqtime_account_irq);
73 78
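
The cputime.c change above is a pure refactor: the duplicated "add delta to the cpustat bucket, the running total and the tick delta under the stats sequence lock" block moves into irqtime_account_delta(). A user-space analogue with a plain struct (no seqcount, names invented):

#include <stdio.h>

enum usage_stat { STAT_IRQ, STAT_SOFTIRQ, STAT_NR };

struct irqtime_model {
    unsigned long long cpustat[STAT_NR];
    unsigned long long total;
    unsigned long long tick_delta;
};

static void account_delta(struct irqtime_model *it, unsigned long long delta,
                          enum usage_stat idx)
{
    /* one place updates all three counters consistently */
    it->cpustat[idx] += delta;
    it->total       += delta;
    it->tick_delta  += delta;
}

int main(void)
{
    struct irqtime_model it = { { 0 }, 0, 0 };

    account_delta(&it, 1200, STAT_IRQ);       /* hardirq time */
    account_delta(&it, 300, STAT_SOFTIRQ);    /* softirq time (not ksoftirqd) */
    printf("irq=%llu softirq=%llu total=%llu\n",
           it.cpustat[STAT_IRQ], it.cpustat[STAT_SOFTIRQ], it.total);
    return 0;
}
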
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dea138964b91..d71109321841 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
717} 717}
718 718
719#ifdef CONFIG_SMP 719#ifdef CONFIG_SMP
720
721#include "sched-pelt.h"
722
720static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); 723static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
721static unsigned long task_h_load(struct task_struct *p); 724static unsigned long task_h_load(struct task_struct *p);
722 725
723/*
724 * We choose a half-life close to 1 scheduling period.
725 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
726 * dependent on this value.
727 */
728#define LOAD_AVG_PERIOD 32
729#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
730#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
731
732/* Give new sched_entity start runnable values to heavy its load in infant time */ 726/* Give new sched_entity start runnable values to heavy its load in infant time */
733void init_entity_runnable_average(struct sched_entity *se) 727void init_entity_runnable_average(struct sched_entity *se)
734{ 728{
@@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se)
2733#endif /* CONFIG_FAIR_GROUP_SCHED */ 2727#endif /* CONFIG_FAIR_GROUP_SCHED */
2734 2728
2735#ifdef CONFIG_SMP 2729#ifdef CONFIG_SMP
2736/* Precomputed fixed inverse multiplies for multiplication by y^n */
2737static const u32 runnable_avg_yN_inv[] = {
2738 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2739 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2740 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2741 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2742 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2743 0x85aac367, 0x82cd8698,
2744};
2745
2746/*
2747 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2748 * over-estimates when re-combining.
2749 */
2750static const u32 runnable_avg_yN_sum[] = {
2751 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2752 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2753 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2754};
2755
2756/*
2757 * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
2758 * lower integers. See Documentation/scheduler/sched-avg.txt how these
2759 * were generated:
2760 */
2761static const u32 __accumulated_sum_N32[] = {
2762 0, 23371, 35056, 40899, 43820, 45281,
2763 46011, 46376, 46559, 46650, 46696, 46719,
2764};
2765
2766/* 2730/*
2767 * Approximate: 2731 * Approximate:
2768 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) 2732 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2769 */ 2733 */
2770static __always_inline u64 decay_load(u64 val, u64 n) 2734static u64 decay_load(u64 val, u64 n)
2771{ 2735{
2772 unsigned int local_n; 2736 unsigned int local_n;
2773 2737
2774 if (!n) 2738 if (unlikely(n > LOAD_AVG_PERIOD * 63))
2775 return val;
2776 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2777 return 0; 2739 return 0;
2778 2740
2779 /* after bounds checking we can collapse to 32-bit */ 2741 /* after bounds checking we can collapse to 32-bit */
@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2795 return val; 2757 return val;
2796} 2758}
2797 2759
2760static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
2761{
2762 u32 c1, c2, c3 = d3; /* y^0 == 1 */
2763
2764 /*
2765 * c1 = d1 y^p
2766 */
2767 c1 = decay_load((u64)d1, periods);
2768
2769 /*
2770 * p-1
2771 * c2 = 1024 \Sum y^n
2772 * n=1
2773 *
2774 * inf inf
2775 * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
2776 * n=0 n=p
2777 */
2778 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
2779
2780 return c1 + c2 + c3;
2781}
2782
2783#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2784
2798/* 2785/*
2799 * For updates fully spanning n periods, the contribution to runnable 2786 * Accumulate the three separate parts of the sum; d1 the remainder
2800 * average will be: \Sum 1024*y^n 2787 * of the last (incomplete) period, d2 the span of full periods and d3
2788 * the remainder of the (incomplete) current period.
2789 *
2790 * d1 d2 d3
2791 * ^ ^ ^
2792 * | | |
2793 * |<->|<----------------->|<--->|
2794 * ... |---x---|------| ... |------|-----x (now)
2795 *
2796 * p-1
2797 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
2798 * n=1
2801 * 2799 *
2802 * We can compute this reasonably efficiently by combining: 2800 * = u y^p + (Step 1)
2803 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} 2801 *
2802 * p-1
2803 * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
2804 * n=1
2804 */ 2805 */
2805static u32 __compute_runnable_contrib(u64 n) 2806static __always_inline u32
2807accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
2808 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2806{ 2809{
2807 u32 contrib = 0; 2810 unsigned long scale_freq, scale_cpu;
2811 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
2812 u64 periods;
2808 2813
2809 if (likely(n <= LOAD_AVG_PERIOD)) 2814 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2810 return runnable_avg_yN_sum[n]; 2815 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2811 else if (unlikely(n >= LOAD_AVG_MAX_N))
2812 return LOAD_AVG_MAX;
2813 2816
2814 /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ 2817 delta += sa->period_contrib;
2815 contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; 2818 periods = delta / 1024; /* A period is 1024us (~1ms) */
2816 n %= LOAD_AVG_PERIOD;
2817 contrib = decay_load(contrib, n);
2818 return contrib + runnable_avg_yN_sum[n];
2819}
2820 2819
2821#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) 2820 /*
2821 * Step 1: decay old *_sum if we crossed period boundaries.
2822 */
2823 if (periods) {
2824 sa->load_sum = decay_load(sa->load_sum, periods);
2825 if (cfs_rq) {
2826 cfs_rq->runnable_load_sum =
2827 decay_load(cfs_rq->runnable_load_sum, periods);
2828 }
2829 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
2830
2831 /*
2832 * Step 2
2833 */
2834 delta %= 1024;
2835 contrib = __accumulate_pelt_segments(periods,
2836 1024 - sa->period_contrib, delta);
2837 }
2838 sa->period_contrib = delta;
2839
2840 contrib = cap_scale(contrib, scale_freq);
2841 if (weight) {
2842 sa->load_sum += weight * contrib;
2843 if (cfs_rq)
2844 cfs_rq->runnable_load_sum += weight * contrib;
2845 }
2846 if (running)
2847 sa->util_sum += contrib * scale_cpu;
2848
2849 return periods;
2850}
2822 2851
2823/* 2852/*
2824 * We can represent the historical contribution to runnable average as the 2853 * We can represent the historical contribution to runnable average as the
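
The new accumulate_sum()/__accumulate_pelt_segments() above split the elapsed time into d1 (what was left of the old partially-filled period), p-1 whole 1024 us periods, and d3 (the start of the current period), and evaluate the middle geometric series in closed form as LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, p) - 1024. The stand-alone check below reuses the runnable_avg_yN_inv[] table from the new sched-pelt.h further down and compares the closed form against a brute-force floating-point sum; the kernel's mul_u64_u32_shr() is replaced by a plain 64-bit multiply, which is safe at these magnitudes. Build with -lm.

#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX    47742

/* runnable_avg_yN_inv[] from sched-pelt.h: floor(y^n * 2^32), y^32 == 1/2 */
static const uint32_t runnable_avg_yN_inv[] = {
    0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
    0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
    0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
    0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
    0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
    0x85aac367, 0x82cd8698,
};

/* val * y^n, as in the decay_load() hunk above */
static uint64_t decay_load(uint64_t val, uint64_t n)
{
    if (n > LOAD_AVG_PERIOD * 63)
        return 0;
    val >>= n / LOAD_AVG_PERIOD;              /* whole half-lives */
    n %= LOAD_AVG_PERIOD;
    return (val * runnable_avg_yN_inv[n]) >> 32;
}

/* c1 + c2 + c3, mirroring __accumulate_pelt_segments() above */
static uint32_t accumulate_pelt_segments(uint64_t periods, uint32_t d1, uint32_t d3)
{
    uint32_t c1 = decay_load(d1, periods);
    uint32_t c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

    return c1 + c2 + d3;
}

int main(void)
{
    uint64_t periods = 4;
    uint32_t d1 = 300, d3 = 200;              /* microseconds in the boundary periods */
    double y = pow(0.5, 1.0 / LOAD_AVG_PERIOD);

    /* brute force: d1*y^p + 1024*(y + y^2 + ... + y^(p-1)) + d3 */
    double exact = d1 * pow(y, periods) + d3;
    for (uint64_t n = 1; n < periods; n++)
        exact += 1024.0 * pow(y, n);

    printf("closed form: %u\n", accumulate_pelt_segments(periods, d1, d3));
    printf("brute force: %.1f\n", exact);
    return 0;
}

The two results agree up to the fixed-point rounding baked into the table and into LOAD_AVG_MAX.
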
@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n)
2849 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 2878 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2850 */ 2879 */
2851static __always_inline int 2880static __always_inline int
2852__update_load_avg(u64 now, int cpu, struct sched_avg *sa, 2881___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2853 unsigned long weight, int running, struct cfs_rq *cfs_rq) 2882 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2854{ 2883{
2855 u64 delta, scaled_delta, periods; 2884 u64 delta;
2856 u32 contrib;
2857 unsigned int delta_w, scaled_delta_w, decayed = 0;
2858 unsigned long scale_freq, scale_cpu;
2859 2885
2860 delta = now - sa->last_update_time; 2886 delta = now - sa->last_update_time;
2861 /* 2887 /*
@@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2874 delta >>= 10; 2900 delta >>= 10;
2875 if (!delta) 2901 if (!delta)
2876 return 0; 2902 return 0;
2877 sa->last_update_time = now;
2878
2879 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2880 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2881
2882 /* delta_w is the amount already accumulated against our next period */
2883 delta_w = sa->period_contrib;
2884 if (delta + delta_w >= 1024) {
2885 decayed = 1;
2886 2903
2887 /* how much left for next period will start over, we don't know yet */ 2904 sa->last_update_time += delta << 10;
2888 sa->period_contrib = 0;
2889 2905
2890 /* 2906 /*
2891 * Now that we know we're crossing a period boundary, figure 2907 * Now we know we crossed measurement unit boundaries. The *_avg
2892 * out how much from delta we need to complete the current 2908 * accrues by two steps:
2893 * period and accrue it. 2909 *
2894 */ 2910 * Step 1: accumulate *_sum since last_update_time. If we haven't
2895 delta_w = 1024 - delta_w; 2911 * crossed period boundaries, finish.
2896 scaled_delta_w = cap_scale(delta_w, scale_freq); 2912 */
2897 if (weight) { 2913 if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
2898 sa->load_sum += weight * scaled_delta_w; 2914 return 0;
2899 if (cfs_rq) {
2900 cfs_rq->runnable_load_sum +=
2901 weight * scaled_delta_w;
2902 }
2903 }
2904 if (running)
2905 sa->util_sum += scaled_delta_w * scale_cpu;
2906
2907 delta -= delta_w;
2908
2909 /* Figure out how many additional periods this update spans */
2910 periods = delta / 1024;
2911 delta %= 1024;
2912 2915
2913 sa->load_sum = decay_load(sa->load_sum, periods + 1); 2916 /*
2914 if (cfs_rq) { 2917 * Step 2: update *_avg.
2915 cfs_rq->runnable_load_sum = 2918 */
2916 decay_load(cfs_rq->runnable_load_sum, periods + 1); 2919 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2917 } 2920 if (cfs_rq) {
2918 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); 2921 cfs_rq->runnable_load_avg =
2919 2922 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2920 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2921 contrib = __compute_runnable_contrib(periods);
2922 contrib = cap_scale(contrib, scale_freq);
2923 if (weight) {
2924 sa->load_sum += weight * contrib;
2925 if (cfs_rq)
2926 cfs_rq->runnable_load_sum += weight * contrib;
2927 }
2928 if (running)
2929 sa->util_sum += contrib * scale_cpu;
2930 } 2923 }
2924 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2931 2925
2932 /* Remainder of delta accrued against u_0` */ 2926 return 1;
2933 scaled_delta = cap_scale(delta, scale_freq); 2927}
2934 if (weight) {
2935 sa->load_sum += weight * scaled_delta;
2936 if (cfs_rq)
2937 cfs_rq->runnable_load_sum += weight * scaled_delta;
2938 }
2939 if (running)
2940 sa->util_sum += scaled_delta * scale_cpu;
2941 2928
2942 sa->period_contrib += delta; 2929static int
2930__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
2931{
2932 return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
2933}
2943 2934
2944 if (decayed) { 2935static int
2945 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); 2936__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
2946 if (cfs_rq) { 2937{
2947 cfs_rq->runnable_load_avg = 2938 return ___update_load_avg(now, cpu, &se->avg,
2948 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); 2939 se->on_rq * scale_load_down(se->load.weight),
2949 } 2940 cfs_rq->curr == se, NULL);
2950 sa->util_avg = sa->util_sum / LOAD_AVG_MAX; 2941}
2951 }
2952 2942
2953 return decayed; 2943static int
2944__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
2945{
2946 return ___update_load_avg(now, cpu, &cfs_rq->avg,
2947 scale_load_down(cfs_rq->load.weight),
2948 cfs_rq->curr != NULL, cfs_rq);
2954} 2949}
2955 2950
2956/* 2951/*
@@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3014void set_task_rq_fair(struct sched_entity *se, 3009void set_task_rq_fair(struct sched_entity *se,
3015 struct cfs_rq *prev, struct cfs_rq *next) 3010 struct cfs_rq *prev, struct cfs_rq *next)
3016{ 3011{
3012 u64 p_last_update_time;
3013 u64 n_last_update_time;
3014
3017 if (!sched_feat(ATTACH_AGE_LOAD)) 3015 if (!sched_feat(ATTACH_AGE_LOAD))
3018 return; 3016 return;
3019 3017
@@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se,
3024 * time. This will result in the wakee task is less decayed, but giving 3022 * time. This will result in the wakee task is less decayed, but giving
3025 * the wakee more load sounds not bad. 3023 * the wakee more load sounds not bad.
3026 */ 3024 */
3027 if (se->avg.last_update_time && prev) { 3025 if (!(se->avg.last_update_time && prev))
3028 u64 p_last_update_time; 3026 return;
3029 u64 n_last_update_time;
3030 3027
3031#ifndef CONFIG_64BIT 3028#ifndef CONFIG_64BIT
3029 {
3032 u64 p_last_update_time_copy; 3030 u64 p_last_update_time_copy;
3033 u64 n_last_update_time_copy; 3031 u64 n_last_update_time_copy;
3034 3032
@@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se,
3043 3041
3044 } while (p_last_update_time != p_last_update_time_copy || 3042 } while (p_last_update_time != p_last_update_time_copy ||
3045 n_last_update_time != n_last_update_time_copy); 3043 n_last_update_time != n_last_update_time_copy);
3044 }
3046#else 3045#else
3047 p_last_update_time = prev->avg.last_update_time; 3046 p_last_update_time = prev->avg.last_update_time;
3048 n_last_update_time = next->avg.last_update_time; 3047 n_last_update_time = next->avg.last_update_time;
3049#endif 3048#endif
3050 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), 3049 __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
3051 &se->avg, 0, 0, NULL); 3050 se->avg.last_update_time = n_last_update_time;
3052 se->avg.last_update_time = n_last_update_time;
3053 }
3054} 3051}
3055 3052
3056/* Take into account change of utilization of a child task group */ 3053/* Take into account change of utilization of a child task group */
@@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
3173 return 1; 3170 return 1;
3174} 3171}
3175 3172
3173/*
3174 * Check if we need to update the load and the utilization of a blocked
3175 * group_entity:
3176 */
3177static inline bool skip_blocked_update(struct sched_entity *se)
3178{
3179 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3180
3181 /*
 3182 * If the sched_entity still has non-zero load or utilization, we have to
3183 * decay it:
3184 */
3185 if (se->avg.load_avg || se->avg.util_avg)
3186 return false;
3187
3188 /*
3189 * If there is a pending propagation, we have to update the load and
3190 * the utilization of the sched_entity:
3191 */
3192 if (gcfs_rq->propagate_avg)
3193 return false;
3194
3195 /*
3196 * Otherwise, the load and the utilization of the sched_entity is
3197 * already zero and there is no pending propagation, so it will be a
3198 * waste of time to try to decay it:
3199 */
3200 return true;
3201}
3202
3176#else /* CONFIG_FAIR_GROUP_SCHED */ 3203#else /* CONFIG_FAIR_GROUP_SCHED */
3177 3204
3178static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} 3205static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3265 set_tg_cfs_propagate(cfs_rq); 3292 set_tg_cfs_propagate(cfs_rq);
3266 } 3293 }
3267 3294
3268 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, 3295 decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
3269 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
3270 3296
3271#ifndef CONFIG_64BIT 3297#ifndef CONFIG_64BIT
3272 smp_wmb(); 3298 smp_wmb();
@@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
3298 * Track task load average for carrying it to new CPU after migrated, and 3324 * Track task load average for carrying it to new CPU after migrated, and
3299 * track group sched_entity load average for task_h_load calc in migration 3325 * track group sched_entity load average for task_h_load calc in migration
3300 */ 3326 */
3301 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { 3327 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3302 __update_load_avg(now, cpu, &se->avg, 3328 __update_load_avg_se(now, cpu, cfs_rq, se);
3303 se->on_rq * scale_load_down(se->load.weight),
3304 cfs_rq->curr == se, NULL);
3305 }
3306 3329
3307 decayed = update_cfs_rq_load_avg(now, cfs_rq, true); 3330 decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
3308 decayed |= propagate_entity_load_avg(se); 3331 decayed |= propagate_entity_load_avg(se);
@@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se)
3407 u64 last_update_time; 3430 u64 last_update_time;
3408 3431
3409 last_update_time = cfs_rq_last_update_time(cfs_rq); 3432 last_update_time = cfs_rq_last_update_time(cfs_rq);
3410 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); 3433 __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
3411} 3434}
3412 3435
3413/* 3436/*
@@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4271 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, 4294 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4272 throttled_list) { 4295 throttled_list) {
4273 struct rq *rq = rq_of(cfs_rq); 4296 struct rq *rq = rq_of(cfs_rq);
4297 struct rq_flags rf;
4274 4298
4275 raw_spin_lock(&rq->lock); 4299 rq_lock(rq, &rf);
4276 if (!cfs_rq_throttled(cfs_rq)) 4300 if (!cfs_rq_throttled(cfs_rq))
4277 goto next; 4301 goto next;
4278 4302
@@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4289 unthrottle_cfs_rq(cfs_rq); 4313 unthrottle_cfs_rq(cfs_rq);
4290 4314
4291next: 4315next:
4292 raw_spin_unlock(&rq->lock); 4316 rq_unlock(rq, &rf);
4293 4317
4294 if (!remaining) 4318 if (!remaining)
4295 break; 4319 break;
@@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void)
5097 unsigned long curr_jiffies = READ_ONCE(jiffies); 5121 unsigned long curr_jiffies = READ_ONCE(jiffies);
5098 struct rq *this_rq = this_rq(); 5122 struct rq *this_rq = this_rq();
5099 unsigned long load; 5123 unsigned long load;
5124 struct rq_flags rf;
5100 5125
5101 if (curr_jiffies == this_rq->last_load_update_tick) 5126 if (curr_jiffies == this_rq->last_load_update_tick)
5102 return; 5127 return;
5103 5128
5104 load = weighted_cpuload(cpu_of(this_rq)); 5129 load = weighted_cpuload(cpu_of(this_rq));
5105 raw_spin_lock(&this_rq->lock); 5130 rq_lock(this_rq, &rf);
5106 update_rq_clock(this_rq); 5131 update_rq_clock(this_rq);
5107 cpu_load_update_nohz(this_rq, curr_jiffies, load); 5132 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5108 raw_spin_unlock(&this_rq->lock); 5133 rq_unlock(this_rq, &rf);
5109} 5134}
5110#else /* !CONFIG_NO_HZ_COMMON */ 5135#else /* !CONFIG_NO_HZ_COMMON */
5111static inline void cpu_load_update_nohz(struct rq *this_rq, 5136static inline void cpu_load_update_nohz(struct rq *this_rq,
@@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
6769 lockdep_assert_held(&env->src_rq->lock); 6794 lockdep_assert_held(&env->src_rq->lock);
6770 6795
6771 p->on_rq = TASK_ON_RQ_MIGRATING; 6796 p->on_rq = TASK_ON_RQ_MIGRATING;
6772 deactivate_task(env->src_rq, p, 0); 6797 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
6773 set_task_cpu(p, env->dst_cpu); 6798 set_task_cpu(p, env->dst_cpu);
6774} 6799}
6775 6800
@@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
6902 lockdep_assert_held(&rq->lock); 6927 lockdep_assert_held(&rq->lock);
6903 6928
6904 BUG_ON(task_rq(p) != rq); 6929 BUG_ON(task_rq(p) != rq);
6905 activate_task(rq, p, 0); 6930 activate_task(rq, p, ENQUEUE_NOCLOCK);
6906 p->on_rq = TASK_ON_RQ_QUEUED; 6931 p->on_rq = TASK_ON_RQ_QUEUED;
6907 check_preempt_curr(rq, p, 0); 6932 check_preempt_curr(rq, p, 0);
6908} 6933}
@@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p)
6913 */ 6938 */
6914static void attach_one_task(struct rq *rq, struct task_struct *p) 6939static void attach_one_task(struct rq *rq, struct task_struct *p)
6915{ 6940{
6916 raw_spin_lock(&rq->lock); 6941 struct rq_flags rf;
6942
6943 rq_lock(rq, &rf);
6944 update_rq_clock(rq);
6917 attach_task(rq, p); 6945 attach_task(rq, p);
6918 raw_spin_unlock(&rq->lock); 6946 rq_unlock(rq, &rf);
6919} 6947}
6920 6948
6921/* 6949/*
@@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env)
6926{ 6954{
6927 struct list_head *tasks = &env->tasks; 6955 struct list_head *tasks = &env->tasks;
6928 struct task_struct *p; 6956 struct task_struct *p;
6957 struct rq_flags rf;
6929 6958
6930 raw_spin_lock(&env->dst_rq->lock); 6959 rq_lock(env->dst_rq, &rf);
6960 update_rq_clock(env->dst_rq);
6931 6961
6932 while (!list_empty(tasks)) { 6962 while (!list_empty(tasks)) {
6933 p = list_first_entry(tasks, struct task_struct, se.group_node); 6963 p = list_first_entry(tasks, struct task_struct, se.group_node);
@@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env)
6936 attach_task(env->dst_rq, p); 6966 attach_task(env->dst_rq, p);
6937 } 6967 }
6938 6968
6939 raw_spin_unlock(&env->dst_rq->lock); 6969 rq_unlock(env->dst_rq, &rf);
6940} 6970}
6941 6971
6942#ifdef CONFIG_FAIR_GROUP_SCHED 6972#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu)
6944{ 6974{
6945 struct rq *rq = cpu_rq(cpu); 6975 struct rq *rq = cpu_rq(cpu);
6946 struct cfs_rq *cfs_rq; 6976 struct cfs_rq *cfs_rq;
6947 unsigned long flags; 6977 struct rq_flags rf;
6948 6978
6949 raw_spin_lock_irqsave(&rq->lock, flags); 6979 rq_lock_irqsave(rq, &rf);
6950 update_rq_clock(rq); 6980 update_rq_clock(rq);
6951 6981
6952 /* 6982 /*
@@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu)
6954 * list_add_leaf_cfs_rq() for details. 6984 * list_add_leaf_cfs_rq() for details.
6955 */ 6985 */
6956 for_each_leaf_cfs_rq(rq, cfs_rq) { 6986 for_each_leaf_cfs_rq(rq, cfs_rq) {
6987 struct sched_entity *se;
6988
6957 /* throttled entities do not contribute to load */ 6989 /* throttled entities do not contribute to load */
6958 if (throttled_hierarchy(cfs_rq)) 6990 if (throttled_hierarchy(cfs_rq))
6959 continue; 6991 continue;
@@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu)
6961 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) 6993 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
6962 update_tg_load_avg(cfs_rq, 0); 6994 update_tg_load_avg(cfs_rq, 0);
6963 6995
6964 /* Propagate pending load changes to the parent */ 6996 /* Propagate pending load changes to the parent, if any: */
6965 if (cfs_rq->tg->se[cpu]) 6997 se = cfs_rq->tg->se[cpu];
6966 update_load_avg(cfs_rq->tg->se[cpu], 0); 6998 if (se && !skip_blocked_update(se))
6999 update_load_avg(se, 0);
6967 } 7000 }
6968 raw_spin_unlock_irqrestore(&rq->lock, flags); 7001 rq_unlock_irqrestore(rq, &rf);
6969} 7002}
6970 7003
6971/* 7004/*
@@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu)
7019{ 7052{
7020 struct rq *rq = cpu_rq(cpu); 7053 struct rq *rq = cpu_rq(cpu);
7021 struct cfs_rq *cfs_rq = &rq->cfs; 7054 struct cfs_rq *cfs_rq = &rq->cfs;
7022 unsigned long flags; 7055 struct rq_flags rf;
7023 7056
7024 raw_spin_lock_irqsave(&rq->lock, flags); 7057 rq_lock_irqsave(rq, &rf);
7025 update_rq_clock(rq); 7058 update_rq_clock(rq);
7026 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); 7059 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
7027 raw_spin_unlock_irqrestore(&rq->lock, flags); 7060 rq_unlock_irqrestore(rq, &rf);
7028} 7061}
7029 7062
7030static unsigned long task_h_load(struct task_struct *p) 7063static unsigned long task_h_load(struct task_struct *p)
@@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7525{ 7558{
7526 struct sched_domain *child = env->sd->child; 7559 struct sched_domain *child = env->sd->child;
7527 struct sched_group *sg = env->sd->groups; 7560 struct sched_group *sg = env->sd->groups;
7561 struct sg_lb_stats *local = &sds->local_stat;
7528 struct sg_lb_stats tmp_sgs; 7562 struct sg_lb_stats tmp_sgs;
7529 int load_idx, prefer_sibling = 0; 7563 int load_idx, prefer_sibling = 0;
7530 bool overload = false; 7564 bool overload = false;
@@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7541 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 7575 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
7542 if (local_group) { 7576 if (local_group) {
7543 sds->local = sg; 7577 sds->local = sg;
7544 sgs = &sds->local_stat; 7578 sgs = local;
7545 7579
7546 if (env->idle != CPU_NEWLY_IDLE || 7580 if (env->idle != CPU_NEWLY_IDLE ||
7547 time_after_eq(jiffies, sg->sgc->next_update)) 7581 time_after_eq(jiffies, sg->sgc->next_update))
@@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7565 * the tasks on the system). 7599 * the tasks on the system).
7566 */ 7600 */
7567 if (prefer_sibling && sds->local && 7601 if (prefer_sibling && sds->local &&
7568 group_has_capacity(env, &sds->local_stat) && 7602 group_has_capacity(env, local) &&
7569 (sgs->sum_nr_running > 1)) { 7603 (sgs->sum_nr_running > local->sum_nr_running + 1)) {
7570 sgs->group_no_capacity = 1; 7604 sgs->group_no_capacity = 1;
7571 sgs->group_type = group_classify(sg, sgs); 7605 sgs->group_type = group_classify(sg, sgs);
7572 } 7606 }
@@ -7597,7 +7631,7 @@ next_group:
7597 7631
7598/** 7632/**
7599 * check_asym_packing - Check to see if the group is packed into the 7633 * check_asym_packing - Check to see if the group is packed into the
7600 * sched doman. 7634 * sched domain.
7601 * 7635 *
7602 * This is primarily intended to used at the sibling level. Some 7636 * This is primarily intended to used at the sibling level. Some
7603 * cores like POWER7 prefer to use lower numbered SMT threads. In the 7637 * cores like POWER7 prefer to use lower numbered SMT threads. In the
@@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
8042 struct sched_domain *sd_parent = sd->parent; 8076 struct sched_domain *sd_parent = sd->parent;
8043 struct sched_group *group; 8077 struct sched_group *group;
8044 struct rq *busiest; 8078 struct rq *busiest;
8045 unsigned long flags; 8079 struct rq_flags rf;
8046 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); 8080 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
8047 8081
8048 struct lb_env env = { 8082 struct lb_env env = {
@@ -8105,7 +8139,7 @@ redo:
8105 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 8139 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8106 8140
8107more_balance: 8141more_balance:
8108 raw_spin_lock_irqsave(&busiest->lock, flags); 8142 rq_lock_irqsave(busiest, &rf);
8109 update_rq_clock(busiest); 8143 update_rq_clock(busiest);
8110 8144
8111 /* 8145 /*
@@ -8122,14 +8156,14 @@ more_balance:
8122 * See task_rq_lock() family for the details. 8156 * See task_rq_lock() family for the details.
8123 */ 8157 */
8124 8158
8125 raw_spin_unlock(&busiest->lock); 8159 rq_unlock(busiest, &rf);
8126 8160
8127 if (cur_ld_moved) { 8161 if (cur_ld_moved) {
8128 attach_tasks(&env); 8162 attach_tasks(&env);
8129 ld_moved += cur_ld_moved; 8163 ld_moved += cur_ld_moved;
8130 } 8164 }
8131 8165
8132 local_irq_restore(flags); 8166 local_irq_restore(rf.flags);
8133 8167
8134 if (env.flags & LBF_NEED_BREAK) { 8168 if (env.flags & LBF_NEED_BREAK) {
8135 env.flags &= ~LBF_NEED_BREAK; 8169 env.flags &= ~LBF_NEED_BREAK;
@@ -8207,6 +8241,8 @@ more_balance:
8207 sd->nr_balance_failed++; 8241 sd->nr_balance_failed++;
8208 8242
8209 if (need_active_balance(&env)) { 8243 if (need_active_balance(&env)) {
8244 unsigned long flags;
8245
8210 raw_spin_lock_irqsave(&busiest->lock, flags); 8246 raw_spin_lock_irqsave(&busiest->lock, flags);
8211 8247
8212 /* don't kick the active_load_balance_cpu_stop, 8248 /* don't kick the active_load_balance_cpu_stop,
@@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data)
8444 struct rq *target_rq = cpu_rq(target_cpu); 8480 struct rq *target_rq = cpu_rq(target_cpu);
8445 struct sched_domain *sd; 8481 struct sched_domain *sd;
8446 struct task_struct *p = NULL; 8482 struct task_struct *p = NULL;
8483 struct rq_flags rf;
8447 8484
8448 raw_spin_lock_irq(&busiest_rq->lock); 8485 rq_lock_irq(busiest_rq, &rf);
8449 8486
8450 /* make sure the requested cpu hasn't gone down in the meantime */ 8487 /* make sure the requested cpu hasn't gone down in the meantime */
8451 if (unlikely(busiest_cpu != smp_processor_id() || 8488 if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data)
8496 rcu_read_unlock(); 8533 rcu_read_unlock();
8497out_unlock: 8534out_unlock:
8498 busiest_rq->active_balance = 0; 8535 busiest_rq->active_balance = 0;
8499 raw_spin_unlock(&busiest_rq->lock); 8536 rq_unlock(busiest_rq, &rf);
8500 8537
8501 if (p) 8538 if (p)
8502 attach_one_task(target_rq, p); 8539 attach_one_task(target_rq, p);
@@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
8794 * do the balance. 8831 * do the balance.
8795 */ 8832 */
8796 if (time_after_eq(jiffies, rq->next_balance)) { 8833 if (time_after_eq(jiffies, rq->next_balance)) {
8797 raw_spin_lock_irq(&rq->lock); 8834 struct rq_flags rf;
8835
8836 rq_lock_irq(rq, &rf);
8798 update_rq_clock(rq); 8837 update_rq_clock(rq);
8799 cpu_load_update_idle(rq); 8838 cpu_load_update_idle(rq);
8800 raw_spin_unlock_irq(&rq->lock); 8839 rq_unlock_irq(rq, &rf);
8840
8801 rebalance_domains(rq, CPU_IDLE); 8841 rebalance_domains(rq, CPU_IDLE);
8802 } 8842 }
8803 8843
@@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p)
8988 struct cfs_rq *cfs_rq; 9028 struct cfs_rq *cfs_rq;
8989 struct sched_entity *se = &p->se, *curr; 9029 struct sched_entity *se = &p->se, *curr;
8990 struct rq *rq = this_rq(); 9030 struct rq *rq = this_rq();
9031 struct rq_flags rf;
8991 9032
8992 raw_spin_lock(&rq->lock); 9033 rq_lock(rq, &rf);
8993 update_rq_clock(rq); 9034 update_rq_clock(rq);
8994 9035
8995 cfs_rq = task_cfs_rq(current); 9036 cfs_rq = task_cfs_rq(current);
@@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p)
9010 } 9051 }
9011 9052
9012 se->vruntime -= cfs_rq->min_vruntime; 9053 se->vruntime -= cfs_rq->min_vruntime;
9013 raw_spin_unlock(&rq->lock); 9054 rq_unlock(rq, &rf);
9014} 9055}
9015 9056
9016/* 9057/*
@@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex);
9372int sched_group_set_shares(struct task_group *tg, unsigned long shares) 9413int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9373{ 9414{
9374 int i; 9415 int i;
9375 unsigned long flags;
9376 9416
9377 /* 9417 /*
9378 * We can't change the weight of the root cgroup. 9418 * We can't change the weight of the root cgroup.
@@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9389 tg->shares = shares; 9429 tg->shares = shares;
9390 for_each_possible_cpu(i) { 9430 for_each_possible_cpu(i) {
9391 struct rq *rq = cpu_rq(i); 9431 struct rq *rq = cpu_rq(i);
9392 struct sched_entity *se; 9432 struct sched_entity *se = tg->se[i];
9433 struct rq_flags rf;
9393 9434
9394 se = tg->se[i];
9395 /* Propagate contribution to hierarchy */ 9435 /* Propagate contribution to hierarchy */
9396 raw_spin_lock_irqsave(&rq->lock, flags); 9436 rq_lock_irqsave(rq, &rf);
9397
9398 /* Possible calls to update_curr() need rq clock */
9399 update_rq_clock(rq); 9437 update_rq_clock(rq);
9400 for_each_sched_entity(se) { 9438 for_each_sched_entity(se) {
9401 update_load_avg(se, UPDATE_TG); 9439 update_load_avg(se, UPDATE_TG);
9402 update_cfs_shares(se); 9440 update_cfs_shares(se);
9403 } 9441 }
9404 raw_spin_unlock_irqrestore(&rq->lock, flags); 9442 rq_unlock_irqrestore(rq, &rf);
9405 } 9443 }
9406 9444
9407done: 9445done:
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1b3c8189b286..11192e0cb122 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
56 */ 56 */
57SCHED_FEAT(SIS_AVG_CPU, false) 57SCHED_FEAT(SIS_AVG_CPU, false)
58 58
59/*
60 * Issue a WARN when we do multiple update_rq_clock() calls
61 * in a single rq->lock section. Default disabled because the
62 * annotations are not complete.
63 */
64SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
65
59#ifdef HAVE_RT_PUSH_IPI 66#ifdef HAVE_RT_PUSH_IPI
60/* 67/*
61 * In order to avoid a thundering herd attack of CPUs that are 68 * In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ac6d5176463d..ef63adce0c9c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/stackprotector.h> 11#include <linux/stackprotector.h>
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/livepatch.h>
13 14
14#include <asm/tlb.h> 15#include <asm/tlb.h>
15 16
@@ -264,7 +265,10 @@ static void do_idle(void)
264 smp_mb__after_atomic(); 265 smp_mb__after_atomic();
265 266
266 sched_ttwu_pending(); 267 sched_ttwu_pending();
267 schedule_preempt_disabled(); 268 schedule_idle();
269
270 if (unlikely(klp_patch_pending(current)))
271 klp_update_patch_state(current);
268} 272}
269 273
270bool cpu_in_idle(unsigned long pc) 274bool cpu_in_idle(unsigned long pc)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9f3e40226dec..979b7341008a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
1927#define RT_PUSH_IPI_EXECUTING 1 1927#define RT_PUSH_IPI_EXECUTING 1
1928#define RT_PUSH_IPI_RESTART 2 1928#define RT_PUSH_IPI_RESTART 2
1929 1929
1930/*
1931 * When a high priority task schedules out from a CPU and a lower priority
1932 * task is scheduled in, a check is made to see if there's any RT tasks
1933 * on other CPUs that are waiting to run because a higher priority RT task
1934 * is currently running on its CPU. In this case, the CPU with multiple RT
1935 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1936 * up that may be able to run one of its non-running queued RT tasks.
1937 *
1938 * On large CPU boxes, there's the case that several CPUs could schedule
1939 * a lower priority task at the same time, in which case it will look for
1940 * any overloaded CPUs that it could pull a task from. To do this, the runqueue
1941 * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
1942 * for a single overloaded CPU's runqueue lock can produce a large latency.
1943 * (This has actually been observed on large boxes running cyclictest).
1944 * Instead of taking the runqueue lock of the overloaded CPU, each of the
1945 * CPUs that scheduled a lower priority task simply sends an IPI to the
 1946 * overloaded CPU. An IPI is much cheaper than taking a runqueue lock with
1947 * lots of contention. The overloaded CPU will look to push its non-running
1948 * RT task off, and if it does, it can then ignore the other IPIs coming
1949 * in, and just pass those IPIs off to any other overloaded CPU.
1950 *
1951 * When a CPU schedules a lower priority task, it only sends an IPI to
1952 * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
1953 * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
1954 * RT overloaded tasks, would cause 100 IPIs to go out at once.
1955 *
1956 * The overloaded RT CPU, when receiving an IPI, will try to push off its
1957 * overloaded RT tasks and then send an IPI to the next CPU that has
1958 * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
1959 * have completed. Just because a CPU may have pushed off its own overloaded
1960 * RT task does not mean it should stop sending the IPI around to other
1961 * overloaded CPUs. There may be another RT task waiting to run on one of
1962 * those CPUs that are of higher priority than the one that was just
1963 * pushed.
1964 *
1965 * An optimization that could possibly be made is to make a CPU array similar
1966 * to the cpupri array mask of all running RT tasks, but for the overloaded
1967 * case, then the IPI could be sent to only the CPU with the highest priority
1968 * RT task waiting, and that CPU could send off further IPIs to the CPU with
1969 * the next highest waiting task. Since the overloaded case is much less likely
1970 * to happen, the complexity of this implementation may not be worth it.
1971 * Instead, just send an IPI around to all overloaded CPUs.
1972 *
1973 * The rq->rt.push_flags holds the status of the IPI that is going around.
1974 * A run queue can only send out a single IPI at a time. The possible flags
1975 * for rq->rt.push_flags are:
1976 *
1977 * (None or zero): No IPI is going around for the current rq
1978 * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
1979 * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
1980 * has changed, and the IPI should restart
1981 * circulating the overloaded CPUs again.
1982 *
1983 * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
1984 * before sending to the next CPU.
1985 *
1986 * Instead of having all CPUs that schedule a lower priority task send
1987 * an IPI to the same "first" CPU in the RT overload mask, they send it
1988 * to the next overloaded CPU after their own CPU. This helps distribute
1989 * the work when there's more than one overloaded CPU and multiple CPUs
1990 * scheduling in lower priority tasks.
1991 *
1992 * When a rq schedules a lower priority task than what was currently
1993 * running, the next CPU with overloaded RT tasks is examined first.
1994 * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
1995 * priority task, it will send an IPI first to CPU 5, then CPU 5 will
1996 * send to CPU 1 if it is still overloaded. CPU 1 will clear the
1997 * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
1998 *
1999 * The first CPU to notice IPI_RESTART is set will clear that flag and then
2000 * send an IPI to the next overloaded CPU after the rq->cpu and not the next
2001 * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
2002 * schedules a lower priority task, and the IPI_RESTART gets set while the
2003 * handling is being done on CPU 5, it will clear the flag and send it back to
2004 * CPU 4 instead of CPU 1.
2005 *
2006 * Note, the above logic can be disabled by turning off the sched_feature
2007 * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
2008 * taken by the CPU requesting a pull and the waiting RT task will be pulled
2009 * by that CPU. This may be fine for machines with few CPUs.
2010 */
1930static void tell_cpu_to_push(struct rq *rq) 2011static void tell_cpu_to_push(struct rq *rq)
1931{ 2012{
1932 int cpu; 2013 int cpu;
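The comment block above describes the IPI handoff as a walk over the set of RT-overloaded CPUs, starting just after the CPU that scheduled the lower-priority task and wrapping around. A minimal, self-contained C sketch of that wraparound search (not kernel code; NR_CPUS, rto_mask and next_overloaded_cpu() here are illustrative stand-ins for the real cpumask machinery) reproduces the CPU 1/5/3 example from the comment:

#include <stdio.h>

#define NR_CPUS 8

/* Hypothetical stand-in for the rto_mask: bit n set => CPU n is RT-overloaded. */
static unsigned int rto_mask = (1u << 1) | (1u << 5);

/*
 * Find the next overloaded CPU strictly after 'cpu', wrapping around and
 * excluding 'cpu' itself.  Returns -1 if no other overloaded CPU exists.
 */
static int next_overloaded_cpu(unsigned int mask, int cpu)
{
	for (int i = 1; i <= NR_CPUS; i++) {
		int candidate = (cpu + i) % NR_CPUS;

		if (candidate == cpu)
			break;
		if (mask & (1u << candidate))
			return candidate;
	}
	return -1;
}

int main(void)
{
	/* CPU 3 schedules a lower-priority task: the IPI goes to CPU 5 first... */
	int first = next_overloaded_cpu(rto_mask, 3);
	/* ...and CPU 5, once done pushing, passes it on to CPU 1. */
	int second = next_overloaded_cpu(rto_mask, first);

	printf("IPI chain starting from CPU 3: %d -> %d\n", first, second);
	return 0;
}

Starting each requester just after its own CPU, rather than always at the first overloaded CPU, is what spreads the IPI work across requesters as the comment describes.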
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
new file mode 100644
index 000000000000..cd200d16529e
--- /dev/null
+++ b/kernel/sched/sched-pelt.h
@@ -0,0 +1,13 @@
1/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
2
3static const u32 runnable_avg_yN_inv[] = {
4 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
5 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
6 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
7 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
8 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
9 0x85aac367, 0x82cd8698,
10};
11
12#define LOAD_AVG_PERIOD 32
13#define LOAD_AVG_MAX 47742
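The table above encodes y^n scaled to 32 bits, with y chosen so that y^32 = 0.5 (one LOAD_AVG_PERIOD halves a contribution), which lets the PELT code apply decay with an integer multiply and shift. A rough standalone sketch of how such constants can be re-derived (approximate only; the authoritative generator is the Documentation/scheduler/sched-pelt program named in the header and its rounding differs slightly; compile with -lm):

#include <math.h>
#include <stdio.h>

/*
 * Illustrative re-derivation of the constants above.  runnable_avg_yN_inv[n]
 * is roughly 2^32 * y^n, and LOAD_AVG_MAX is roughly the geometric series
 * 1024 * (1 + y + y^2 + ...), summed until a term rounds to zero.
 */
int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 == 0.5 */
	double acc = 0.0;

	for (int n = 0; n < 32; n++)
		printf("0x%08x%s", (unsigned)(pow(y, n) * 0xffffffffu),
		       (n % 6 == 5) ? ",\n" : ", ");
	printf("\n");

	for (int n = 0; pow(y, n) * 1024.0 >= 0.5; n++)
		acc += 1024.0 * pow(y, n);
	printf("LOAD_AVG_MAX ~= %.0f\n", acc);

	return 0;
}

The printed inverse multipliers should land very close to the table entries; the in-tree generator's fixed-point arithmetic accounts for the small difference in the exact LOAD_AVG_MAX value.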
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad8..6dda2aab731e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
1331#define DEQUEUE_SLEEP 0x01 1331#define DEQUEUE_SLEEP 0x01
1332#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ 1332#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
1333#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ 1333#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
1334#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */
1334 1335
1335#define ENQUEUE_WAKEUP 0x01 1336#define ENQUEUE_WAKEUP 0x01
1336#define ENQUEUE_RESTORE 0x02 1337#define ENQUEUE_RESTORE 0x02
1337#define ENQUEUE_MOVE 0x04 1338#define ENQUEUE_MOVE 0x04
1339#define ENQUEUE_NOCLOCK 0x08
1338 1340
1339#define ENQUEUE_HEAD 0x08 1341#define ENQUEUE_HEAD 0x10
1340#define ENQUEUE_REPLENISH 0x10 1342#define ENQUEUE_REPLENISH 0x20
1341#ifdef CONFIG_SMP 1343#ifdef CONFIG_SMP
1342#define ENQUEUE_MIGRATED 0x20 1344#define ENQUEUE_MIGRATED 0x40
1343#else 1345#else
1344#define ENQUEUE_MIGRATED 0x00 1346#define ENQUEUE_MIGRATED 0x00
1345#endif 1347#endif
@@ -1465,6 +1467,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1465} 1467}
1466#endif 1468#endif
1467 1469
1470extern void schedule_idle(void);
1471
1468extern void sysrq_sched_debug_show(void); 1472extern void sysrq_sched_debug_show(void);
1469extern void sched_init_granularity(void); 1473extern void sched_init_granularity(void);
1470extern void update_max_interval(void); 1474extern void update_max_interval(void);
@@ -1624,6 +1628,7 @@ static inline void sched_avg_update(struct rq *rq) { }
1624 1628
1625struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) 1629struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1626 __acquires(rq->lock); 1630 __acquires(rq->lock);
1631
1627struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) 1632struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1628 __acquires(p->pi_lock) 1633 __acquires(p->pi_lock)
1629 __acquires(rq->lock); 1634 __acquires(rq->lock);
@@ -1645,6 +1650,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1645 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); 1650 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
1646} 1651}
1647 1652
1653static inline void
1654rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
1655 __acquires(rq->lock)
1656{
1657 raw_spin_lock_irqsave(&rq->lock, rf->flags);
1658 rq_pin_lock(rq, rf);
1659}
1660
1661static inline void
1662rq_lock_irq(struct rq *rq, struct rq_flags *rf)
1663 __acquires(rq->lock)
1664{
1665 raw_spin_lock_irq(&rq->lock);
1666 rq_pin_lock(rq, rf);
1667}
1668
1669static inline void
1670rq_lock(struct rq *rq, struct rq_flags *rf)
1671 __acquires(rq->lock)
1672{
1673 raw_spin_lock(&rq->lock);
1674 rq_pin_lock(rq, rf);
1675}
1676
1677static inline void
1678rq_relock(struct rq *rq, struct rq_flags *rf)
1679 __acquires(rq->lock)
1680{
1681 raw_spin_lock(&rq->lock);
1682 rq_repin_lock(rq, rf);
1683}
1684
1685static inline void
1686rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
1687 __releases(rq->lock)
1688{
1689 rq_unpin_lock(rq, rf);
1690 raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
1691}
1692
1693static inline void
1694rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
1695 __releases(rq->lock)
1696{
1697 rq_unpin_lock(rq, rf);
1698 raw_spin_unlock_irq(&rq->lock);
1699}
1700
1701static inline void
1702rq_unlock(struct rq *rq, struct rq_flags *rf)
1703 __releases(rq->lock)
1704{
1705 rq_unpin_lock(rq, rf);
1706 raw_spin_unlock(&rq->lock);
1707}
1708
1648#ifdef CONFIG_SMP 1709#ifdef CONFIG_SMP
1649#ifdef CONFIG_PREEMPT 1710#ifdef CONFIG_PREEMPT
1650 1711
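The wrappers above exist so that taking rq->lock and the rq_pin_lock() bookkeeping always happen together, and unpinning always precedes the unlock. As a toy, standalone illustration of that pairing discipline (a pthread mutex plus a flag; none of this is kernel code):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

/*
 * Toy model of the invariant the rq_lock()/rq_unlock() helpers encode:
 * lock and pin are taken together, and unpin always happens before unlock.
 */
struct toy_rq {
	pthread_mutex_t lock;
	int pinned;		/* stands in for the rq_pin_lock() bookkeeping */
};

static void toy_rq_lock(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	assert(!rq->pinned);
	rq->pinned = 1;		/* rq_pin_lock() */
}

static void toy_rq_unlock(struct toy_rq *rq)
{
	assert(rq->pinned);
	rq->pinned = 0;		/* rq_unpin_lock() */
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	struct toy_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER, .pinned = 0 };

	toy_rq_lock(&rq);
	/* ... work that relies on the pinned runqueue lock ... */
	toy_rq_unlock(&rq);

	printf("lock/pin pairing held\n");
	return 0;
}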
@@ -1869,6 +1930,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
1869 1930
1870#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1931#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1871struct irqtime { 1932struct irqtime {
1933 u64 total;
1872 u64 tick_delta; 1934 u64 tick_delta;
1873 u64 irq_start_time; 1935 u64 irq_start_time;
1874 struct u64_stats_sync sync; 1936 struct u64_stats_sync sync;
@@ -1876,16 +1938,20 @@ struct irqtime {
1876 1938
1877DECLARE_PER_CPU(struct irqtime, cpu_irqtime); 1939DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
1878 1940
1941/*
1942 * Returns the irqtime minus the softirq time computed by ksoftirqd.
1943 * Otherwise ksoftirqd's own runtime would be subtracted from its
1944 * sum_exec_runtime, which would then never move forward.
1945 */
1879static inline u64 irq_time_read(int cpu) 1946static inline u64 irq_time_read(int cpu)
1880{ 1947{
1881 struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); 1948 struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
1882 u64 *cpustat = kcpustat_cpu(cpu).cpustat;
1883 unsigned int seq; 1949 unsigned int seq;
1884 u64 total; 1950 u64 total;
1885 1951
1886 do { 1952 do {
1887 seq = __u64_stats_fetch_begin(&irqtime->sync); 1953 seq = __u64_stats_fetch_begin(&irqtime->sync);
1888 total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ]; 1954 total = irqtime->total;
1889 } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); 1955 } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
1890 1956
1891 return total; 1957 return total;
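irq_time_read() above follows the usual seqcount pattern: re-read the 64-bit total until the sequence counter is even and unchanged across the read. A single-threaded toy sketch of just that retry control flow (it does not reproduce the memory-ordering guarantees of the kernel's u64_stats/seqcount primitives, which real concurrent use requires):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy counter protected by a sequence number: odd means a write is in flight. */
struct irqtime_demo {
	atomic_uint seq;
	uint64_t total;
};

static void writer_add(struct irqtime_demo *it, uint64_t delta)
{
	atomic_fetch_add_explicit(&it->seq, 1, memory_order_release);	/* odd */
	it->total += delta;
	atomic_fetch_add_explicit(&it->seq, 1, memory_order_release);	/* even */
}

static uint64_t reader_read(const struct irqtime_demo *it)
{
	unsigned int start;
	uint64_t val;

	do {
		start = atomic_load_explicit(&it->seq, memory_order_acquire);
		val = it->total;
	} while ((start & 1) ||
		 start != atomic_load_explicit(&it->seq, memory_order_acquire));

	return val;
}

int main(void)
{
	struct irqtime_demo it = { .seq = 0, .total = 0 };

	writer_add(&it, 1000);
	printf("irq time: %llu ns\n", (unsigned long long)reader_read(&it));
	return 0;
}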
diff --git a/kernel/signal.c b/kernel/signal.c
index 7e59ebc2c25e..ca92bcfeb322 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1237 } 1237 }
1238 /* 1238 /*
1239 * This sighand can be already freed and even reused, but 1239 * This sighand can be already freed and even reused, but
1240 * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which 1240 * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
1241 * initializes ->siglock: this slab can't go away, it has 1241 * initializes ->siglock: this slab can't go away, it has
1242 * the same object type, ->siglock can't be reinitialized. 1242 * the same object type, ->siglock can't be reinitialized.
1243 * 1243 *
@@ -1318,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1318 } 1318 }
1319} 1319}
1320 1320
1321int kill_proc_info(int sig, struct siginfo *info, pid_t pid) 1321static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1322{ 1322{
1323 int error; 1323 int error;
1324 rcu_read_lock(); 1324 rcu_read_lock();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 744fa611cae0..4e09821f9d9e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -309,7 +309,7 @@ restart:
309 account_irq_exit_time(current); 309 account_irq_exit_time(current);
310 __local_bh_enable(SOFTIRQ_OFFSET); 310 __local_bh_enable(SOFTIRQ_OFFSET);
311 WARN_ON_ONCE(in_interrupt()); 311 WARN_ON_ONCE(in_interrupt());
312 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 312 current_restore_flags(old_flags, PF_MEMALLOC);
313} 313}
314 314
315asmlinkage __visible void do_softirq(void) 315asmlinkage __visible void do_softirq(void)
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9c15a9124e83..f8edee9c792d 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -54,8 +54,8 @@ int snprint_stack_trace(char *buf, size_t size,
54EXPORT_SYMBOL_GPL(snprint_stack_trace); 54EXPORT_SYMBOL_GPL(snprint_stack_trace);
55 55
56/* 56/*
57 * Architectures that do not implement save_stack_trace_tsk or 57 * Architectures that do not implement save_stack_trace_*()
58 * save_stack_trace_regs get this weak alias and a once-per-bootup warning 58 * get these weak aliases and once-per-bootup warnings
59 * (whenever this facility is utilized - for example by procfs): 59 * (whenever this facility is utilized - for example by procfs):
60 */ 60 */
61__weak void 61__weak void
@@ -69,3 +69,11 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
69{ 69{
70 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); 70 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
71} 71}
72
73__weak int
74save_stack_trace_tsk_reliable(struct task_struct *tsk,
75 struct stack_trace *trace)
76{
77 WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
78 return -ENOSYS;
79}
diff --git a/kernel/sys.c b/kernel/sys.c
index 7ff6d1b10cec..8a94b4eabcaa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1396,8 +1396,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1396 !capable(CAP_SYS_RESOURCE)) 1396 !capable(CAP_SYS_RESOURCE))
1397 retval = -EPERM; 1397 retval = -EPERM;
1398 if (!retval) 1398 if (!retval)
1399 retval = security_task_setrlimit(tsk->group_leader, 1399 retval = security_task_setrlimit(tsk, resource, new_rlim);
1400 resource, new_rlim);
1401 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { 1400 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1402 /* 1401 /*
1403 * The caller is asking for an immediate RLIMIT_CPU 1402 * The caller is asking for an immediate RLIMIT_CPU
@@ -1432,25 +1431,26 @@ out:
1432} 1431}
1433 1432
1434/* rcu lock must be held */ 1433/* rcu lock must be held */
1435static int check_prlimit_permission(struct task_struct *task) 1434static int check_prlimit_permission(struct task_struct *task,
1435 unsigned int flags)
1436{ 1436{
1437 const struct cred *cred = current_cred(), *tcred; 1437 const struct cred *cred = current_cred(), *tcred;
1438 bool id_match;
1438 1439
1439 if (current == task) 1440 if (current == task)
1440 return 0; 1441 return 0;
1441 1442
1442 tcred = __task_cred(task); 1443 tcred = __task_cred(task);
1443 if (uid_eq(cred->uid, tcred->euid) && 1444 id_match = (uid_eq(cred->uid, tcred->euid) &&
1444 uid_eq(cred->uid, tcred->suid) && 1445 uid_eq(cred->uid, tcred->suid) &&
1445 uid_eq(cred->uid, tcred->uid) && 1446 uid_eq(cred->uid, tcred->uid) &&
1446 gid_eq(cred->gid, tcred->egid) && 1447 gid_eq(cred->gid, tcred->egid) &&
1447 gid_eq(cred->gid, tcred->sgid) && 1448 gid_eq(cred->gid, tcred->sgid) &&
1448 gid_eq(cred->gid, tcred->gid)) 1449 gid_eq(cred->gid, tcred->gid));
1449 return 0; 1450 if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
1450 if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) 1451 return -EPERM;
1451 return 0;
1452 1452
1453 return -EPERM; 1453 return security_task_prlimit(cred, tcred, flags);
1454} 1454}
1455 1455
1456SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1456SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
@@ -1460,12 +1460,17 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1460 struct rlimit64 old64, new64; 1460 struct rlimit64 old64, new64;
1461 struct rlimit old, new; 1461 struct rlimit old, new;
1462 struct task_struct *tsk; 1462 struct task_struct *tsk;
1463 unsigned int checkflags = 0;
1463 int ret; 1464 int ret;
1464 1465
1466 if (old_rlim)
1467 checkflags |= LSM_PRLIMIT_READ;
1468
1465 if (new_rlim) { 1469 if (new_rlim) {
1466 if (copy_from_user(&new64, new_rlim, sizeof(new64))) 1470 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1467 return -EFAULT; 1471 return -EFAULT;
1468 rlim64_to_rlim(&new64, &new); 1472 rlim64_to_rlim(&new64, &new);
1473 checkflags |= LSM_PRLIMIT_WRITE;
1469 } 1474 }
1470 1475
1471 rcu_read_lock(); 1476 rcu_read_lock();
@@ -1474,7 +1479,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1474 rcu_read_unlock(); 1479 rcu_read_unlock();
1475 return -ESRCH; 1480 return -ESRCH;
1476 } 1481 }
1477 ret = check_prlimit_permission(tsk); 1482 ret = check_prlimit_permission(tsk, checkflags);
1478 if (ret) { 1483 if (ret) {
1479 rcu_read_unlock(); 1484 rcu_read_unlock();
1480 return ret; 1485 return ret;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8c8714fcb53c..4dfba1a76cc3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1176,6 +1176,8 @@ static struct ctl_table kern_table[] = {
1176 .maxlen = sizeof(unsigned int), 1176 .maxlen = sizeof(unsigned int),
1177 .mode = 0644, 1177 .mode = 0644,
1178 .proc_handler = timer_migration_handler, 1178 .proc_handler = timer_migration_handler,
1179 .extra1 = &zero,
1180 .extra2 = &one,
1179 }, 1181 },
1180#endif 1182#endif
1181#ifdef CONFIG_BPF_SYSCALL 1183#ifdef CONFIG_BPF_SYSCALL
@@ -2574,7 +2576,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2574 int write, void *data) 2576 int write, void *data)
2575{ 2577{
2576 if (write) { 2578 if (write) {
2577 if (*lvalp > LONG_MAX / HZ) 2579 if (*lvalp > INT_MAX / HZ)
2578 return 1; 2580 return 1;
2579 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); 2581 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
2580 } else { 2582 } else {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8a5e44236f78..4559e914452b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -30,6 +30,7 @@
30#include <linux/pid_namespace.h> 30#include <linux/pid_namespace.h>
31#include <net/genetlink.h> 31#include <net/genetlink.h>
32#include <linux/atomic.h> 32#include <linux/atomic.h>
33#include <linux/sched/cputime.h>
33 34
34/* 35/*
35 * Maximum length of a cpumask that can be specified in 36 * Maximum length of a cpumask that can be specified in
@@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
210 struct task_struct *tsk, *first; 211 struct task_struct *tsk, *first;
211 unsigned long flags; 212 unsigned long flags;
212 int rc = -ESRCH; 213 int rc = -ESRCH;
214 u64 delta, utime, stime;
215 u64 start_time;
213 216
214 /* 217 /*
215 * Add additional stats from live tasks except zombie thread group 218 * Add additional stats from live tasks except zombie thread group
@@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
227 memset(stats, 0, sizeof(*stats)); 230 memset(stats, 0, sizeof(*stats));
228 231
229 tsk = first; 232 tsk = first;
233 start_time = ktime_get_ns();
230 do { 234 do {
231 if (tsk->exit_state) 235 if (tsk->exit_state)
232 continue; 236 continue;
@@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
238 */ 242 */
239 delayacct_add_tsk(stats, tsk); 243 delayacct_add_tsk(stats, tsk);
240 244
245 /* calculate task elapsed time in nsec */
246 delta = start_time - tsk->start_time;
247 /* Convert to micro seconds */
248 do_div(delta, NSEC_PER_USEC);
249 stats->ac_etime += delta;
250
251 task_cputime(tsk, &utime, &stime);
252 stats->ac_utime += div_u64(utime, NSEC_PER_USEC);
253 stats->ac_stime += div_u64(stime, NSEC_PER_USEC);
254
241 stats->nvcsw += tsk->nvcsw; 255 stats->nvcsw += tsk->nvcsw;
242 stats->nivcsw += tsk->nivcsw; 256 stats->nivcsw += tsk->nivcsw;
243 } while_each_thread(first, tsk); 257 } while_each_thread(first, tsk);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ce3a31e8eb36..5cb5b0008d97 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
541 * 541 *
542 * Returns the granularity of underlying alarm base clock 542 * Returns the granularity of underlying alarm base clock
543 */ 543 */
544static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) 544static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
545{ 545{
546 if (!alarmtimer_get_rtcdev()) 546 if (!alarmtimer_get_rtcdev())
547 return -EINVAL; 547 return -EINVAL;
@@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
558 * 558 *
559 * Provides the underlying alarm base time. 559 * Provides the underlying alarm base time.
560 */ 560 */
561static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) 561static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
562{ 562{
563 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; 563 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
564 564
565 if (!alarmtimer_get_rtcdev()) 565 if (!alarmtimer_get_rtcdev())
566 return -EINVAL; 566 return -EINVAL;
567 567
568 *tp = ktime_to_timespec(base->gettime()); 568 *tp = ktime_to_timespec64(base->gettime());
569 return 0; 569 return 0;
570} 570}
571 571
@@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer)
598 * Copies out the current itimerspec data 598 * Copies out the current itimerspec data
599 */ 599 */
600static void alarm_timer_get(struct k_itimer *timr, 600static void alarm_timer_get(struct k_itimer *timr,
601 struct itimerspec *cur_setting) 601 struct itimerspec64 *cur_setting)
602{ 602{
603 ktime_t relative_expiry_time = 603 ktime_t relative_expiry_time =
604 alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); 604 alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
605 605
606 if (ktime_to_ns(relative_expiry_time) > 0) { 606 if (ktime_to_ns(relative_expiry_time) > 0) {
607 cur_setting->it_value = ktime_to_timespec(relative_expiry_time); 607 cur_setting->it_value = ktime_to_timespec64(relative_expiry_time);
608 } else { 608 } else {
609 cur_setting->it_value.tv_sec = 0; 609 cur_setting->it_value.tv_sec = 0;
610 cur_setting->it_value.tv_nsec = 0; 610 cur_setting->it_value.tv_nsec = 0;
611 } 611 }
612 612
613 cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); 613 cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval);
614} 614}
615 615
616/** 616/**
@@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr)
640 * Sets the timer to new_setting, and starts the timer. 640 * Sets the timer to new_setting, and starts the timer.
641 */ 641 */
642static int alarm_timer_set(struct k_itimer *timr, int flags, 642static int alarm_timer_set(struct k_itimer *timr, int flags,
643 struct itimerspec *new_setting, 643 struct itimerspec64 *new_setting,
644 struct itimerspec *old_setting) 644 struct itimerspec64 *old_setting)
645{ 645{
646 ktime_t exp; 646 ktime_t exp;
647 647
@@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
659 return TIMER_RETRY; 659 return TIMER_RETRY;
660 660
661 /* start the timer */ 661 /* start the timer */
662 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 662 timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval);
663 exp = timespec_to_ktime(new_setting->it_value); 663 exp = timespec64_to_ktime(new_setting->it_value);
664 /* Convert (if necessary) to absolute time */ 664 /* Convert (if necessary) to absolute time */
665 if (flags != TIMER_ABSTIME) { 665 if (flags != TIMER_ABSTIME) {
666 ktime_t now; 666 ktime_t now;
@@ -790,13 +790,14 @@ out:
790 * Handles clock_nanosleep calls against _ALARM clockids 790 * Handles clock_nanosleep calls against _ALARM clockids
791 */ 791 */
792static int alarm_timer_nsleep(const clockid_t which_clock, int flags, 792static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
793 struct timespec *tsreq, struct timespec __user *rmtp) 793 struct timespec64 *tsreq,
794 struct timespec __user *rmtp)
794{ 795{
795 enum alarmtimer_type type = clock2alarm(which_clock); 796 enum alarmtimer_type type = clock2alarm(which_clock);
797 struct restart_block *restart;
796 struct alarm alarm; 798 struct alarm alarm;
797 ktime_t exp; 799 ktime_t exp;
798 int ret = 0; 800 int ret = 0;
799 struct restart_block *restart;
800 801
801 if (!alarmtimer_get_rtcdev()) 802 if (!alarmtimer_get_rtcdev())
802 return -ENOTSUPP; 803 return -ENOTSUPP;
@@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
809 810
810 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); 811 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
811 812
812 exp = timespec_to_ktime(*tsreq); 813 exp = timespec64_to_ktime(*tsreq);
813 /* Convert (if necessary) to absolute time */ 814 /* Convert (if necessary) to absolute time */
814 if (flags != TIMER_ABSTIME) { 815 if (flags != TIMER_ABSTIME) {
815 ktime_t now = alarm_bases[type].gettime(); 816 ktime_t now = alarm_bases[type].gettime();
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 97ac0951f164..4237e0744e26 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev)
468} 468}
469EXPORT_SYMBOL_GPL(clockevents_register_device); 469EXPORT_SYMBOL_GPL(clockevents_register_device);
470 470
471void clockevents_config(struct clock_event_device *dev, u32 freq) 471static void clockevents_config(struct clock_event_device *dev, u32 freq)
472{ 472{
473 u64 sec; 473 u64 sec;
474 474
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ec08f527d7ee..ac053bb5296e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -987,7 +987,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
987 * Returns: 987 * Returns:
988 * 0 when the timer was not active 988 * 0 when the timer was not active
989 * 1 when the timer was active 989 * 1 when the timer was active
990 * -1 when the timer is currently excuting the callback function and 990 * -1 when the timer is currently executing the callback function and
991 * cannot be stopped 991 * cannot be stopped
992 */ 992 */
993int hrtimer_try_to_cancel(struct hrtimer *timer) 993int hrtimer_try_to_cancel(struct hrtimer *timer)
@@ -1368,10 +1368,7 @@ retry:
1368 ktime_to_ns(delta)); 1368 ktime_to_ns(delta));
1369} 1369}
1370 1370
1371/* 1371/* called with interrupts disabled */
1372 * local version of hrtimer_peek_ahead_timers() called with interrupts
1373 * disabled.
1374 */
1375static inline void __hrtimer_peek_ahead_timers(void) 1372static inline void __hrtimer_peek_ahead_timers(void)
1376{ 1373{
1377 struct tick_device *td; 1374 struct tick_device *td;
@@ -1506,7 +1503,7 @@ out:
1506 return ret; 1503 return ret;
1507} 1504}
1508 1505
1509long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, 1506long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
1510 const enum hrtimer_mode mode, const clockid_t clockid) 1507 const enum hrtimer_mode mode, const clockid_t clockid)
1511{ 1508{
1512 struct restart_block *restart; 1509 struct restart_block *restart;
@@ -1519,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1519 slack = 0; 1516 slack = 0;
1520 1517
1521 hrtimer_init_on_stack(&t.timer, clockid, mode); 1518 hrtimer_init_on_stack(&t.timer, clockid, mode);
1522 hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); 1519 hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
1523 if (do_nanosleep(&t, mode)) 1520 if (do_nanosleep(&t, mode))
1524 goto out; 1521 goto out;
1525 1522
@@ -1550,15 +1547,17 @@ out:
1550SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, 1547SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1551 struct timespec __user *, rmtp) 1548 struct timespec __user *, rmtp)
1552{ 1549{
1550 struct timespec64 tu64;
1553 struct timespec tu; 1551 struct timespec tu;
1554 1552
1555 if (copy_from_user(&tu, rqtp, sizeof(tu))) 1553 if (copy_from_user(&tu, rqtp, sizeof(tu)))
1556 return -EFAULT; 1554 return -EFAULT;
1557 1555
1558 if (!timespec_valid(&tu)) 1556 tu64 = timespec_to_timespec64(tu);
1557 if (!timespec64_valid(&tu64))
1559 return -EINVAL; 1558 return -EINVAL;
1560 1559
1561 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); 1560 return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1562} 1561}
1563 1562
1564/* 1563/*
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 9cff0ab82b63..31d588d37a17 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -297,7 +297,7 @@ out:
297 return err; 297 return err;
298} 298}
299 299
300static int pc_clock_gettime(clockid_t id, struct timespec *ts) 300static int pc_clock_gettime(clockid_t id, struct timespec64 *ts)
301{ 301{
302 struct posix_clock_desc cd; 302 struct posix_clock_desc cd;
303 int err; 303 int err;
@@ -316,7 +316,7 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts)
316 return err; 316 return err;
317} 317}
318 318
319static int pc_clock_getres(clockid_t id, struct timespec *ts) 319static int pc_clock_getres(clockid_t id, struct timespec64 *ts)
320{ 320{
321 struct posix_clock_desc cd; 321 struct posix_clock_desc cd;
322 int err; 322 int err;
@@ -335,7 +335,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts)
335 return err; 335 return err;
336} 336}
337 337
338static int pc_clock_settime(clockid_t id, const struct timespec *ts) 338static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
339{ 339{
340 struct posix_clock_desc cd; 340 struct posix_clock_desc cd;
341 int err; 341 int err;
@@ -399,7 +399,7 @@ static int pc_timer_delete(struct k_itimer *kit)
399 return err; 399 return err;
400} 400}
401 401
402static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) 402static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts)
403{ 403{
404 clockid_t id = kit->it_clock; 404 clockid_t id = kit->it_clock;
405 struct posix_clock_desc cd; 405 struct posix_clock_desc cd;
@@ -414,7 +414,7 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
414} 414}
415 415
416static int pc_timer_settime(struct k_itimer *kit, int flags, 416static int pc_timer_settime(struct k_itimer *kit, int flags,
417 struct itimerspec *ts, struct itimerspec *old) 417 struct itimerspec64 *ts, struct itimerspec64 *old)
418{ 418{
419 clockid_t id = kit->it_clock; 419 clockid_t id = kit->it_clock;
420 struct posix_clock_desc cd; 420 struct posix_clock_desc cd;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 4513ad16a253..1370f067fb51 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p)
116} 116}
117 117
118static int 118static int
119posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 119posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
120{ 120{
121 int error = check_clock(which_clock); 121 int error = check_clock(which_clock);
122 if (!error) { 122 if (!error) {
@@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
135} 135}
136 136
137static int 137static int
138posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 138posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp)
139{ 139{
140 /* 140 /*
141 * You can never reset a CPU clock, but we check for other errors 141 * You can never reset a CPU clock, but we check for other errors
@@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
261 261
262static int posix_cpu_clock_get_task(struct task_struct *tsk, 262static int posix_cpu_clock_get_task(struct task_struct *tsk,
263 const clockid_t which_clock, 263 const clockid_t which_clock,
264 struct timespec *tp) 264 struct timespec64 *tp)
265{ 265{
266 int err = -EINVAL; 266 int err = -EINVAL;
267 u64 rtn; 267 u64 rtn;
@@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
275 } 275 }
276 276
277 if (!err) 277 if (!err)
278 *tp = ns_to_timespec(rtn); 278 *tp = ns_to_timespec64(rtn);
279 279
280 return err; 280 return err;
281} 281}
282 282
283 283
284static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 284static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp)
285{ 285{
286 const pid_t pid = CPUCLOCK_PID(which_clock); 286 const pid_t pid = CPUCLOCK_PID(which_clock);
287 int err = -EINVAL; 287 int err = -EINVAL;
@@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
562 * and try again. (This happens when the timer is in the middle of firing.) 562 * and try again. (This happens when the timer is in the middle of firing.)
563 */ 563 */
564static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, 564static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
565 struct itimerspec *new, struct itimerspec *old) 565 struct itimerspec64 *new, struct itimerspec64 *old)
566{ 566{
567 unsigned long flags; 567 unsigned long flags;
568 struct sighand_struct *sighand; 568 struct sighand_struct *sighand;
@@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
572 572
573 WARN_ON_ONCE(p == NULL); 573 WARN_ON_ONCE(p == NULL);
574 574
575 new_expires = timespec_to_ns(&new->it_value); 575 new_expires = timespec64_to_ns(&new->it_value);
576 576
577 /* 577 /*
578 * Protect against sighand release/switch in exit/exec and p->cpu_timers 578 * Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
633 bump_cpu_timer(timer, val); 633 bump_cpu_timer(timer, val);
634 if (val < timer->it.cpu.expires) { 634 if (val < timer->it.cpu.expires) {
635 old_expires = timer->it.cpu.expires - val; 635 old_expires = timer->it.cpu.expires - val;
636 old->it_value = ns_to_timespec(old_expires); 636 old->it_value = ns_to_timespec64(old_expires);
637 } else { 637 } else {
638 old->it_value.tv_nsec = 1; 638 old->it_value.tv_nsec = 1;
639 old->it_value.tv_sec = 0; 639 old->it_value.tv_sec = 0;
@@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
671 * Install the new reload setting, and 671 * Install the new reload setting, and
672 * set up the signal and overrun bookkeeping. 672 * set up the signal and overrun bookkeeping.
673 */ 673 */
674 timer->it.cpu.incr = timespec_to_ns(&new->it_interval); 674 timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
675 675
676 /* 676 /*
677 * This acts as a modification timestamp for the timer, 677 * This acts as a modification timestamp for the timer,
@@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
695 ret = 0; 695 ret = 0;
696 out: 696 out:
697 if (old) 697 if (old)
698 old->it_interval = ns_to_timespec(old_incr); 698 old->it_interval = ns_to_timespec64(old_incr);
699 699
700 return ret; 700 return ret;
701} 701}
702 702
703static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 703static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
704{ 704{
705 u64 now; 705 u64 now;
706 struct task_struct *p = timer->it.cpu.task; 706 struct task_struct *p = timer->it.cpu.task;
@@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
710 /* 710 /*
711 * Easy part: convert the reload time. 711 * Easy part: convert the reload time.
712 */ 712 */
713 itp->it_interval = ns_to_timespec(timer->it.cpu.incr); 713 itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
714 714
715 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ 715 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
716 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; 716 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
@@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
739 * Call the timer disarmed, nothing else to do. 739 * Call the timer disarmed, nothing else to do.
740 */ 740 */
741 timer->it.cpu.expires = 0; 741 timer->it.cpu.expires = 0;
742 itp->it_value = ns_to_timespec(timer->it.cpu.expires); 742 itp->it_value = ns_to_timespec64(timer->it.cpu.expires);
743 return; 743 return;
744 } else { 744 } else {
745 cpu_timer_sample_group(timer->it_clock, p, &now); 745 cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
748 } 748 }
749 749
750 if (now < timer->it.cpu.expires) { 750 if (now < timer->it.cpu.expires) {
751 itp->it_value = ns_to_timespec(timer->it.cpu.expires - now); 751 itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now);
752 } else { 752 } else {
753 /* 753 /*
754 * The timer should have expired already, but the firing 754 * The timer should have expired already, but the firing
@@ -825,6 +825,8 @@ static void check_thread_timers(struct task_struct *tsk,
825 * At the hard limit, we just die. 825 * At the hard limit, we just die.
826 * No need to calculate anything else now. 826 * No need to calculate anything else now.
827 */ 827 */
828 pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
829 tsk->comm, task_pid_nr(tsk));
828 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 830 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
829 return; 831 return;
830 } 832 }
@@ -836,8 +838,7 @@ static void check_thread_timers(struct task_struct *tsk,
836 soft += USEC_PER_SEC; 838 soft += USEC_PER_SEC;
837 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; 839 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
838 } 840 }
839 printk(KERN_INFO 841 pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
840 "RT Watchdog Timeout: %s[%d]\n",
841 tsk->comm, task_pid_nr(tsk)); 842 tsk->comm, task_pid_nr(tsk));
842 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 843 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
843 } 844 }
@@ -935,6 +936,8 @@ static void check_process_timers(struct task_struct *tsk,
935 * At the hard limit, we just die. 936 * At the hard limit, we just die.
936 * No need to calculate anything else now. 937 * No need to calculate anything else now.
937 */ 938 */
939 pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
940 tsk->comm, task_pid_nr(tsk));
938 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 941 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
939 return; 942 return;
940 } 943 }
@@ -942,6 +945,8 @@ static void check_process_timers(struct task_struct *tsk,
942 /* 945 /*
943 * At the soft limit, send a SIGXCPU every second. 946 * At the soft limit, send a SIGXCPU every second.
944 */ 947 */
948 pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
949 tsk->comm, task_pid_nr(tsk));
945 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 950 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
946 if (soft < hard) { 951 if (soft < hard) {
947 soft++; 952 soft++;
@@ -1214,7 +1219,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1214} 1219}
1215 1220
1216static int do_cpu_nanosleep(const clockid_t which_clock, int flags, 1221static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1217 struct timespec *rqtp, struct itimerspec *it) 1222 struct timespec64 *rqtp, struct itimerspec64 *it)
1218{ 1223{
1219 struct k_itimer timer; 1224 struct k_itimer timer;
1220 int error; 1225 int error;
@@ -1229,7 +1234,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1229 error = posix_cpu_timer_create(&timer); 1234 error = posix_cpu_timer_create(&timer);
1230 timer.it_process = current; 1235 timer.it_process = current;
1231 if (!error) { 1236 if (!error) {
1232 static struct itimerspec zero_it; 1237 static struct itimerspec64 zero_it;
1233 1238
1234 memset(it, 0, sizeof *it); 1239 memset(it, 0, sizeof *it);
1235 it->it_value = *rqtp; 1240 it->it_value = *rqtp;
@@ -1264,7 +1269,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1264 /* 1269 /*
1265 * We were interrupted by a signal. 1270 * We were interrupted by a signal.
1266 */ 1271 */
1267 *rqtp = ns_to_timespec(timer.it.cpu.expires); 1272 *rqtp = ns_to_timespec64(timer.it.cpu.expires);
1268 error = posix_cpu_timer_set(&timer, 0, &zero_it, it); 1273 error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1269 if (!error) { 1274 if (!error) {
1270 /* 1275 /*
@@ -1301,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1301static long posix_cpu_nsleep_restart(struct restart_block *restart_block); 1306static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1302 1307
1303static int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1308static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1304 struct timespec *rqtp, struct timespec __user *rmtp) 1309 struct timespec64 *rqtp, struct timespec __user *rmtp)
1305{ 1310{
1306 struct restart_block *restart_block = &current->restart_block; 1311 struct restart_block *restart_block = &current->restart_block;
1307 struct itimerspec it; 1312 struct itimerspec64 it;
1313 struct timespec ts;
1308 int error; 1314 int error;
1309 1315
1310 /* 1316 /*
@@ -1312,7 +1318,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1312 */ 1318 */
1313 if (CPUCLOCK_PERTHREAD(which_clock) && 1319 if (CPUCLOCK_PERTHREAD(which_clock) &&
1314 (CPUCLOCK_PID(which_clock) == 0 || 1320 (CPUCLOCK_PID(which_clock) == 0 ||
1315 CPUCLOCK_PID(which_clock) == current->pid)) 1321 CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
1316 return -EINVAL; 1322 return -EINVAL;
1317 1323
1318 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); 1324 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
@@ -1324,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1324 /* 1330 /*
1325 * Report back to the user the time still remaining. 1331 * Report back to the user the time still remaining.
1326 */ 1332 */
1327 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1333 ts = timespec64_to_timespec(it.it_value);
1334 if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp)))
1328 return -EFAULT; 1335 return -EFAULT;
1329 1336
1330 restart_block->fn = posix_cpu_nsleep_restart; 1337 restart_block->fn = posix_cpu_nsleep_restart;
1331 restart_block->nanosleep.clockid = which_clock; 1338 restart_block->nanosleep.clockid = which_clock;
1332 restart_block->nanosleep.rmtp = rmtp; 1339 restart_block->nanosleep.rmtp = rmtp;
1333 restart_block->nanosleep.expires = timespec_to_ns(rqtp); 1340 restart_block->nanosleep.expires = timespec64_to_ns(rqtp);
1334 } 1341 }
1335 return error; 1342 return error;
1336} 1343}
@@ -1338,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1338static long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1345static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1339{ 1346{
1340 clockid_t which_clock = restart_block->nanosleep.clockid; 1347 clockid_t which_clock = restart_block->nanosleep.clockid;
1341 struct timespec t; 1348 struct itimerspec64 it;
1342 struct itimerspec it; 1349 struct timespec64 t;
1350 struct timespec tmp;
1343 int error; 1351 int error;
1344 1352
1345 t = ns_to_timespec(restart_block->nanosleep.expires); 1353 t = ns_to_timespec64(restart_block->nanosleep.expires);
1346 1354
1347 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1355 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1348 1356
@@ -1351,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1351 /* 1359 /*
1352 * Report back to the user the time still remaining. 1360 * Report back to the user the time still remaining.
1353 */ 1361 */
1354 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1362 tmp = timespec64_to_timespec(it.it_value);
1363 if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp)))
1355 return -EFAULT; 1364 return -EFAULT;
1356 1365
1357 restart_block->nanosleep.expires = timespec_to_ns(&t); 1366 restart_block->nanosleep.expires = timespec64_to_ns(&t);
1358 } 1367 }
1359 return error; 1368 return error;
1360 1369
@@ -1364,12 +1373,12 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1364#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1373#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1365 1374
1366static int process_cpu_clock_getres(const clockid_t which_clock, 1375static int process_cpu_clock_getres(const clockid_t which_clock,
1367 struct timespec *tp) 1376 struct timespec64 *tp)
1368{ 1377{
1369 return posix_cpu_clock_getres(PROCESS_CLOCK, tp); 1378 return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1370} 1379}
1371static int process_cpu_clock_get(const clockid_t which_clock, 1380static int process_cpu_clock_get(const clockid_t which_clock,
1372 struct timespec *tp) 1381 struct timespec64 *tp)
1373{ 1382{
1374 return posix_cpu_clock_get(PROCESS_CLOCK, tp); 1383 return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1375} 1384}
@@ -1379,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer)
1379 return posix_cpu_timer_create(timer); 1388 return posix_cpu_timer_create(timer);
1380} 1389}
1381static int process_cpu_nsleep(const clockid_t which_clock, int flags, 1390static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1382 struct timespec *rqtp, 1391 struct timespec64 *rqtp,
1383 struct timespec __user *rmtp) 1392 struct timespec __user *rmtp)
1384{ 1393{
1385 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); 1394 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
@@ -1389,12 +1398,12 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1389 return -EINVAL; 1398 return -EINVAL;
1390} 1399}
1391static int thread_cpu_clock_getres(const clockid_t which_clock, 1400static int thread_cpu_clock_getres(const clockid_t which_clock,
1392 struct timespec *tp) 1401 struct timespec64 *tp)
1393{ 1402{
1394 return posix_cpu_clock_getres(THREAD_CLOCK, tp); 1403 return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1395} 1404}
1396static int thread_cpu_clock_get(const clockid_t which_clock, 1405static int thread_cpu_clock_get(const clockid_t which_clock,
1397 struct timespec *tp) 1406 struct timespec64 *tp)
1398{ 1407{
1399 return posix_cpu_clock_get(THREAD_CLOCK, tp); 1408 return posix_cpu_clock_get(THREAD_CLOCK, tp);
1400} 1409}
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index cd6716e115e8..c0cd53eb018a 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -49,26 +49,32 @@ SYS_NI(alarm);
49SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 49SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
50 const struct timespec __user *, tp) 50 const struct timespec __user *, tp)
51{ 51{
52 struct timespec64 new_tp64;
52 struct timespec new_tp; 53 struct timespec new_tp;
53 54
54 if (which_clock != CLOCK_REALTIME) 55 if (which_clock != CLOCK_REALTIME)
55 return -EINVAL; 56 return -EINVAL;
56 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 57 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
57 return -EFAULT; 58 return -EFAULT;
58 return do_sys_settimeofday(&new_tp, NULL); 59
60 new_tp64 = timespec_to_timespec64(new_tp);
61 return do_sys_settimeofday64(&new_tp64, NULL);
59} 62}
60 63
61SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 64SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
62 struct timespec __user *,tp) 65 struct timespec __user *,tp)
63{ 66{
67 struct timespec64 kernel_tp64;
64 struct timespec kernel_tp; 68 struct timespec kernel_tp;
65 69
66 switch (which_clock) { 70 switch (which_clock) {
67 case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; 71 case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
68 case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; 72 case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
69 case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; 73 case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
70 default: return -EINVAL; 74 default: return -EINVAL;
71 } 75 }
76
77 kernel_tp = timespec64_to_timespec(kernel_tp64);
72 if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 78 if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
73 return -EFAULT; 79 return -EFAULT;
74 return 0; 80 return 0;
@@ -97,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
97 const struct timespec __user *, rqtp, 103 const struct timespec __user *, rqtp,
98 struct timespec __user *, rmtp) 104 struct timespec __user *, rmtp)
99{ 105{
106 struct timespec64 t64;
100 struct timespec t; 107 struct timespec t;
101 108
102 switch (which_clock) { 109 switch (which_clock) {
@@ -105,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
105 case CLOCK_BOOTTIME: 112 case CLOCK_BOOTTIME:
106 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 113 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
107 return -EFAULT; 114 return -EFAULT;
108 if (!timespec_valid(&t)) 115 t64 = timespec_to_timespec64(t);
116 if (!timespec64_valid(&t64))
109 return -EINVAL; 117 return -EINVAL;
110 return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? 118 return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ?
111 HRTIMER_MODE_ABS : HRTIMER_MODE_REL, 119 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
112 which_clock); 120 which_clock);
113 default: 121 default:
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 50a6a47020de..4d7b2ce09c27 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -130,12 +130,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
130/* 130/*
131 * These ones are defined below. 131 * These ones are defined below.
132 */ 132 */
133static int common_nsleep(const clockid_t, int flags, struct timespec *t, 133static int common_nsleep(const clockid_t, int flags, struct timespec64 *t,
134 struct timespec __user *rmtp); 134 struct timespec __user *rmtp);
135static int common_timer_create(struct k_itimer *new_timer); 135static int common_timer_create(struct k_itimer *new_timer);
136static void common_timer_get(struct k_itimer *, struct itimerspec *); 136static void common_timer_get(struct k_itimer *, struct itimerspec64 *);
137static int common_timer_set(struct k_itimer *, int, 137static int common_timer_set(struct k_itimer *, int,
138 struct itimerspec *, struct itimerspec *); 138 struct itimerspec64 *, struct itimerspec64 *);
139static int common_timer_del(struct k_itimer *timer); 139static int common_timer_del(struct k_itimer *timer);
140 140
141static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 141static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
@@ -204,17 +204,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
204} 204}
205 205
206/* Get clock_realtime */ 206/* Get clock_realtime */
207static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) 207static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
208{ 208{
209 ktime_get_real_ts(tp); 209 ktime_get_real_ts64(tp);
210 return 0; 210 return 0;
211} 211}
212 212
213/* Set clock_realtime */ 213/* Set clock_realtime */
214static int posix_clock_realtime_set(const clockid_t which_clock, 214static int posix_clock_realtime_set(const clockid_t which_clock,
215 const struct timespec *tp) 215 const struct timespec64 *tp)
216{ 216{
217 return do_sys_settimeofday(tp, NULL); 217 return do_sys_settimeofday64(tp, NULL);
218} 218}
219 219
220static int posix_clock_realtime_adj(const clockid_t which_clock, 220static int posix_clock_realtime_adj(const clockid_t which_clock,
@@ -226,54 +226,54 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
226/* 226/*
227 * Get monotonic time for posix timers 227 * Get monotonic time for posix timers
228 */ 228 */
229static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) 229static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
230{ 230{
231 ktime_get_ts(tp); 231 ktime_get_ts64(tp);
232 return 0; 232 return 0;
233} 233}
234 234
235/* 235/*
236 * Get monotonic-raw time for posix timers 236 * Get monotonic-raw time for posix timers
237 */ 237 */
238static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 238static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
239{ 239{
240 getrawmonotonic(tp); 240 getrawmonotonic64(tp);
241 return 0; 241 return 0;
242} 242}
243 243
244 244
245static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) 245static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
246{ 246{
247 *tp = current_kernel_time(); 247 *tp = current_kernel_time64();
248 return 0; 248 return 0;
249} 249}
250 250
251static int posix_get_monotonic_coarse(clockid_t which_clock, 251static int posix_get_monotonic_coarse(clockid_t which_clock,
252 struct timespec *tp) 252 struct timespec64 *tp)
253{ 253{
254 *tp = get_monotonic_coarse(); 254 *tp = get_monotonic_coarse64();
255 return 0; 255 return 0;
256} 256}
257 257
258static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 258static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp)
259{ 259{
260 *tp = ktime_to_timespec(KTIME_LOW_RES); 260 *tp = ktime_to_timespec64(KTIME_LOW_RES);
261 return 0; 261 return 0;
262} 262}
263 263
264static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) 264static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
265{ 265{
266 get_monotonic_boottime(tp); 266 get_monotonic_boottime64(tp);
267 return 0; 267 return 0;
268} 268}
269 269
270static int posix_get_tai(clockid_t which_clock, struct timespec *tp) 270static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
271{ 271{
272 timekeeping_clocktai(tp); 272 timekeeping_clocktai64(tp);
273 return 0; 273 return 0;
274} 274}
275 275
276static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) 276static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
277{ 277{
278 tp->tv_sec = 0; 278 tp->tv_sec = 0;
279 tp->tv_nsec = hrtimer_resolution; 279 tp->tv_nsec = hrtimer_resolution;
@@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
734 * report. 734 * report.
735 */ 735 */
736static void 736static void
737common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) 737common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
738{ 738{
739 ktime_t now, remaining, iv; 739 ktime_t now, remaining, iv;
740 struct hrtimer *timer = &timr->it.real.timer; 740 struct hrtimer *timer = &timr->it.real.timer;
741 741
742 memset(cur_setting, 0, sizeof(struct itimerspec)); 742 memset(cur_setting, 0, sizeof(*cur_setting));
743 743
744 iv = timr->it.real.interval; 744 iv = timr->it.real.interval;
745 745
746 /* interval timer ? */ 746 /* interval timer ? */
747 if (iv) 747 if (iv)
748 cur_setting->it_interval = ktime_to_timespec(iv); 748 cur_setting->it_interval = ktime_to_timespec64(iv);
749 else if (!hrtimer_active(timer) && 749 else if (!hrtimer_active(timer) &&
750 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) 750 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
751 return; 751 return;
@@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
771 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) 771 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
772 cur_setting->it_value.tv_nsec = 1; 772 cur_setting->it_value.tv_nsec = 1;
773 } else 773 } else
774 cur_setting->it_value = ktime_to_timespec(remaining); 774 cur_setting->it_value = ktime_to_timespec64(remaining);
775} 775}
776 776
777/* Get the time remaining on a POSIX.1b interval timer. */ 777/* Get the time remaining on a POSIX.1b interval timer. */
778SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 778SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
779 struct itimerspec __user *, setting) 779 struct itimerspec __user *, setting)
780{ 780{
781 struct itimerspec64 cur_setting64;
781 struct itimerspec cur_setting; 782 struct itimerspec cur_setting;
782 struct k_itimer *timr; 783 struct k_itimer *timr;
783 struct k_clock *kc; 784 struct k_clock *kc;
@@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
792 if (WARN_ON_ONCE(!kc || !kc->timer_get)) 793 if (WARN_ON_ONCE(!kc || !kc->timer_get))
793 ret = -EINVAL; 794 ret = -EINVAL;
794 else 795 else
795 kc->timer_get(timr, &cur_setting); 796 kc->timer_get(timr, &cur_setting64);
796 797
797 unlock_timer(timr, flags); 798 unlock_timer(timr, flags);
798 799
800 cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
799 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 801 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
800 return -EFAULT; 802 return -EFAULT;
801 803
@@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
831/* timr->it_lock is taken. */ 833/* timr->it_lock is taken. */
832static int 834static int
833common_timer_set(struct k_itimer *timr, int flags, 835common_timer_set(struct k_itimer *timr, int flags,
834 struct itimerspec *new_setting, struct itimerspec *old_setting) 836 struct itimerspec64 *new_setting, struct itimerspec64 *old_setting)
835{ 837{
836 struct hrtimer *timer = &timr->it.real.timer; 838 struct hrtimer *timer = &timr->it.real.timer;
837 enum hrtimer_mode mode; 839 enum hrtimer_mode mode;
@@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags,
860 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 862 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
861 timr->it.real.timer.function = posix_timer_fn; 863 timr->it.real.timer.function = posix_timer_fn;
862 864
863 hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); 865 hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value));
864 866
865 /* Convert interval */ 867 /* Convert interval */
866 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); 868 timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval);
867 869
868 /* SIGEV_NONE timers are not queued ! See common_timer_get */ 870 /* SIGEV_NONE timers are not queued ! See common_timer_get */
869 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 871 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
@@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
883 const struct itimerspec __user *, new_setting, 885 const struct itimerspec __user *, new_setting,
884 struct itimerspec __user *, old_setting) 886 struct itimerspec __user *, old_setting)
885{ 887{
886 struct k_itimer *timr; 888 struct itimerspec64 new_spec64, old_spec64;
889 struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
887 struct itimerspec new_spec, old_spec; 890 struct itimerspec new_spec, old_spec;
888 int error = 0; 891 struct k_itimer *timr;
889 unsigned long flag; 892 unsigned long flag;
890 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
891 struct k_clock *kc; 893 struct k_clock *kc;
894 int error = 0;
892 895
893 if (!new_setting) 896 if (!new_setting)
894 return -EINVAL; 897 return -EINVAL;
895 898
896 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) 899 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
897 return -EFAULT; 900 return -EFAULT;
901 new_spec64 = itimerspec_to_itimerspec64(&new_spec);
898 902
899 if (!timespec_valid(&new_spec.it_interval) || 903 if (!timespec64_valid(&new_spec64.it_interval) ||
900 !timespec_valid(&new_spec.it_value)) 904 !timespec64_valid(&new_spec64.it_value))
901 return -EINVAL; 905 return -EINVAL;
902retry: 906retry:
903 timr = lock_timer(timer_id, &flag); 907 timr = lock_timer(timer_id, &flag);
@@ -908,7 +912,7 @@ retry:
908 if (WARN_ON_ONCE(!kc || !kc->timer_set)) 912 if (WARN_ON_ONCE(!kc || !kc->timer_set))
909 error = -EINVAL; 913 error = -EINVAL;
910 else 914 else
911 error = kc->timer_set(timr, flags, &new_spec, rtn); 915 error = kc->timer_set(timr, flags, &new_spec64, rtn);
912 916
913 unlock_timer(timr, flag); 917 unlock_timer(timr, flag);
914 if (error == TIMER_RETRY) { 918 if (error == TIMER_RETRY) {
@@ -916,6 +920,7 @@ retry:
916 goto retry; 920 goto retry;
917 } 921 }
918 922
923 old_spec = itimerspec64_to_itimerspec(&old_spec64);
919 if (old_setting && !error && 924 if (old_setting && !error &&
920 copy_to_user(old_setting, &old_spec, sizeof (old_spec))) 925 copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
921 error = -EFAULT; 926 error = -EFAULT;
@@ -1014,6 +1019,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
1014 const struct timespec __user *, tp) 1019 const struct timespec __user *, tp)
1015{ 1020{
1016 struct k_clock *kc = clockid_to_kclock(which_clock); 1021 struct k_clock *kc = clockid_to_kclock(which_clock);
1022 struct timespec64 new_tp64;
1017 struct timespec new_tp; 1023 struct timespec new_tp;
1018 1024
1019 if (!kc || !kc->clock_set) 1025 if (!kc || !kc->clock_set)
@@ -1021,21 +1027,24 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
1021 1027
1022 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 1028 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
1023 return -EFAULT; 1029 return -EFAULT;
1030 new_tp64 = timespec_to_timespec64(new_tp);
1024 1031
1025 return kc->clock_set(which_clock, &new_tp); 1032 return kc->clock_set(which_clock, &new_tp64);
1026} 1033}
1027 1034
1028SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 1035SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
1029 struct timespec __user *,tp) 1036 struct timespec __user *,tp)
1030{ 1037{
1031 struct k_clock *kc = clockid_to_kclock(which_clock); 1038 struct k_clock *kc = clockid_to_kclock(which_clock);
1039 struct timespec64 kernel_tp64;
1032 struct timespec kernel_tp; 1040 struct timespec kernel_tp;
1033 int error; 1041 int error;
1034 1042
1035 if (!kc) 1043 if (!kc)
1036 return -EINVAL; 1044 return -EINVAL;
1037 1045
1038 error = kc->clock_get(which_clock, &kernel_tp); 1046 error = kc->clock_get(which_clock, &kernel_tp64);
1047 kernel_tp = timespec64_to_timespec(kernel_tp64);
1039 1048
1040 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 1049 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
1041 error = -EFAULT; 1050 error = -EFAULT;
@@ -1070,13 +1079,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
1070 struct timespec __user *, tp) 1079 struct timespec __user *, tp)
1071{ 1080{
1072 struct k_clock *kc = clockid_to_kclock(which_clock); 1081 struct k_clock *kc = clockid_to_kclock(which_clock);
1082 struct timespec64 rtn_tp64;
1073 struct timespec rtn_tp; 1083 struct timespec rtn_tp;
1074 int error; 1084 int error;
1075 1085
1076 if (!kc) 1086 if (!kc)
1077 return -EINVAL; 1087 return -EINVAL;
1078 1088
1079 error = kc->clock_getres(which_clock, &rtn_tp); 1089 error = kc->clock_getres(which_clock, &rtn_tp64);
1090 rtn_tp = timespec64_to_timespec(rtn_tp64);
1080 1091
1081 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) 1092 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
1082 error = -EFAULT; 1093 error = -EFAULT;
@@ -1088,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
1088 * nanosleep for monotonic and realtime clocks 1099 * nanosleep for monotonic and realtime clocks
1089 */ 1100 */
1090static int common_nsleep(const clockid_t which_clock, int flags, 1101static int common_nsleep(const clockid_t which_clock, int flags,
1091 struct timespec *tsave, struct timespec __user *rmtp) 1102 struct timespec64 *tsave, struct timespec __user *rmtp)
1092{ 1103{
1093 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 1104 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
1094 HRTIMER_MODE_ABS : HRTIMER_MODE_REL, 1105 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
@@ -1100,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1100 struct timespec __user *, rmtp) 1111 struct timespec __user *, rmtp)
1101{ 1112{
1102 struct k_clock *kc = clockid_to_kclock(which_clock); 1113 struct k_clock *kc = clockid_to_kclock(which_clock);
1114 struct timespec64 t64;
1103 struct timespec t; 1115 struct timespec t;
1104 1116
1105 if (!kc) 1117 if (!kc)
@@ -1110,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1110 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1122 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1111 return -EFAULT; 1123 return -EFAULT;
1112 1124
1113 if (!timespec_valid(&t)) 1125 t64 = timespec_to_timespec64(t);
1126 if (!timespec64_valid(&t64))
1114 return -EINVAL; 1127 return -EINVAL;
1115 1128
1116 return kc->nsleep(which_clock, flags, &t, rmtp); 1129 return kc->nsleep(which_clock, flags, &t64, rmtp);
1117} 1130}
1118 1131
1119/* 1132/*
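
The posix-timers hunks above all follow the same y2038 boundary pattern: the legacy struct timespec/itimerspec is kept only at the copy_from_user()/copy_to_user() boundary, while the k_clock callbacks now take the 64-bit types. A minimal sketch of that pattern, condensed from the hunks above (example_get_time() and its callback pointer are illustrative, not part of the patch):

	static int example_get_time(struct timespec __user *utp,
				    int (*get)(struct timespec64 *))
	{
		struct timespec64 ts64;
		struct timespec ts;
		int err;

		err = get(&ts64);                  /* internal code works in timespec64 */
		ts = timespec64_to_timespec(ts64); /* convert only for the legacy ABI */

		if (!err && copy_to_user(utp, &ts, sizeof(ts)))
			err = -EFAULT;
		return err;
	}
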
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index ea6b610c4c57..2d8f05aad442 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -206,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
206 206
207 update_clock_read_data(&rd); 207 update_clock_read_data(&rd);
208 208
209 if (sched_clock_timer.function != NULL) {
210 /* update timeout for clock wrap */
211 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
212 }
213
209 r = rate; 214 r = rate;
210 if (r >= 4000000) { 215 if (r >= 4000000) {
211 r /= 1000000; 216 r /= 1000000;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7fe53be86077..64c97fc130c4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void)
993 return ts->sleep_length; 993 return ts->sleep_length;
994} 994}
995 995
996/**
997 * tick_nohz_get_idle_calls - return the current idle calls counter value
998 *
999 * Called from the schedutil frequency scaling governor in scheduler context.
1000 */
1001unsigned long tick_nohz_get_idle_calls(void)
1002{
1003 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1004
1005 return ts->idle_calls;
1006}
1007
996static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 1008static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
997{ 1009{
998#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 1010#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
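
tick_nohz_get_idle_calls() exposes the per-CPU idle_calls counter so a cpufreq governor can tell whether the CPU has passed through the nohz idle path since its last update. A hedged sketch of a possible caller; struct my_governor_state and saved_idle_calls are invented for illustration and the real schedutil usage is not shown in this diff:

	struct my_governor_state {
		unsigned long saved_idle_calls;
	};

	/* Returns true if the CPU never entered the nohz idle path since
	 * the previous call, i.e. the counter did not advance.
	 */
	static bool cpu_stayed_busy(struct my_governor_state *st)
	{
		unsigned long idle_calls = tick_nohz_get_idle_calls();
		bool busy = (idle_calls == st->saved_idle_calls);

		st->saved_idle_calls = idle_calls;
		return busy;
	}
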
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 25bdd2504571..49c73c6ed648 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
193SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, 193SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
194 struct timezone __user *, tz) 194 struct timezone __user *, tz)
195{ 195{
196 struct timespec64 new_ts;
196 struct timeval user_tv; 197 struct timeval user_tv;
197 struct timespec new_ts;
198 struct timezone new_tz; 198 struct timezone new_tz;
199 199
200 if (tv) { 200 if (tv) {
@@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
212 return -EFAULT; 212 return -EFAULT;
213 } 213 }
214 214
215 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 215 return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
216} 216}
217 217
218SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) 218SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
@@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
230 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; 230 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
231} 231}
232 232
233/**
234 * current_fs_time - Return FS time
235 * @sb: Superblock.
236 *
237 * Return the current time truncated to the time granularity supported by
238 * the fs.
239 */
240struct timespec current_fs_time(struct super_block *sb)
241{
242 struct timespec now = current_kernel_time();
243 return timespec_trunc(now, sb->s_time_gran);
244}
245EXPORT_SYMBOL(current_fs_time);
246
247/* 233/*
248 * Convert jiffies to milliseconds and back. 234 * Convert jiffies to milliseconds and back.
249 * 235 *
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5b63a2102c29..9652bc57fd09 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -996,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
996 return 0; 996 return 0;
997 997
998 /* Interpolate shortest distance from beginning or end of history */ 998 /* Interpolate shortest distance from beginning or end of history */
999 interp_forward = partial_history_cycles > total_history_cycles/2 ? 999 interp_forward = partial_history_cycles > total_history_cycles / 2;
1000 true : false;
1001 partial_history_cycles = interp_forward ? 1000 partial_history_cycles = interp_forward ?
1002 total_history_cycles - partial_history_cycles : 1001 total_history_cycles - partial_history_cycles :
1003 partial_history_cycles; 1002 partial_history_cycles;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1dc0256bfb6e..152a706ef8b8 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -241,7 +241,7 @@ int timer_migration_handler(struct ctl_table *table, int write,
241 int ret; 241 int ret;
242 242
243 mutex_lock(&mutex); 243 mutex_lock(&mutex);
244 ret = proc_dointvec(table, write, buffer, lenp, ppos); 244 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
245 if (!ret && write) 245 if (!ret && write)
246 timers_update_migration(false); 246 timers_update_migration(false);
247 mutex_unlock(&mutex); 247 mutex_unlock(&mutex);
@@ -1120,7 +1120,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
1120EXPORT_SYMBOL_GPL(add_timer_on); 1120EXPORT_SYMBOL_GPL(add_timer_on);
1121 1121
1122/** 1122/**
1123 * del_timer - deactive a timer. 1123 * del_timer - deactivate a timer.
1124 * @timer: the timer to be deactivated 1124 * @timer: the timer to be deactivated
1125 * 1125 *
1126 * del_timer() deactivates a timer - this works on both active and inactive 1126 * del_timer() deactivates a timer - this works on both active and inactive
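
The switch to proc_dointvec_minmax() in timer_migration_handler() only clamps anything if the matching ctl_table entry supplies bounds in extra1/extra2. A sketch of what such an entry looks like; the table name and the zero/one limit variables are assumptions, not shown in this hunk:

	static int zero;
	static int one = 1;

	static struct ctl_table timer_sysctl_example[] = {
		{
			.procname	= "timer_migration",
			.data		= &sysctl_timer_migration,
			.maxlen		= sizeof(unsigned int),
			.mode		= 0644,
			.proc_handler	= timer_migration_handler,
			.extra1		= &zero,	/* minimum accepted value */
			.extra2		= &one,		/* maximum accepted value */
		},
		{ }
	};
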
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ff8d5c13d04b..0e7f5428a148 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,6 +16,7 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/nmi.h>
19 20
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
21 22
@@ -86,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
86 87
87next_one: 88next_one:
88 i = 0; 89 i = 0;
90
91 touch_nmi_watchdog();
92
89 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 93 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
90 94
91 curr = timerqueue_getnext(&base->active); 95 curr = timerqueue_getnext(&base->active);
@@ -197,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
197{ 201{
198 struct clock_event_device *dev = td->evtdev; 202 struct clock_event_device *dev = td->evtdev;
199 203
204 touch_nmi_watchdog();
205
200 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 206 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
201 if (cpu < 0) 207 if (cpu < 0)
202 SEQ_printf(m, "Broadcast device\n"); 208 SEQ_printf(m, "Broadcast device\n");
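
The touch_nmi_watchdog() calls keep long /proc/timer_list dumps from tripping the lockup detectors on machines with many CPUs and timers, often made worse by a slow console. The pattern is simply to pet the watchdog once per outer iteration of any long printing loop; a small illustrative sketch (struct item and the loop body are hypothetical):

	#include <linux/nmi.h>

	static void dump_many_items(struct seq_file *m,
				    const struct item *items, int nr)
	{
		int i;

		for (i = 0; i < nr; i++) {
			/* long, preemption-hostile dump: keep the watchdog quiet */
			touch_nmi_watchdog();
			seq_printf(m, "item %d: %lu\n", i, items[i].value);
		}
	}
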
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4a06e714645..7e06f04e98fe 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -134,7 +134,8 @@ config FUNCTION_TRACER
134 select KALLSYMS 134 select KALLSYMS
135 select GENERIC_TRACER 135 select GENERIC_TRACER
136 select CONTEXT_SWITCH_TRACER 136 select CONTEXT_SWITCH_TRACER
137 select GLOB 137 select GLOB
138 select TASKS_RCU if PREEMPT
138 help 139 help
139 Enable the kernel to trace every kernel function. This is done 140 Enable the kernel to trace every kernel function. This is done
140 by using a compiler feature to insert a small, 5-byte No-Operation 141 by using a compiler feature to insert a small, 5-byte No-Operation
@@ -455,7 +456,7 @@ config UPROBE_EVENTS
455 select UPROBES 456 select UPROBES
456 select PROBE_EVENTS 457 select PROBE_EVENTS
457 select TRACING 458 select TRACING
458 default n 459 default y
459 help 460 help
460 This allows the user to add tracing events on top of userspace 461 This allows the user to add tracing events on top of userspace
461 dynamic events (similar to tracepoints) on the fly via the trace 462 dynamic events (similar to tracepoints) on the fly via the trace
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2058a7f94bd..193c5f5e3f79 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q)
690 690
691/** 691/**
692 * blk_add_trace_rq - Add a trace for a request oriented action 692 * blk_add_trace_rq - Add a trace for a request oriented action
693 * @q: queue the io is for
694 * @rq: the source request 693 * @rq: the source request
694 * @error: return status to log
695 * @nr_bytes: number of completed bytes 695 * @nr_bytes: number of completed bytes
696 * @what: the action 696 * @what: the action
697 * 697 *
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q)
699 * Records an action against a request. Will log the bio offset + size. 699 * Records an action against a request. Will log the bio offset + size.
700 * 700 *
701 **/ 701 **/
702static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 702static void blk_add_trace_rq(struct request *rq, int error,
703 unsigned int nr_bytes, u32 what) 703 unsigned int nr_bytes, u32 what)
704{ 704{
705 struct blk_trace *bt = q->blk_trace; 705 struct blk_trace *bt = rq->q->blk_trace;
706 706
707 if (likely(!bt)) 707 if (likely(!bt))
708 return; 708 return;
@@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
713 what |= BLK_TC_ACT(BLK_TC_FS); 713 what |= BLK_TC_ACT(BLK_TC_FS);
714 714
715 __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), 715 __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
716 rq->cmd_flags, what, rq->errors, 0, NULL); 716 rq->cmd_flags, what, error, 0, NULL);
717}
718
719static void blk_add_trace_rq_abort(void *ignore,
720 struct request_queue *q, struct request *rq)
721{
722 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
723} 717}
724 718
725static void blk_add_trace_rq_insert(void *ignore, 719static void blk_add_trace_rq_insert(void *ignore,
726 struct request_queue *q, struct request *rq) 720 struct request_queue *q, struct request *rq)
727{ 721{
728 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); 722 blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
729} 723}
730 724
731static void blk_add_trace_rq_issue(void *ignore, 725static void blk_add_trace_rq_issue(void *ignore,
732 struct request_queue *q, struct request *rq) 726 struct request_queue *q, struct request *rq)
733{ 727{
734 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); 728 blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
735} 729}
736 730
737static void blk_add_trace_rq_requeue(void *ignore, 731static void blk_add_trace_rq_requeue(void *ignore,
738 struct request_queue *q, 732 struct request_queue *q,
739 struct request *rq) 733 struct request *rq)
740{ 734{
741 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); 735 blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
742} 736}
743 737
744static void blk_add_trace_rq_complete(void *ignore, 738static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
745 struct request_queue *q, 739 int error, unsigned int nr_bytes)
746 struct request *rq,
747 unsigned int nr_bytes)
748{ 740{
749 blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); 741 blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
750} 742}
751 743
752/** 744/**
@@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore,
941 r.sector_from = cpu_to_be64(from); 933 r.sector_from = cpu_to_be64(from);
942 934
943 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 935 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
944 rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, 936 rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
945 sizeof(r), &r); 937 sizeof(r), &r);
946} 938}
947 939
@@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q,
966 return; 958 return;
967 959
968 __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, 960 __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
969 BLK_TA_DRV_DATA, rq->errors, len, data); 961 BLK_TA_DRV_DATA, 0, len, data);
970} 962}
971EXPORT_SYMBOL_GPL(blk_add_driver_data); 963EXPORT_SYMBOL_GPL(blk_add_driver_data);
972 964
@@ -974,8 +966,6 @@ static void blk_register_tracepoints(void)
974{ 966{
975 int ret; 967 int ret;
976 968
977 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
978 WARN_ON(ret);
979 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); 969 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
980 WARN_ON(ret); 970 WARN_ON(ret);
981 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); 971 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void)
1028 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); 1018 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
1029 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); 1019 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
1030 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); 1020 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
1031 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
1032 1021
1033 tracepoint_synchronize_unregister(); 1022 tracepoint_synchronize_unregister();
1034} 1023}
@@ -1673,14 +1662,14 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1673 goto out; 1662 goto out;
1674 1663
1675 if (attr == &dev_attr_act_mask) { 1664 if (attr == &dev_attr_act_mask) {
1676 if (sscanf(buf, "%llx", &value) != 1) { 1665 if (kstrtoull(buf, 0, &value)) {
1677 /* Assume it is a list of trace category names */ 1666 /* Assume it is a list of trace category names */
1678 ret = blk_trace_str2mask(buf); 1667 ret = blk_trace_str2mask(buf);
1679 if (ret < 0) 1668 if (ret < 0)
1680 goto out; 1669 goto out;
1681 value = ret; 1670 value = ret;
1682 } 1671 }
1683 } else if (sscanf(buf, "%llu", &value) != 1) 1672 } else if (kstrtoull(buf, 0, &value))
1684 goto out; 1673 goto out;
1685 1674
1686 ret = -ENXIO; 1675 ret = -ENXIO;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cee9802cf3e0..460a031c77e5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
96 if (unlikely(in_interrupt() || 96 if (unlikely(in_interrupt() ||
97 current->flags & (PF_KTHREAD | PF_EXITING))) 97 current->flags & (PF_KTHREAD | PF_EXITING)))
98 return -EPERM; 98 return -EPERM;
99 if (unlikely(segment_eq(get_fs(), KERNEL_DS))) 99 if (unlikely(uaccess_kernel()))
100 return -EPERM; 100 return -EPERM;
101 if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) 101 if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
102 return -EPERM; 102 return -EPERM;
@@ -501,16 +501,11 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
501 return true; 501 return true;
502} 502}
503 503
504static const struct bpf_verifier_ops kprobe_prog_ops = { 504const struct bpf_verifier_ops kprobe_prog_ops = {
505 .get_func_proto = kprobe_prog_func_proto, 505 .get_func_proto = kprobe_prog_func_proto,
506 .is_valid_access = kprobe_prog_is_valid_access, 506 .is_valid_access = kprobe_prog_is_valid_access,
507}; 507};
508 508
509static struct bpf_prog_type_list kprobe_tl __ro_after_init = {
510 .ops = &kprobe_prog_ops,
511 .type = BPF_PROG_TYPE_KPROBE,
512};
513
514BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, 509BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
515 u64, flags, void *, data, u64, size) 510 u64, flags, void *, data, u64, size)
516{ 511{
@@ -584,16 +579,11 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
584 return true; 579 return true;
585} 580}
586 581
587static const struct bpf_verifier_ops tracepoint_prog_ops = { 582const struct bpf_verifier_ops tracepoint_prog_ops = {
588 .get_func_proto = tp_prog_func_proto, 583 .get_func_proto = tp_prog_func_proto,
589 .is_valid_access = tp_prog_is_valid_access, 584 .is_valid_access = tp_prog_is_valid_access,
590}; 585};
591 586
592static struct bpf_prog_type_list tracepoint_tl __ro_after_init = {
593 .ops = &tracepoint_prog_ops,
594 .type = BPF_PROG_TYPE_TRACEPOINT,
595};
596
597static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, 587static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
598 enum bpf_reg_type *reg_type) 588 enum bpf_reg_type *reg_type)
599{ 589{
@@ -642,22 +632,8 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
642 return insn - insn_buf; 632 return insn - insn_buf;
643} 633}
644 634
645static const struct bpf_verifier_ops perf_event_prog_ops = { 635const struct bpf_verifier_ops perf_event_prog_ops = {
646 .get_func_proto = tp_prog_func_proto, 636 .get_func_proto = tp_prog_func_proto,
647 .is_valid_access = pe_prog_is_valid_access, 637 .is_valid_access = pe_prog_is_valid_access,
648 .convert_ctx_access = pe_prog_convert_ctx_access, 638 .convert_ctx_access = pe_prog_convert_ctx_access,
649}; 639};
650
651static struct bpf_prog_type_list perf_event_tl __ro_after_init = {
652 .ops = &perf_event_prog_ops,
653 .type = BPF_PROG_TYPE_PERF_EVENT,
654};
655
656static int __init register_kprobe_prog_ops(void)
657{
658 bpf_register_prog_type(&kprobe_tl);
659 bpf_register_prog_type(&tracepoint_tl);
660 bpf_register_prog_type(&perf_event_tl);
661 return 0;
662}
663late_initcall(register_kprobe_prog_ops);
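
With the bpf_prog_type_list structures and the late_initcall gone, kprobe_prog_ops, tracepoint_prog_ops and perf_event_prog_ops are exported so the BPF core can pick them up from a central table instead of run-time registration. A hedged sketch of what such a table plausibly looks like; the header name, macro shape and array name are assumptions, not shown in this diff:

	/* In a shared list header (assumed): one line per program type. */
	BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE,     kprobe_prog_ops)
	BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops)
	BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops)

	/* In the BPF core (assumed): expand the list into a lookup array. */
	static const struct bpf_verifier_ops * const bpf_prog_types[] = {
	#define BPF_PROG_TYPE(_id, _ops) [_id] = &_ops,
	#include <linux/bpf_types.h>
	#undef BPF_PROG_TYPE
	};
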
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index dd3e91d68dc7..74fdfe9ed3db 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -36,6 +36,7 @@
36 36
37#include <trace/events/sched.h> 37#include <trace/events/sched.h>
38 38
39#include <asm/sections.h>
39#include <asm/setup.h> 40#include <asm/setup.h>
40 41
41#include "trace_output.h" 42#include "trace_output.h"
@@ -1095,22 +1096,20 @@ static bool update_all_ops;
1095# error Dynamic ftrace depends on MCOUNT_RECORD 1096# error Dynamic ftrace depends on MCOUNT_RECORD
1096#endif 1097#endif
1097 1098
1098static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
1099
1100struct ftrace_func_probe {
1101 struct hlist_node node;
1102 struct ftrace_probe_ops *ops;
1103 unsigned long flags;
1104 unsigned long ip;
1105 void *data;
1106 struct list_head free_list;
1107};
1108
1109struct ftrace_func_entry { 1099struct ftrace_func_entry {
1110 struct hlist_node hlist; 1100 struct hlist_node hlist;
1111 unsigned long ip; 1101 unsigned long ip;
1112}; 1102};
1113 1103
1104struct ftrace_func_probe {
1105 struct ftrace_probe_ops *probe_ops;
1106 struct ftrace_ops ops;
1107 struct trace_array *tr;
1108 struct list_head list;
1109 void *data;
1110 int ref;
1111};
1112
1114/* 1113/*
1115 * We make these constant because no one should touch them, 1114 * We make these constant because no one should touch them,
1116 * but they are used as the default "empty hash", to avoid allocating 1115 * but they are used as the default "empty hash", to avoid allocating
@@ -1271,7 +1270,7 @@ static void
1271remove_hash_entry(struct ftrace_hash *hash, 1270remove_hash_entry(struct ftrace_hash *hash,
1272 struct ftrace_func_entry *entry) 1271 struct ftrace_func_entry *entry)
1273{ 1272{
1274 hlist_del(&entry->hlist); 1273 hlist_del_rcu(&entry->hlist);
1275 hash->count--; 1274 hash->count--;
1276} 1275}
1277 1276
@@ -2807,18 +2806,28 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2807 * callers are done before leaving this function. 2806 * callers are done before leaving this function.
2808 * The same goes for freeing the per_cpu data of the per_cpu 2807 * The same goes for freeing the per_cpu data of the per_cpu
2809 * ops. 2808 * ops.
2810 *
2811 * Again, normal synchronize_sched() is not good enough.
2812 * We need to do a hard force of sched synchronization.
2813 * This is because we use preempt_disable() to do RCU, but
2814 * the function tracers can be called where RCU is not watching
2815 * (like before user_exit()). We can not rely on the RCU
2816 * infrastructure to do the synchronization, thus we must do it
2817 * ourselves.
2818 */ 2809 */
2819 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { 2810 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
2811 /*
2812 * We need to do a hard force of sched synchronization.
2813 * This is because we use preempt_disable() to do RCU, but
2814 * the function tracers can be called where RCU is not watching
2815 * (like before user_exit()). We can not rely on the RCU
2816 * infrastructure to do the synchronization, thus we must do it
2817 * ourselves.
2818 */
2820 schedule_on_each_cpu(ftrace_sync); 2819 schedule_on_each_cpu(ftrace_sync);
2821 2820
2821 /*
 2822 * When the kernel is preemptive, tasks can be preempted
2823 * while on a ftrace trampoline. Just scheduling a task on
2824 * a CPU is not good enough to flush them. Calling
 2825 * synchronize_rcu_tasks() will wait for those tasks to
2826 * execute and either schedule voluntarily or enter user space.
2827 */
2828 if (IS_ENABLED(CONFIG_PREEMPT))
2829 synchronize_rcu_tasks();
2830
2822 arch_ftrace_trampoline_free(ops); 2831 arch_ftrace_trampoline_free(ops);
2823 2832
2824 if (ops->flags & FTRACE_OPS_FL_PER_CPU) 2833 if (ops->flags & FTRACE_OPS_FL_PER_CPU)
@@ -3055,34 +3064,63 @@ struct ftrace_iterator {
3055 struct ftrace_page *pg; 3064 struct ftrace_page *pg;
3056 struct dyn_ftrace *func; 3065 struct dyn_ftrace *func;
3057 struct ftrace_func_probe *probe; 3066 struct ftrace_func_probe *probe;
3067 struct ftrace_func_entry *probe_entry;
3058 struct trace_parser parser; 3068 struct trace_parser parser;
3059 struct ftrace_hash *hash; 3069 struct ftrace_hash *hash;
3060 struct ftrace_ops *ops; 3070 struct ftrace_ops *ops;
3061 int hidx; 3071 int pidx;
3062 int idx; 3072 int idx;
3063 unsigned flags; 3073 unsigned flags;
3064}; 3074};
3065 3075
3066static void * 3076static void *
3067t_hash_next(struct seq_file *m, loff_t *pos) 3077t_probe_next(struct seq_file *m, loff_t *pos)
3068{ 3078{
3069 struct ftrace_iterator *iter = m->private; 3079 struct ftrace_iterator *iter = m->private;
3080 struct trace_array *tr = iter->ops->private;
3081 struct list_head *func_probes;
3082 struct ftrace_hash *hash;
3083 struct list_head *next;
3070 struct hlist_node *hnd = NULL; 3084 struct hlist_node *hnd = NULL;
3071 struct hlist_head *hhd; 3085 struct hlist_head *hhd;
3086 int size;
3072 3087
3073 (*pos)++; 3088 (*pos)++;
3074 iter->pos = *pos; 3089 iter->pos = *pos;
3075 3090
3076 if (iter->probe) 3091 if (!tr)
3077 hnd = &iter->probe->node;
3078 retry:
3079 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
3080 return NULL; 3092 return NULL;
3081 3093
3082 hhd = &ftrace_func_hash[iter->hidx]; 3094 func_probes = &tr->func_probes;
3095 if (list_empty(func_probes))
3096 return NULL;
3097
3098 if (!iter->probe) {
3099 next = func_probes->next;
3100 iter->probe = list_entry(next, struct ftrace_func_probe, list);
3101 }
3102
3103 if (iter->probe_entry)
3104 hnd = &iter->probe_entry->hlist;
3105
3106 hash = iter->probe->ops.func_hash->filter_hash;
3107 size = 1 << hash->size_bits;
3108
3109 retry:
3110 if (iter->pidx >= size) {
3111 if (iter->probe->list.next == func_probes)
3112 return NULL;
3113 next = iter->probe->list.next;
3114 iter->probe = list_entry(next, struct ftrace_func_probe, list);
3115 hash = iter->probe->ops.func_hash->filter_hash;
3116 size = 1 << hash->size_bits;
3117 iter->pidx = 0;
3118 }
3119
3120 hhd = &hash->buckets[iter->pidx];
3083 3121
3084 if (hlist_empty(hhd)) { 3122 if (hlist_empty(hhd)) {
3085 iter->hidx++; 3123 iter->pidx++;
3086 hnd = NULL; 3124 hnd = NULL;
3087 goto retry; 3125 goto retry;
3088 } 3126 }
@@ -3092,7 +3130,7 @@ t_hash_next(struct seq_file *m, loff_t *pos)
3092 else { 3130 else {
3093 hnd = hnd->next; 3131 hnd = hnd->next;
3094 if (!hnd) { 3132 if (!hnd) {
3095 iter->hidx++; 3133 iter->pidx++;
3096 goto retry; 3134 goto retry;
3097 } 3135 }
3098 } 3136 }
@@ -3100,26 +3138,28 @@ t_hash_next(struct seq_file *m, loff_t *pos)
3100 if (WARN_ON_ONCE(!hnd)) 3138 if (WARN_ON_ONCE(!hnd))
3101 return NULL; 3139 return NULL;
3102 3140
3103 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); 3141 iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist);
3104 3142
3105 return iter; 3143 return iter;
3106} 3144}
3107 3145
3108static void *t_hash_start(struct seq_file *m, loff_t *pos) 3146static void *t_probe_start(struct seq_file *m, loff_t *pos)
3109{ 3147{
3110 struct ftrace_iterator *iter = m->private; 3148 struct ftrace_iterator *iter = m->private;
3111 void *p = NULL; 3149 void *p = NULL;
3112 loff_t l; 3150 loff_t l;
3113 3151
3114 if (!(iter->flags & FTRACE_ITER_DO_HASH)) 3152 if (!(iter->flags & FTRACE_ITER_DO_PROBES))
3115 return NULL; 3153 return NULL;
3116 3154
3117 if (iter->func_pos > *pos) 3155 if (iter->func_pos > *pos)
3118 return NULL; 3156 return NULL;
3119 3157
3120 iter->hidx = 0; 3158 iter->probe = NULL;
3159 iter->probe_entry = NULL;
3160 iter->pidx = 0;
3121 for (l = 0; l <= (*pos - iter->func_pos); ) { 3161 for (l = 0; l <= (*pos - iter->func_pos); ) {
3122 p = t_hash_next(m, &l); 3162 p = t_probe_next(m, &l);
3123 if (!p) 3163 if (!p)
3124 break; 3164 break;
3125 } 3165 }
@@ -3127,50 +3167,42 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
3127 return NULL; 3167 return NULL;
3128 3168
3129 /* Only set this if we have an item */ 3169 /* Only set this if we have an item */
3130 iter->flags |= FTRACE_ITER_HASH; 3170 iter->flags |= FTRACE_ITER_PROBE;
3131 3171
3132 return iter; 3172 return iter;
3133} 3173}
3134 3174
3135static int 3175static int
3136t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) 3176t_probe_show(struct seq_file *m, struct ftrace_iterator *iter)
3137{ 3177{
3138 struct ftrace_func_probe *rec; 3178 struct ftrace_func_entry *probe_entry;
3179 struct ftrace_probe_ops *probe_ops;
3180 struct ftrace_func_probe *probe;
3181
3182 probe = iter->probe;
3183 probe_entry = iter->probe_entry;
3139 3184
3140 rec = iter->probe; 3185 if (WARN_ON_ONCE(!probe || !probe_entry))
3141 if (WARN_ON_ONCE(!rec))
3142 return -EIO; 3186 return -EIO;
3143 3187
3144 if (rec->ops->print) 3188 probe_ops = probe->probe_ops;
3145 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
3146 3189
3147 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); 3190 if (probe_ops->print)
3191 return probe_ops->print(m, probe_entry->ip, probe_ops, probe->data);
3148 3192
3149 if (rec->data) 3193 seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip,
3150 seq_printf(m, ":%p", rec->data); 3194 (void *)probe_ops->func);
3151 seq_putc(m, '\n');
3152 3195
3153 return 0; 3196 return 0;
3154} 3197}
3155 3198
3156static void * 3199static void *
3157t_next(struct seq_file *m, void *v, loff_t *pos) 3200t_func_next(struct seq_file *m, loff_t *pos)
3158{ 3201{
3159 struct ftrace_iterator *iter = m->private; 3202 struct ftrace_iterator *iter = m->private;
3160 struct ftrace_ops *ops = iter->ops;
3161 struct dyn_ftrace *rec = NULL; 3203 struct dyn_ftrace *rec = NULL;
3162 3204
3163 if (unlikely(ftrace_disabled))
3164 return NULL;
3165
3166 if (iter->flags & FTRACE_ITER_HASH)
3167 return t_hash_next(m, pos);
3168
3169 (*pos)++; 3205 (*pos)++;
3170 iter->pos = iter->func_pos = *pos;
3171
3172 if (iter->flags & FTRACE_ITER_PRINTALL)
3173 return t_hash_start(m, pos);
3174 3206
3175 retry: 3207 retry:
3176 if (iter->idx >= iter->pg->index) { 3208 if (iter->idx >= iter->pg->index) {
@@ -3181,11 +3213,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
3181 } 3213 }
3182 } else { 3214 } else {
3183 rec = &iter->pg->records[iter->idx++]; 3215 rec = &iter->pg->records[iter->idx++];
3184 if (((iter->flags & FTRACE_ITER_FILTER) && 3216 if (((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) &&
3185 !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || 3217 !ftrace_lookup_ip(iter->hash, rec->ip)) ||
3186
3187 ((iter->flags & FTRACE_ITER_NOTRACE) &&
3188 !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
3189 3218
3190 ((iter->flags & FTRACE_ITER_ENABLED) && 3219 ((iter->flags & FTRACE_ITER_ENABLED) &&
3191 !(rec->flags & FTRACE_FL_ENABLED))) { 3220 !(rec->flags & FTRACE_FL_ENABLED))) {
@@ -3196,24 +3225,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
3196 } 3225 }
3197 3226
3198 if (!rec) 3227 if (!rec)
3199 return t_hash_start(m, pos); 3228 return NULL;
3200 3229
3230 iter->pos = iter->func_pos = *pos;
3201 iter->func = rec; 3231 iter->func = rec;
3202 3232
3203 return iter; 3233 return iter;
3204} 3234}
3205 3235
3236static void *
3237t_next(struct seq_file *m, void *v, loff_t *pos)
3238{
3239 struct ftrace_iterator *iter = m->private;
 3240 loff_t l = *pos; /* t_probe_start() must use original pos */
3241 void *ret;
3242
3243 if (unlikely(ftrace_disabled))
3244 return NULL;
3245
3246 if (iter->flags & FTRACE_ITER_PROBE)
3247 return t_probe_next(m, pos);
3248
3249 if (iter->flags & FTRACE_ITER_PRINTALL) {
3250 /* next must increment pos, and t_probe_start does not */
3251 (*pos)++;
3252 return t_probe_start(m, &l);
3253 }
3254
3255 ret = t_func_next(m, pos);
3256
3257 if (!ret)
3258 return t_probe_start(m, &l);
3259
3260 return ret;
3261}
3262
3206static void reset_iter_read(struct ftrace_iterator *iter) 3263static void reset_iter_read(struct ftrace_iterator *iter)
3207{ 3264{
3208 iter->pos = 0; 3265 iter->pos = 0;
3209 iter->func_pos = 0; 3266 iter->func_pos = 0;
3210 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); 3267 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE);
3211} 3268}
3212 3269
3213static void *t_start(struct seq_file *m, loff_t *pos) 3270static void *t_start(struct seq_file *m, loff_t *pos)
3214{ 3271{
3215 struct ftrace_iterator *iter = m->private; 3272 struct ftrace_iterator *iter = m->private;
3216 struct ftrace_ops *ops = iter->ops;
3217 void *p = NULL; 3273 void *p = NULL;
3218 loff_t l; 3274 loff_t l;
3219 3275
@@ -3233,20 +3289,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
3233 * off, we can short cut and just print out that all 3289 * off, we can short cut and just print out that all
3234 * functions are enabled. 3290 * functions are enabled.
3235 */ 3291 */
3236 if ((iter->flags & FTRACE_ITER_FILTER && 3292 if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) &&
3237 ftrace_hash_empty(ops->func_hash->filter_hash)) || 3293 ftrace_hash_empty(iter->hash)) {
3238 (iter->flags & FTRACE_ITER_NOTRACE && 3294 iter->func_pos = 1; /* Account for the message */
3239 ftrace_hash_empty(ops->func_hash->notrace_hash))) {
3240 if (*pos > 0) 3295 if (*pos > 0)
3241 return t_hash_start(m, pos); 3296 return t_probe_start(m, pos);
3242 iter->flags |= FTRACE_ITER_PRINTALL; 3297 iter->flags |= FTRACE_ITER_PRINTALL;
3243 /* reset in case of seek/pread */ 3298 /* reset in case of seek/pread */
3244 iter->flags &= ~FTRACE_ITER_HASH; 3299 iter->flags &= ~FTRACE_ITER_PROBE;
3245 return iter; 3300 return iter;
3246 } 3301 }
3247 3302
3248 if (iter->flags & FTRACE_ITER_HASH) 3303 if (iter->flags & FTRACE_ITER_PROBE)
3249 return t_hash_start(m, pos); 3304 return t_probe_start(m, pos);
3250 3305
3251 /* 3306 /*
3252 * Unfortunately, we need to restart at ftrace_pages_start 3307 * Unfortunately, we need to restart at ftrace_pages_start
@@ -3256,13 +3311,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
3256 iter->pg = ftrace_pages_start; 3311 iter->pg = ftrace_pages_start;
3257 iter->idx = 0; 3312 iter->idx = 0;
3258 for (l = 0; l <= *pos; ) { 3313 for (l = 0; l <= *pos; ) {
3259 p = t_next(m, p, &l); 3314 p = t_func_next(m, &l);
3260 if (!p) 3315 if (!p)
3261 break; 3316 break;
3262 } 3317 }
3263 3318
3264 if (!p) 3319 if (!p)
3265 return t_hash_start(m, pos); 3320 return t_probe_start(m, pos);
3266 3321
3267 return iter; 3322 return iter;
3268} 3323}
@@ -3293,8 +3348,8 @@ static int t_show(struct seq_file *m, void *v)
3293 struct ftrace_iterator *iter = m->private; 3348 struct ftrace_iterator *iter = m->private;
3294 struct dyn_ftrace *rec; 3349 struct dyn_ftrace *rec;
3295 3350
3296 if (iter->flags & FTRACE_ITER_HASH) 3351 if (iter->flags & FTRACE_ITER_PROBE)
3297 return t_hash_show(m, iter); 3352 return t_probe_show(m, iter);
3298 3353
3299 if (iter->flags & FTRACE_ITER_PRINTALL) { 3354 if (iter->flags & FTRACE_ITER_PRINTALL) {
3300 if (iter->flags & FTRACE_ITER_NOTRACE) 3355 if (iter->flags & FTRACE_ITER_NOTRACE)
@@ -3355,12 +3410,13 @@ ftrace_avail_open(struct inode *inode, struct file *file)
3355 return -ENODEV; 3410 return -ENODEV;
3356 3411
3357 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 3412 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
3358 if (iter) { 3413 if (!iter)
3359 iter->pg = ftrace_pages_start; 3414 return -ENOMEM;
3360 iter->ops = &global_ops; 3415
3361 } 3416 iter->pg = ftrace_pages_start;
3417 iter->ops = &global_ops;
3362 3418
3363 return iter ? 0 : -ENOMEM; 3419 return 0;
3364} 3420}
3365 3421
3366static int 3422static int
@@ -3369,13 +3425,14 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
3369 struct ftrace_iterator *iter; 3425 struct ftrace_iterator *iter;
3370 3426
3371 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 3427 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
3372 if (iter) { 3428 if (!iter)
3373 iter->pg = ftrace_pages_start; 3429 return -ENOMEM;
3374 iter->flags = FTRACE_ITER_ENABLED; 3430
3375 iter->ops = &global_ops; 3431 iter->pg = ftrace_pages_start;
3376 } 3432 iter->flags = FTRACE_ITER_ENABLED;
3433 iter->ops = &global_ops;
3377 3434
3378 return iter ? 0 : -ENOMEM; 3435 return 0;
3379} 3436}
3380 3437
3381/** 3438/**
@@ -3440,7 +3497,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3440 ret = -ENOMEM; 3497 ret = -ENOMEM;
3441 goto out_unlock; 3498 goto out_unlock;
3442 } 3499 }
3443 } 3500 } else
3501 iter->hash = hash;
3444 3502
3445 if (file->f_mode & FMODE_READ) { 3503 if (file->f_mode & FMODE_READ) {
3446 iter->pg = ftrace_pages_start; 3504 iter->pg = ftrace_pages_start;
@@ -3470,7 +3528,7 @@ ftrace_filter_open(struct inode *inode, struct file *file)
3470 struct ftrace_ops *ops = inode->i_private; 3528 struct ftrace_ops *ops = inode->i_private;
3471 3529
3472 return ftrace_regex_open(ops, 3530 return ftrace_regex_open(ops,
3473 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, 3531 FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES,
3474 inode, file); 3532 inode, file);
3475} 3533}
3476 3534
@@ -3573,22 +3631,20 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
3573 /* blank module name to match all modules */ 3631 /* blank module name to match all modules */
3574 if (!mod_g->len) { 3632 if (!mod_g->len) {
3575 /* blank module globbing: modname xor exclude_mod */ 3633 /* blank module globbing: modname xor exclude_mod */
3576 if ((!exclude_mod) != (!modname)) 3634 if (!exclude_mod != !modname)
3577 goto func_match; 3635 goto func_match;
3578 return 0; 3636 return 0;
3579 } 3637 }
3580 3638
3581 /* not matching the module */ 3639 /*
3582 if (!modname || !mod_matches) { 3640 * exclude_mod is set to trace everything but the given
3583 if (exclude_mod) 3641 * module. If it is set and the module matches, then
 3584 goto func_match; 3642 * return 0. If it is not set and the module doesn't match,
3585 else 3643 * also return 0. Otherwise, check the function to see if
3586 return 0; 3644 * that matches.
3587 } 3645 */
3588 3646 if (!mod_matches == !exclude_mod)
3589 if (mod_matches && exclude_mod)
3590 return 0; 3647 return 0;
3591
3592func_match: 3648func_match:
3593 /* blank search means to match all funcs in the mod */ 3649 /* blank search means to match all funcs in the mod */
3594 if (!func_g->len) 3650 if (!func_g->len)
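
The collapsed module check can be read as an XOR: the record is skipped (return 0) exactly when the truth values of mod_matches and exclude_mod agree. Spelled out:

	exclude_mod  mod_matches  result
	0            0            return 0  (record lives in some other module)
	0            1            fall through to func_match
	1            0            fall through to func_match
	1            1            return 0  (record lives in the excluded module)
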
@@ -3654,6 +3710,56 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
3654 return match_records(hash, buff, len, NULL); 3710 return match_records(hash, buff, len, NULL);
3655} 3711}
3656 3712
3713static void ftrace_ops_update_code(struct ftrace_ops *ops,
3714 struct ftrace_ops_hash *old_hash)
3715{
3716 struct ftrace_ops *op;
3717
3718 if (!ftrace_enabled)
3719 return;
3720
3721 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
3722 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
3723 return;
3724 }
3725
3726 /*
3727 * If this is the shared global_ops filter, then we need to
 3728 * check if there is another ops that shares it and is enabled.
3729 * If so, we still need to run the modify code.
3730 */
3731 if (ops->func_hash != &global_ops.local_hash)
3732 return;
3733
3734 do_for_each_ftrace_op(op, ftrace_ops_list) {
3735 if (op->func_hash == &global_ops.local_hash &&
3736 op->flags & FTRACE_OPS_FL_ENABLED) {
3737 ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
3738 /* Only need to do this once */
3739 return;
3740 }
3741 } while_for_each_ftrace_op(op);
3742}
3743
3744static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
3745 struct ftrace_hash **orig_hash,
3746 struct ftrace_hash *hash,
3747 int enable)
3748{
3749 struct ftrace_ops_hash old_hash_ops;
3750 struct ftrace_hash *old_hash;
3751 int ret;
3752
3753 old_hash = *orig_hash;
3754 old_hash_ops.filter_hash = ops->func_hash->filter_hash;
3755 old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
3756 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3757 if (!ret) {
3758 ftrace_ops_update_code(ops, &old_hash_ops);
3759 free_ftrace_hash_rcu(old_hash);
3760 }
3761 return ret;
3762}
3657 3763
3658/* 3764/*
3659 * We register the module command as a template to show others how 3765 * We register the module command as a template to show others how
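
ftrace_hash_move_and_update_ops() bundles the old-hash snapshot, the hash move, the FTRACE_UPDATE_CALLS pass and the RCU free into one helper, so callers later in this patch can presumably shrink to a single call under ftrace_lock. A sketch of such a caller; the wrapper name is illustrative:

	static int example_apply_filter(struct ftrace_ops *ops,
					struct ftrace_hash *hash, int enable)
	{
		struct ftrace_hash **orig_hash = enable ?
			&ops->func_hash->filter_hash :
			&ops->func_hash->notrace_hash;
		int ret;

		mutex_lock(&ftrace_lock);
		ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
		mutex_unlock(&ftrace_lock);

		return ret;
	}
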
@@ -3661,7 +3767,7 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
3661 */ 3767 */
3662 3768
3663static int 3769static int
3664ftrace_mod_callback(struct ftrace_hash *hash, 3770ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
3665 char *func, char *cmd, char *module, int enable) 3771 char *func, char *cmd, char *module, int enable)
3666{ 3772{
3667 int ret; 3773 int ret;
@@ -3695,16 +3801,11 @@ core_initcall(ftrace_mod_cmd_init);
3695static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 3801static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
3696 struct ftrace_ops *op, struct pt_regs *pt_regs) 3802 struct ftrace_ops *op, struct pt_regs *pt_regs)
3697{ 3803{
3698 struct ftrace_func_probe *entry; 3804 struct ftrace_probe_ops *probe_ops;
3699 struct hlist_head *hhd; 3805 struct ftrace_func_probe *probe;
3700 unsigned long key;
3701 3806
3702 key = hash_long(ip, FTRACE_HASH_BITS); 3807 probe = container_of(op, struct ftrace_func_probe, ops);
3703 3808 probe_ops = probe->probe_ops;
3704 hhd = &ftrace_func_hash[key];
3705
3706 if (hlist_empty(hhd))
3707 return;
3708 3809
3709 /* 3810 /*
3710 * Disable preemption for these calls to prevent a RCU grace 3811 * Disable preemption for these calls to prevent a RCU grace
@@ -3712,213 +3813,340 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
3712 * on the hash. rcu_read_lock is too dangerous here. 3813 * on the hash. rcu_read_lock is too dangerous here.
3713 */ 3814 */
3714 preempt_disable_notrace(); 3815 preempt_disable_notrace();
3715 hlist_for_each_entry_rcu_notrace(entry, hhd, node) { 3816 probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data);
3716 if (entry->ip == ip)
3717 entry->ops->func(ip, parent_ip, &entry->data);
3718 }
3719 preempt_enable_notrace(); 3817 preempt_enable_notrace();
3720} 3818}
3721 3819
3722static struct ftrace_ops trace_probe_ops __read_mostly = 3820struct ftrace_func_map {
3723{ 3821 struct ftrace_func_entry entry;
3724 .func = function_trace_probe_call, 3822 void *data;
3725 .flags = FTRACE_OPS_FL_INITIALIZED,
3726 INIT_OPS_HASH(trace_probe_ops)
3727}; 3823};
3728 3824
3729static int ftrace_probe_registered; 3825struct ftrace_func_mapper {
3826 struct ftrace_hash hash;
3827};
3730 3828
3731static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) 3829/**
3830 * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper
3831 *
3832 * Returns a ftrace_func_mapper descriptor that can be used to map ips to data.
3833 */
3834struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
3732{ 3835{
3733 int ret; 3836 struct ftrace_hash *hash;
3734 int i;
3735 3837
3736 if (ftrace_probe_registered) { 3838 /*
3737 /* still need to update the function call sites */ 3839 * The mapper is simply a ftrace_hash, but since the entries
3738 if (ftrace_enabled) 3840 * in the hash are not ftrace_func_entry type, we define it
3739 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, 3841 * as a separate structure.
3740 old_hash); 3842 */
3741 return; 3843 hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
3742 } 3844 return (struct ftrace_func_mapper *)hash;
3845}
3743 3846
3744 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3847/**
3745 struct hlist_head *hhd = &ftrace_func_hash[i]; 3848 * ftrace_func_mapper_find_ip - Find some data mapped to an ip
3746 if (hhd->first) 3849 * @mapper: The mapper that has the ip maps
3747 break; 3850 * @ip: the instruction pointer to find the data for
3748 } 3851 *
 3749 /* Nothing registered? */ 3852 * Returns the data mapped to @ip if found, otherwise NULL. The return
3750 if (i == FTRACE_FUNC_HASHSIZE) 3853 * is actually the address of the mapper data pointer. The address is
3751 return; 3854 * returned for use cases where the data is no bigger than a long, and
3855 * the user can use the data pointer as its data instead of having to
3856 * allocate more memory for the reference.
3857 */
3858void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
3859 unsigned long ip)
3860{
3861 struct ftrace_func_entry *entry;
3862 struct ftrace_func_map *map;
3752 3863
3753 ret = ftrace_startup(&trace_probe_ops, 0); 3864 entry = ftrace_lookup_ip(&mapper->hash, ip);
3865 if (!entry)
3866 return NULL;
3754 3867
3755 ftrace_probe_registered = 1; 3868 map = (struct ftrace_func_map *)entry;
3869 return &map->data;
3756} 3870}
3757 3871
3758static bool __disable_ftrace_function_probe(void) 3872/**
3873 * ftrace_func_mapper_add_ip - Map some data to an ip
3874 * @mapper: The mapper that has the ip maps
3875 * @ip: The instruction pointer address to map @data to
3876 * @data: The data to map to @ip
3877 *
 3878 * Returns 0 on success, otherwise an error.
3879 */
3880int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
3881 unsigned long ip, void *data)
3759{ 3882{
3760 int i; 3883 struct ftrace_func_entry *entry;
3884 struct ftrace_func_map *map;
3761 3885
3762 if (!ftrace_probe_registered) 3886 entry = ftrace_lookup_ip(&mapper->hash, ip);
3763 return false; 3887 if (entry)
3888 return -EBUSY;
3764 3889
3765 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3890 map = kmalloc(sizeof(*map), GFP_KERNEL);
3766 struct hlist_head *hhd = &ftrace_func_hash[i]; 3891 if (!map)
3767 if (hhd->first) 3892 return -ENOMEM;
3768 return false;
3769 }
3770 3893
3771 /* no more funcs left */ 3894 map->entry.ip = ip;
3772 ftrace_shutdown(&trace_probe_ops, 0); 3895 map->data = data;
3773 3896
3774 ftrace_probe_registered = 0; 3897 __add_hash_entry(&mapper->hash, &map->entry);
3775 return true;
3776}
3777 3898
3899 return 0;
3900}
3778 3901
3779static void ftrace_free_entry(struct ftrace_func_probe *entry) 3902/**
3903 * ftrace_func_mapper_remove_ip - Remove an ip from the mapping
3904 * @mapper: The mapper that has the ip maps
3905 * @ip: The instruction pointer address to remove the data from
3906 *
3907 * Returns the data if it is found, otherwise NULL.
 3908 * Note, if the data pointer is used as the data itself (see
 3909 * ftrace_func_mapper_find_ip()), then the return value may be meaningless
 3910 * if the data pointer was set to zero.
3911 */
3912void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper,
3913 unsigned long ip)
3780{ 3914{
3781 if (entry->ops->free) 3915 struct ftrace_func_entry *entry;
3782 entry->ops->free(entry->ops, entry->ip, &entry->data); 3916 struct ftrace_func_map *map;
3917 void *data;
3918
3919 entry = ftrace_lookup_ip(&mapper->hash, ip);
3920 if (!entry)
3921 return NULL;
3922
3923 map = (struct ftrace_func_map *)entry;
3924 data = map->data;
3925
3926 remove_hash_entry(&mapper->hash, entry);
3783 kfree(entry); 3927 kfree(entry);
3928
3929 return data;
3930}
3931
3932/**
3933 * free_ftrace_func_mapper - free a mapping of ips and data
3934 * @mapper: The mapper that has the ip maps
3935 * @free_func: A function to be called on each data item.
3936 *
3937 * This is used to free the function mapper. The @free_func is optional
3938 * and can be used if the data needs to be freed as well.
3939 */
3940void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
3941 ftrace_mapper_func free_func)
3942{
3943 struct ftrace_func_entry *entry;
3944 struct ftrace_func_map *map;
3945 struct hlist_head *hhd;
3946 int size = 1 << mapper->hash.size_bits;
3947 int i;
3948
3949 if (free_func && mapper->hash.count) {
3950 for (i = 0; i < size; i++) {
3951 hhd = &mapper->hash.buckets[i];
3952 hlist_for_each_entry(entry, hhd, hlist) {
3953 map = (struct ftrace_func_map *)entry;
3954 free_func(map);
3955 }
3956 }
3957 }
3958 free_ftrace_hash(&mapper->hash);
3959}
3960
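
The kernel-doc above documents the mapper calls individually; a hedged sketch of how they combine, keeping a per-ip hit count directly in the mapped pointer slot (small enough to fit in a long, as the ftrace_func_mapper_find_ip() comment suggests). The example_* names and the calling context are invented for illustration:

	static struct ftrace_func_mapper *example_mapper;

	static void example_count_hit(unsigned long ip)
	{
		void **data;

		data = ftrace_func_mapper_find_ip(example_mapper, ip);
		if (!data) {
			/* first hit on this ip: map a count of 1 to it */
			ftrace_func_mapper_add_ip(example_mapper, ip, (void *)1UL);
			return;
		}
		*data = (void *)((unsigned long)*data + 1);
	}

	static int example_init(void)
	{
		example_mapper = allocate_ftrace_func_mapper();
		return example_mapper ? 0 : -ENOMEM;
	}

	static void example_exit(void)
	{
		/* counts live in the pointer slots themselves, nothing to free */
		free_ftrace_func_mapper(example_mapper, NULL);
	}
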
3961static void release_probe(struct ftrace_func_probe *probe)
3962{
3963 struct ftrace_probe_ops *probe_ops;
3964
3965 mutex_lock(&ftrace_lock);
3966
3967 WARN_ON(probe->ref <= 0);
3968
3969 /* Subtract the ref that was used to protect this instance */
3970 probe->ref--;
3971
3972 if (!probe->ref) {
3973 probe_ops = probe->probe_ops;
3974 /*
3975 * Sending zero as ip tells probe_ops to free
3976 * the probe->data itself
3977 */
3978 if (probe_ops->free)
3979 probe_ops->free(probe_ops, probe->tr, 0, probe->data);
3980 list_del(&probe->list);
3981 kfree(probe);
3982 }
3983 mutex_unlock(&ftrace_lock);
3984}
3985
3986static void acquire_probe_locked(struct ftrace_func_probe *probe)
3987{
3988 /*
3989 * Add one ref to keep it from being freed when releasing the
3990 * ftrace_lock mutex.
3991 */
3992 probe->ref++;
3784} 3993}
3785 3994
3786int 3995int
3787register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3996register_ftrace_function_probe(char *glob, struct trace_array *tr,
3788 void *data) 3997 struct ftrace_probe_ops *probe_ops,
3998 void *data)
3789{ 3999{
3790 struct ftrace_ops_hash old_hash_ops; 4000 struct ftrace_func_entry *entry;
3791 struct ftrace_func_probe *entry; 4001 struct ftrace_func_probe *probe;
3792 struct ftrace_glob func_g; 4002 struct ftrace_hash **orig_hash;
3793 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 4003 struct ftrace_hash *old_hash;
3794 struct ftrace_hash *old_hash = *orig_hash;
3795 struct ftrace_hash *hash; 4004 struct ftrace_hash *hash;
3796 struct ftrace_page *pg;
3797 struct dyn_ftrace *rec;
3798 int not;
3799 unsigned long key;
3800 int count = 0; 4005 int count = 0;
4006 int size;
3801 int ret; 4007 int ret;
4008 int i;
3802 4009
3803 func_g.type = filter_parse_regex(glob, strlen(glob), 4010 if (WARN_ON(!tr))
3804 &func_g.search, &not);
3805 func_g.len = strlen(func_g.search);
3806
3807 /* we do not support '!' for function probes */
3808 if (WARN_ON(not))
3809 return -EINVAL; 4011 return -EINVAL;
3810 4012
3811 mutex_lock(&trace_probe_ops.func_hash->regex_lock); 4013 /* We do not support '!' for function probes */
4014 if (WARN_ON(glob[0] == '!'))
4015 return -EINVAL;
3812 4016
3813 old_hash_ops.filter_hash = old_hash;
3814 /* Probes only have filters */
3815 old_hash_ops.notrace_hash = NULL;
3816 4017
3817 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); 4018 mutex_lock(&ftrace_lock);
3818 if (!hash) { 4019 /* Check if the probe_ops is already registered */
3819 count = -ENOMEM; 4020 list_for_each_entry(probe, &tr->func_probes, list) {
3820 goto out; 4021 if (probe->probe_ops == probe_ops)
4022 break;
3821 } 4023 }
3822 4024 if (&probe->list == &tr->func_probes) {
3823 if (unlikely(ftrace_disabled)) { 4025 probe = kzalloc(sizeof(*probe), GFP_KERNEL);
3824 count = -ENODEV; 4026 if (!probe) {
3825 goto out; 4027 mutex_unlock(&ftrace_lock);
4028 return -ENOMEM;
4029 }
4030 probe->probe_ops = probe_ops;
4031 probe->ops.func = function_trace_probe_call;
4032 probe->tr = tr;
4033 ftrace_ops_init(&probe->ops);
4034 list_add(&probe->list, &tr->func_probes);
3826 } 4035 }
3827 4036
3828 mutex_lock(&ftrace_lock); 4037 acquire_probe_locked(probe);
3829 4038
3830 do_for_each_ftrace_rec(pg, rec) { 4039 mutex_unlock(&ftrace_lock);
3831 4040
3832 if (rec->flags & FTRACE_FL_DISABLED) 4041 mutex_lock(&probe->ops.func_hash->regex_lock);
3833 continue;
3834 4042
3835 if (!ftrace_match_record(rec, &func_g, NULL, 0)) 4043 orig_hash = &probe->ops.func_hash->filter_hash;
3836 continue; 4044 old_hash = *orig_hash;
4045 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3837 4046
3838 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 4047 ret = ftrace_match_records(hash, glob, strlen(glob));
3839 if (!entry) {
3840 /* If we did not process any, then return error */
3841 if (!count)
3842 count = -ENOMEM;
3843 goto out_unlock;
3844 }
3845 4048
3846 count++; 4049 /* Nothing found? */
4050 if (!ret)
4051 ret = -EINVAL;
3847 4052
3848 entry->data = data; 4053 if (ret < 0)
4054 goto out;
3849 4055
3850 /* 4056 size = 1 << hash->size_bits;
3851 * The caller might want to do something special 4057 for (i = 0; i < size; i++) {
3852 * for each function we find. We call the callback 4058 hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
3853 * to give the caller an opportunity to do so. 4059 if (ftrace_lookup_ip(old_hash, entry->ip))
3854 */
3855 if (ops->init) {
3856 if (ops->init(ops, rec->ip, &entry->data) < 0) {
3857 /* caller does not like this func */
3858 kfree(entry);
3859 continue; 4060 continue;
4061 /*
4062 * The caller might want to do something special
4063 * for each function we find. We call the callback
4064 * to give the caller an opportunity to do so.
4065 */
4066 if (probe_ops->init) {
4067 ret = probe_ops->init(probe_ops, tr,
4068 entry->ip, data,
4069 &probe->data);
4070 if (ret < 0) {
4071 if (probe_ops->free && count)
4072 probe_ops->free(probe_ops, tr,
4073 0, probe->data);
4074 probe->data = NULL;
4075 goto out;
4076 }
3860 } 4077 }
4078 count++;
3861 } 4079 }
4080 }
3862 4081
3863 ret = enter_record(hash, rec, 0); 4082 mutex_lock(&ftrace_lock);
3864 if (ret < 0) {
3865 kfree(entry);
3866 count = ret;
3867 goto out_unlock;
3868 }
3869
3870 entry->ops = ops;
3871 entry->ip = rec->ip;
3872
3873 key = hash_long(entry->ip, FTRACE_HASH_BITS);
3874 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
3875 4083
3876 } while_for_each_ftrace_rec(); 4084 if (!count) {
4085 /* Nothing was added? */
4086 ret = -EINVAL;
4087 goto out_unlock;
4088 }
3877 4089
3878 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 4090 ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash,
4091 hash, 1);
4092 if (ret < 0)
4093 goto err_unlock;
3879 4094
3880 __enable_ftrace_function_probe(&old_hash_ops); 4095 /* One ref for each new function traced */
4096 probe->ref += count;
3881 4097
3882 if (!ret) 4098 if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED))
3883 free_ftrace_hash_rcu(old_hash); 4099 ret = ftrace_startup(&probe->ops, 0);
3884 else
3885 count = ret;
3886 4100
3887 out_unlock: 4101 out_unlock:
3888 mutex_unlock(&ftrace_lock); 4102 mutex_unlock(&ftrace_lock);
4103
4104 if (!ret)
4105 ret = count;
3889 out: 4106 out:
3890 mutex_unlock(&trace_probe_ops.func_hash->regex_lock); 4107 mutex_unlock(&probe->ops.func_hash->regex_lock);
3891 free_ftrace_hash(hash); 4108 free_ftrace_hash(hash);
3892 4109
3893 return count; 4110 release_probe(probe);
3894}
3895 4111
3896enum { 4112 return ret;
3897 PROBE_TEST_FUNC = 1,
3898 PROBE_TEST_DATA = 2
3899};
3900 4113
3901static void 4114 err_unlock:
3902__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 4115 if (!probe_ops->free || !count)
3903 void *data, int flags) 4116 goto out_unlock;
4117
4118 /* Failed to do the move, need to call the free functions */
4119 for (i = 0; i < size; i++) {
4120 hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
4121 if (ftrace_lookup_ip(old_hash, entry->ip))
4122 continue;
4123 probe_ops->free(probe_ops, tr, entry->ip, probe->data);
4124 }
4125 }
4126 goto out_unlock;
4127}
4128
4129int
4130unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
4131 struct ftrace_probe_ops *probe_ops)
3904{ 4132{
3905 struct ftrace_ops_hash old_hash_ops; 4133 struct ftrace_ops_hash old_hash_ops;
3906 struct ftrace_func_entry *rec_entry; 4134 struct ftrace_func_entry *entry;
3907 struct ftrace_func_probe *entry; 4135 struct ftrace_func_probe *probe;
3908 struct ftrace_func_probe *p;
3909 struct ftrace_glob func_g; 4136 struct ftrace_glob func_g;
3910 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 4137 struct ftrace_hash **orig_hash;
3911 struct ftrace_hash *old_hash = *orig_hash; 4138 struct ftrace_hash *old_hash;
3912 struct list_head free_list; 4139 struct ftrace_hash *hash = NULL;
3913 struct ftrace_hash *hash;
3914 struct hlist_node *tmp; 4140 struct hlist_node *tmp;
4141 struct hlist_head hhd;
3915 char str[KSYM_SYMBOL_LEN]; 4142 char str[KSYM_SYMBOL_LEN];
3916 int i, ret; 4143 int count = 0;
3917 bool disabled; 4144 int i, ret = -ENODEV;
4145 int size;
3918 4146
3919 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) 4147 if (!glob || !strlen(glob) || !strcmp(glob, "*"))
3920 func_g.search = NULL; 4148 func_g.search = NULL;
3921 else if (glob) { 4149 else {
3922 int not; 4150 int not;
3923 4151
3924 func_g.type = filter_parse_regex(glob, strlen(glob), 4152 func_g.type = filter_parse_regex(glob, strlen(glob),
@@ -3928,95 +4156,112 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3928 4156
3929 /* we do not support '!' for function probes */ 4157 /* we do not support '!' for function probes */
3930 if (WARN_ON(not)) 4158 if (WARN_ON(not))
3931 return; 4159 return -EINVAL;
3932 } 4160 }
3933 4161
3934 mutex_lock(&trace_probe_ops.func_hash->regex_lock); 4162 mutex_lock(&ftrace_lock);
4163 /* Check if the probe_ops is already registered */
4164 list_for_each_entry(probe, &tr->func_probes, list) {
4165 if (probe->probe_ops == probe_ops)
4166 break;
4167 }
4168 if (&probe->list == &tr->func_probes)
4169 goto err_unlock_ftrace;
4170
4171 ret = -EINVAL;
4172 if (!(probe->ops.flags & FTRACE_OPS_FL_INITIALIZED))
4173 goto err_unlock_ftrace;
4174
4175 acquire_probe_locked(probe);
4176
4177 mutex_unlock(&ftrace_lock);
4178
4179 mutex_lock(&probe->ops.func_hash->regex_lock);
4180
4181 orig_hash = &probe->ops.func_hash->filter_hash;
4182 old_hash = *orig_hash;
4183
4184 if (ftrace_hash_empty(old_hash))
4185 goto out_unlock;
3935 4186
3936 old_hash_ops.filter_hash = old_hash; 4187 old_hash_ops.filter_hash = old_hash;
3937 /* Probes only have filters */ 4188 /* Probes only have filters */
3938 old_hash_ops.notrace_hash = NULL; 4189 old_hash_ops.notrace_hash = NULL;
3939 4190
3940 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 4191 ret = -ENOMEM;
4192 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3941 if (!hash) 4193 if (!hash)
3942 /* Hmm, should report this somehow */
3943 goto out_unlock; 4194 goto out_unlock;
3944 4195
3945 INIT_LIST_HEAD(&free_list); 4196 INIT_HLIST_HEAD(&hhd);
3946
3947 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3948 struct hlist_head *hhd = &ftrace_func_hash[i];
3949 4197
3950 hlist_for_each_entry_safe(entry, tmp, hhd, node) { 4198 size = 1 << hash->size_bits;
3951 4199 for (i = 0; i < size; i++) {
3952 /* break up if statements for readability */ 4200 hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) {
3953 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
3954 continue;
3955
3956 if ((flags & PROBE_TEST_DATA) && entry->data != data)
3957 continue;
3958 4201
3959 /* do this last, since it is the most expensive */
3960 if (func_g.search) { 4202 if (func_g.search) {
3961 kallsyms_lookup(entry->ip, NULL, NULL, 4203 kallsyms_lookup(entry->ip, NULL, NULL,
3962 NULL, str); 4204 NULL, str);
3963 if (!ftrace_match(str, &func_g)) 4205 if (!ftrace_match(str, &func_g))
3964 continue; 4206 continue;
3965 } 4207 }
3966 4208 count++;
3967 rec_entry = ftrace_lookup_ip(hash, entry->ip); 4209 remove_hash_entry(hash, entry);
3968 /* It is possible more than one entry had this ip */ 4210 hlist_add_head(&entry->hlist, &hhd);
3969 if (rec_entry)
3970 free_hash_entry(hash, rec_entry);
3971
3972 hlist_del_rcu(&entry->node);
3973 list_add(&entry->free_list, &free_list);
3974 } 4211 }
3975 } 4212 }
4213
4214 /* Nothing found? */
4215 if (!count) {
4216 ret = -EINVAL;
4217 goto out_unlock;
4218 }
4219
3976 mutex_lock(&ftrace_lock); 4220 mutex_lock(&ftrace_lock);
3977 disabled = __disable_ftrace_function_probe(); 4221
3978 /* 4222 WARN_ON(probe->ref < count);
3979 * Remove after the disable is called. Otherwise, if the last 4223
3980 * probe is removed, a null hash means *all enabled*. 4224 probe->ref -= count;
3981 */ 4225
3982 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 4226 if (ftrace_hash_empty(hash))
4227 ftrace_shutdown(&probe->ops, 0);
4228
4229 ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash,
4230 hash, 1);
3983 4231
3984 /* still need to update the function call sites */ 4232 /* still need to update the function call sites */
3985 if (ftrace_enabled && !disabled) 4233 if (ftrace_enabled && !ftrace_hash_empty(hash))
3986 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, 4234 ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS,
3987 &old_hash_ops); 4235 &old_hash_ops);
3988 synchronize_sched(); 4236 synchronize_sched();
3989 if (!ret)
3990 free_ftrace_hash_rcu(old_hash);
3991 4237
3992 list_for_each_entry_safe(entry, p, &free_list, free_list) { 4238 hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) {
3993 list_del(&entry->free_list); 4239 hlist_del(&entry->hlist);
3994 ftrace_free_entry(entry); 4240 if (probe_ops->free)
4241 probe_ops->free(probe_ops, tr, entry->ip, probe->data);
4242 kfree(entry);
3995 } 4243 }
3996 mutex_unlock(&ftrace_lock); 4244 mutex_unlock(&ftrace_lock);
3997 4245
3998 out_unlock: 4246 out_unlock:
3999 mutex_unlock(&trace_probe_ops.func_hash->regex_lock); 4247 mutex_unlock(&probe->ops.func_hash->regex_lock);
4000 free_ftrace_hash(hash); 4248 free_ftrace_hash(hash);
4001}
4002 4249
4003void 4250 release_probe(probe);
4004unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
4005 void *data)
4006{
4007 __unregister_ftrace_function_probe(glob, ops, data,
4008 PROBE_TEST_FUNC | PROBE_TEST_DATA);
4009}
4010 4251
4011void 4252 return ret;
4012unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) 4253
4013{ 4254 err_unlock_ftrace:
4014 __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); 4255 mutex_unlock(&ftrace_lock);
4256 return ret;
4015} 4257}
4016 4258
4017void unregister_ftrace_function_probe_all(char *glob) 4259void clear_ftrace_function_probes(struct trace_array *tr)
4018{ 4260{
4019 __unregister_ftrace_function_probe(glob, NULL, NULL, 0); 4261 struct ftrace_func_probe *probe, *n;
4262
4263 list_for_each_entry_safe(probe, n, &tr->func_probes, list)
4264 unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops);
4020} 4265}
4021 4266
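[Editor's note, not part of the patch] The hunks above move probe registration from the single global trace_probe_ops onto a per-instance list hanging off the trace_array. As orientation, a hypothetical caller of the reworked API (declared later in trace.h in this patch) would look roughly like the sketch below; the names my_probe_func/my_probe_ops/my_attach are illustrative only, and the usual kernel/trace headers are assumed to be in scope.

/* Hypothetical user of the per-instance probe API (illustrative sketch) */
static void my_probe_func(unsigned long ip, unsigned long parent_ip,
			  struct trace_array *tr,
			  struct ftrace_probe_ops *ops, void *data)
{
	/* runs on every hit of a matched function in this trace instance */
}

static struct ftrace_probe_ops my_probe_ops = {
	.func	= my_probe_func,
};

static int my_attach(struct trace_array *tr)
{
	/* returns the number of functions attached, or a negative error */
	return register_ftrace_function_probe("vfs_*", tr, &my_probe_ops, NULL);
}

static void my_detach(struct trace_array *tr)
{
	/* a NULL or "*" glob would remove every location of this probe */
	unregister_ftrace_function_probe_func("vfs_*", tr, &my_probe_ops);
}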
4022static LIST_HEAD(ftrace_commands); 4267static LIST_HEAD(ftrace_commands);
@@ -4068,9 +4313,11 @@ __init int unregister_ftrace_command(struct ftrace_func_command *cmd)
4068 return ret; 4313 return ret;
4069} 4314}
4070 4315
4071static int ftrace_process_regex(struct ftrace_hash *hash, 4316static int ftrace_process_regex(struct ftrace_iterator *iter,
4072 char *buff, int len, int enable) 4317 char *buff, int len, int enable)
4073{ 4318{
4319 struct ftrace_hash *hash = iter->hash;
4320 struct trace_array *tr = iter->ops->private;
4074 char *func, *command, *next = buff; 4321 char *func, *command, *next = buff;
4075 struct ftrace_func_command *p; 4322 struct ftrace_func_command *p;
4076 int ret = -EINVAL; 4323 int ret = -EINVAL;
@@ -4090,10 +4337,13 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
4090 4337
4091 command = strsep(&next, ":"); 4338 command = strsep(&next, ":");
4092 4339
4340 if (WARN_ON_ONCE(!tr))
4341 return -EINVAL;
4342
4093 mutex_lock(&ftrace_cmd_mutex); 4343 mutex_lock(&ftrace_cmd_mutex);
4094 list_for_each_entry(p, &ftrace_commands, list) { 4344 list_for_each_entry(p, &ftrace_commands, list) {
4095 if (strcmp(p->name, command) == 0) { 4345 if (strcmp(p->name, command) == 0) {
4096 ret = p->func(hash, func, command, next, enable); 4346 ret = p->func(tr, hash, func, command, next, enable);
4097 goto out_unlock; 4347 goto out_unlock;
4098 } 4348 }
4099 } 4349 }
@@ -4130,7 +4380,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
4130 4380
4131 if (read >= 0 && trace_parser_loaded(parser) && 4381 if (read >= 0 && trace_parser_loaded(parser) &&
4132 !trace_parser_cont(parser)) { 4382 !trace_parser_cont(parser)) {
4133 ret = ftrace_process_regex(iter->hash, parser->buffer, 4383 ret = ftrace_process_regex(iter, parser->buffer,
4134 parser->idx, enable); 4384 parser->idx, enable);
4135 trace_parser_clear(parser); 4385 trace_parser_clear(parser);
4136 if (ret < 0) 4386 if (ret < 0)
@@ -4175,44 +4425,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
4175 return add_hash_entry(hash, ip); 4425 return add_hash_entry(hash, ip);
4176} 4426}
4177 4427
4178static void ftrace_ops_update_code(struct ftrace_ops *ops,
4179 struct ftrace_ops_hash *old_hash)
4180{
4181 struct ftrace_ops *op;
4182
4183 if (!ftrace_enabled)
4184 return;
4185
4186 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
4187 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
4188 return;
4189 }
4190
4191 /*
4192 * If this is the shared global_ops filter, then we need to
4193 * check if there is another ops that shares it, is enabled.
4194 * If so, we still need to run the modify code.
4195 */
4196 if (ops->func_hash != &global_ops.local_hash)
4197 return;
4198
4199 do_for_each_ftrace_op(op, ftrace_ops_list) {
4200 if (op->func_hash == &global_ops.local_hash &&
4201 op->flags & FTRACE_OPS_FL_ENABLED) {
4202 ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
4203 /* Only need to do this once */
4204 return;
4205 }
4206 } while_for_each_ftrace_op(op);
4207}
4208
4209static int 4428static int
4210ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, 4429ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
4211 unsigned long ip, int remove, int reset, int enable) 4430 unsigned long ip, int remove, int reset, int enable)
4212{ 4431{
4213 struct ftrace_hash **orig_hash; 4432 struct ftrace_hash **orig_hash;
4214 struct ftrace_ops_hash old_hash_ops;
4215 struct ftrace_hash *old_hash;
4216 struct ftrace_hash *hash; 4433 struct ftrace_hash *hash;
4217 int ret; 4434 int ret;
4218 4435
@@ -4247,14 +4464,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
4247 } 4464 }
4248 4465
4249 mutex_lock(&ftrace_lock); 4466 mutex_lock(&ftrace_lock);
4250 old_hash = *orig_hash; 4467 ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
4251 old_hash_ops.filter_hash = ops->func_hash->filter_hash;
4252 old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
4253 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
4254 if (!ret) {
4255 ftrace_ops_update_code(ops, &old_hash_ops);
4256 free_ftrace_hash_rcu(old_hash);
4257 }
4258 mutex_unlock(&ftrace_lock); 4468 mutex_unlock(&ftrace_lock);
4259 4469
4260 out_regex_unlock: 4470 out_regex_unlock:
@@ -4493,10 +4703,8 @@ static void __init set_ftrace_early_filters(void)
4493int ftrace_regex_release(struct inode *inode, struct file *file) 4703int ftrace_regex_release(struct inode *inode, struct file *file)
4494{ 4704{
4495 struct seq_file *m = (struct seq_file *)file->private_data; 4705 struct seq_file *m = (struct seq_file *)file->private_data;
4496 struct ftrace_ops_hash old_hash_ops;
4497 struct ftrace_iterator *iter; 4706 struct ftrace_iterator *iter;
4498 struct ftrace_hash **orig_hash; 4707 struct ftrace_hash **orig_hash;
4499 struct ftrace_hash *old_hash;
4500 struct trace_parser *parser; 4708 struct trace_parser *parser;
4501 int filter_hash; 4709 int filter_hash;
4502 int ret; 4710 int ret;
@@ -4526,16 +4734,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4526 orig_hash = &iter->ops->func_hash->notrace_hash; 4734 orig_hash = &iter->ops->func_hash->notrace_hash;
4527 4735
4528 mutex_lock(&ftrace_lock); 4736 mutex_lock(&ftrace_lock);
4529 old_hash = *orig_hash; 4737 ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
4530 old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; 4738 iter->hash, filter_hash);
4531 old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
4532 ret = ftrace_hash_move(iter->ops, filter_hash,
4533 orig_hash, iter->hash);
4534 if (!ret) {
4535 ftrace_ops_update_code(iter->ops, &old_hash_ops);
4536 free_ftrace_hash_rcu(old_hash);
4537 }
4538 mutex_unlock(&ftrace_lock); 4739 mutex_unlock(&ftrace_lock);
4740 } else {
4741 /* For read only, the hash is the ops hash */
4742 iter->hash = NULL;
4539 } 4743 }
4540 4744
4541 mutex_unlock(&iter->ops->func_hash->regex_lock); 4745 mutex_unlock(&iter->ops->func_hash->regex_lock);
@@ -5274,6 +5478,50 @@ void ftrace_module_init(struct module *mod)
5274} 5478}
5275#endif /* CONFIG_MODULES */ 5479#endif /* CONFIG_MODULES */
5276 5480
5481void __init ftrace_free_init_mem(void)
5482{
5483 unsigned long start = (unsigned long)(&__init_begin);
5484 unsigned long end = (unsigned long)(&__init_end);
5485 struct ftrace_page **last_pg = &ftrace_pages_start;
5486 struct ftrace_page *pg;
5487 struct dyn_ftrace *rec;
5488 struct dyn_ftrace key;
5489 int order;
5490
5491 key.ip = start;
5492 key.flags = end; /* overload flags, as it is unsigned long */
5493
5494 mutex_lock(&ftrace_lock);
5495
5496 for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
5497 if (end < pg->records[0].ip ||
5498 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
5499 continue;
5500 again:
5501 rec = bsearch(&key, pg->records, pg->index,
5502 sizeof(struct dyn_ftrace),
5503 ftrace_cmp_recs);
5504 if (!rec)
5505 continue;
5506 pg->index--;
5507 if (!pg->index) {
5508 *last_pg = pg->next;
5509 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
5510 free_pages((unsigned long)pg->records, order);
5511 kfree(pg);
5512 pg = container_of(last_pg, struct ftrace_page, next);
5513 if (!(*last_pg))
5514 ftrace_pages = pg;
5515 continue;
5516 }
5517 memmove(rec, rec + 1,
5518 (pg->index - (rec - pg->records)) * sizeof(*rec));
5519 /* More than one function may be in this block */
5520 goto again;
5521 }
5522 mutex_unlock(&ftrace_lock);
5523}
5524
5277void __init ftrace_init(void) 5525void __init ftrace_init(void)
5278{ 5526{
5279 extern unsigned long __start_mcount_loc[]; 5527 extern unsigned long __start_mcount_loc[];
@@ -5316,25 +5564,13 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
5316 5564
5317static void ftrace_update_trampoline(struct ftrace_ops *ops) 5565static void ftrace_update_trampoline(struct ftrace_ops *ops)
5318{ 5566{
5319
5320/*
5321 * Currently there's no safe way to free a trampoline when the kernel
5322 * is configured with PREEMPT. That is because a task could be preempted
5323 * when it jumped to the trampoline, it may be preempted for a long time
5324 * depending on the system load, and currently there's no way to know
5325 * when it will be off the trampoline. If the trampoline is freed
5326 * too early, when the task runs again, it will be executing on freed
5327 * memory and crash.
5328 */
5329#ifdef CONFIG_PREEMPT
5330 /* Currently, only non dynamic ops can have a trampoline */
5331 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
5332 return;
5333#endif
5334
5335 arch_ftrace_update_trampoline(ops); 5567 arch_ftrace_update_trampoline(ops);
5336} 5568}
5337 5569
5570void ftrace_init_trace_array(struct trace_array *tr)
5571{
5572 INIT_LIST_HEAD(&tr->func_probes);
5573}
5338#else 5574#else
5339 5575
5340static struct ftrace_ops global_ops = { 5576static struct ftrace_ops global_ops = {
@@ -5389,6 +5625,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr)
5389{ 5625{
5390 tr->ops = &global_ops; 5626 tr->ops = &global_ops;
5391 tr->ops->private = tr; 5627 tr->ops->private = tr;
5628 ftrace_init_trace_array(tr);
5392} 5629}
5393 5630
5394void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) 5631void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
@@ -5543,6 +5780,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
5543 trace_ignore_this_task(pid_list, next)); 5780 trace_ignore_this_task(pid_list, next));
5544} 5781}
5545 5782
5783static void
5784ftrace_pid_follow_sched_process_fork(void *data,
5785 struct task_struct *self,
5786 struct task_struct *task)
5787{
5788 struct trace_pid_list *pid_list;
5789 struct trace_array *tr = data;
5790
5791 pid_list = rcu_dereference_sched(tr->function_pids);
5792 trace_filter_add_remove_task(pid_list, self, task);
5793}
5794
5795static void
5796ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
5797{
5798 struct trace_pid_list *pid_list;
5799 struct trace_array *tr = data;
5800
5801 pid_list = rcu_dereference_sched(tr->function_pids);
5802 trace_filter_add_remove_task(pid_list, NULL, task);
5803}
5804
5805void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
5806{
5807 if (enable) {
5808 register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
5809 tr);
5810 register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
5811 tr);
5812 } else {
5813 unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
5814 tr);
5815 unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
5816 tr);
5817 }
5818}
5819
5546static void clear_ftrace_pids(struct trace_array *tr) 5820static void clear_ftrace_pids(struct trace_array *tr)
5547{ 5821{
5548 struct trace_pid_list *pid_list; 5822 struct trace_pid_list *pid_list;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ca47a4fa2986..4ae268e687fe 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -438,6 +438,7 @@ struct ring_buffer_per_cpu {
438 raw_spinlock_t reader_lock; /* serialize readers */ 438 raw_spinlock_t reader_lock; /* serialize readers */
439 arch_spinlock_t lock; 439 arch_spinlock_t lock;
440 struct lock_class_key lock_key; 440 struct lock_class_key lock_key;
441 struct buffer_data_page *free_page;
441 unsigned long nr_pages; 442 unsigned long nr_pages;
442 unsigned int current_context; 443 unsigned int current_context;
443 struct list_head *pages; 444 struct list_head *pages;
@@ -4389,9 +4390,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
4389 */ 4390 */
4390void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) 4391void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
4391{ 4392{
4392 struct buffer_data_page *bpage; 4393 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
4394 struct buffer_data_page *bpage = NULL;
4395 unsigned long flags;
4393 struct page *page; 4396 struct page *page;
4394 4397
4398 local_irq_save(flags);
4399 arch_spin_lock(&cpu_buffer->lock);
4400
4401 if (cpu_buffer->free_page) {
4402 bpage = cpu_buffer->free_page;
4403 cpu_buffer->free_page = NULL;
4404 }
4405
4406 arch_spin_unlock(&cpu_buffer->lock);
4407 local_irq_restore(flags);
4408
4409 if (bpage)
4410 goto out;
4411
4395 page = alloc_pages_node(cpu_to_node(cpu), 4412 page = alloc_pages_node(cpu_to_node(cpu),
4396 GFP_KERNEL | __GFP_NORETRY, 0); 4413 GFP_KERNEL | __GFP_NORETRY, 0);
4397 if (!page) 4414 if (!page)
@@ -4399,6 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
4399 4416
4400 bpage = page_address(page); 4417 bpage = page_address(page);
4401 4418
4419 out:
4402 rb_init_page(bpage); 4420 rb_init_page(bpage);
4403 4421
4404 return bpage; 4422 return bpage;
@@ -4408,13 +4426,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
4408/** 4426/**
4409 * ring_buffer_free_read_page - free an allocated read page 4427 * ring_buffer_free_read_page - free an allocated read page
4410 * @buffer: the buffer the page was allocated for 4428 * @buffer: the buffer the page was allocated for
4429 * @cpu: the cpu buffer the page came from
4411 * @data: the page to free 4430 * @data: the page to free
4412 * 4431 *
4413 * Free a page allocated from ring_buffer_alloc_read_page. 4432 * Free a page allocated from ring_buffer_alloc_read_page.
4414 */ 4433 */
4415void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) 4434void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
4416{ 4435{
4417 free_page((unsigned long)data); 4436 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
4437 struct buffer_data_page *bpage = data;
4438 unsigned long flags;
4439
4440 local_irq_save(flags);
4441 arch_spin_lock(&cpu_buffer->lock);
4442
4443 if (!cpu_buffer->free_page) {
4444 cpu_buffer->free_page = bpage;
4445 bpage = NULL;
4446 }
4447
4448 arch_spin_unlock(&cpu_buffer->lock);
4449 local_irq_restore(flags);
4450
4451 free_page((unsigned long)bpage);
4418} 4452}
4419EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); 4453EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
4420 4454
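[Editor's note, not part of the patch] With the cpu argument added to the free path and the new per-CPU free_page cache, a ring-buffer consumer now follows a cycle along these lines. The wrapper below is a hypothetical sketch built only from the functions touched in this hunk.

/* Hypothetical consumer illustrating the updated read-page cycle */
static void read_one_page(struct ring_buffer *buffer, int cpu)
{
	void *page;
	int ret;

	/* may hand back the per-CPU cached free_page instead of allocating */
	page = ring_buffer_alloc_read_page(buffer, cpu);
	if (!page)
		return;

	ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 0);
	if (ret >= 0) {
		/* ... consume the events copied into 'page' ... */
	}

	/* cpu is now required so the page can be parked on that CPU's free_page */
	ring_buffer_free_read_page(buffer, cpu, page);
}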
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index c190a4d5013c..9fbcaf567886 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -171,7 +171,7 @@ static enum event_status read_page(int cpu)
171 } 171 }
172 } 172 }
173 } 173 }
174 ring_buffer_free_read_page(buffer, bpage); 174 ring_buffer_free_read_page(buffer, cpu, bpage);
175 175
176 if (ret < 0) 176 if (ret < 0)
177 return EVENT_DROPPED; 177 return EVENT_DROPPED;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0ad75e9698f6..1122f151466f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -257,7 +257,7 @@ unsigned long long ns2usecs(u64 nsec)
257 257
258/* trace_flags that are default zero for instances */ 258/* trace_flags that are default zero for instances */
259#define ZEROED_TRACE_FLAGS \ 259#define ZEROED_TRACE_FLAGS \
260 TRACE_ITER_EVENT_FORK 260 (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
261 261
262/* 262/*
263 * The global_trace is the descriptor that holds the top-level tracing 263 * The global_trace is the descriptor that holds the top-level tracing
@@ -757,7 +757,7 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer,
757 return event; 757 return event;
758} 758}
759 759
760static void tracer_tracing_on(struct trace_array *tr) 760void tracer_tracing_on(struct trace_array *tr)
761{ 761{
762 if (tr->trace_buffer.buffer) 762 if (tr->trace_buffer.buffer)
763 ring_buffer_record_on(tr->trace_buffer.buffer); 763 ring_buffer_record_on(tr->trace_buffer.buffer);
@@ -894,23 +894,8 @@ int __trace_bputs(unsigned long ip, const char *str)
894EXPORT_SYMBOL_GPL(__trace_bputs); 894EXPORT_SYMBOL_GPL(__trace_bputs);
895 895
896#ifdef CONFIG_TRACER_SNAPSHOT 896#ifdef CONFIG_TRACER_SNAPSHOT
897/** 897static void tracing_snapshot_instance(struct trace_array *tr)
898 * trace_snapshot - take a snapshot of the current buffer.
899 *
900 * This causes a swap between the snapshot buffer and the current live
901 * tracing buffer. You can use this to take snapshots of the live
902 * trace when some condition is triggered, but continue to trace.
903 *
904 * Note, make sure to allocate the snapshot with either
905 * a tracing_snapshot_alloc(), or by doing it manually
906 * with: echo 1 > /sys/kernel/debug/tracing/snapshot
907 *
908 * If the snapshot buffer is not allocated, it will stop tracing.
909 * Basically making a permanent snapshot.
910 */
911void tracing_snapshot(void)
912{ 898{
913 struct trace_array *tr = &global_trace;
914 struct tracer *tracer = tr->current_trace; 899 struct tracer *tracer = tr->current_trace;
915 unsigned long flags; 900 unsigned long flags;
916 901
@@ -938,6 +923,27 @@ void tracing_snapshot(void)
938 update_max_tr(tr, current, smp_processor_id()); 923 update_max_tr(tr, current, smp_processor_id());
939 local_irq_restore(flags); 924 local_irq_restore(flags);
940} 925}
926
927/**
928 * trace_snapshot - take a snapshot of the current buffer.
929 *
930 * This causes a swap between the snapshot buffer and the current live
931 * tracing buffer. You can use this to take snapshots of the live
932 * trace when some condition is triggered, but continue to trace.
933 *
934 * Note, make sure to allocate the snapshot with either
935 * a tracing_snapshot_alloc(), or by doing it manually
936 * with: echo 1 > /sys/kernel/debug/tracing/snapshot
937 *
938 * If the snapshot buffer is not allocated, it will stop tracing.
939 * Basically making a permanent snapshot.
940 */
941void tracing_snapshot(void)
942{
943 struct trace_array *tr = &global_trace;
944
945 tracing_snapshot_instance(tr);
946}
941EXPORT_SYMBOL_GPL(tracing_snapshot); 947EXPORT_SYMBOL_GPL(tracing_snapshot);
942 948
943static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, 949static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
@@ -1039,7 +1045,7 @@ void tracing_snapshot_alloc(void)
1039EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); 1045EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
1040#endif /* CONFIG_TRACER_SNAPSHOT */ 1046#endif /* CONFIG_TRACER_SNAPSHOT */
1041 1047
1042static void tracer_tracing_off(struct trace_array *tr) 1048void tracer_tracing_off(struct trace_array *tr)
1043{ 1049{
1044 if (tr->trace_buffer.buffer) 1050 if (tr->trace_buffer.buffer)
1045 ring_buffer_record_off(tr->trace_buffer.buffer); 1051 ring_buffer_record_off(tr->trace_buffer.buffer);
@@ -1424,6 +1430,28 @@ static int wait_on_pipe(struct trace_iterator *iter, bool full)
1424} 1430}
1425 1431
1426#ifdef CONFIG_FTRACE_STARTUP_TEST 1432#ifdef CONFIG_FTRACE_STARTUP_TEST
1433static bool selftests_can_run;
1434
1435struct trace_selftests {
1436 struct list_head list;
1437 struct tracer *type;
1438};
1439
1440static LIST_HEAD(postponed_selftests);
1441
1442static int save_selftest(struct tracer *type)
1443{
1444 struct trace_selftests *selftest;
1445
1446 selftest = kmalloc(sizeof(*selftest), GFP_KERNEL);
1447 if (!selftest)
1448 return -ENOMEM;
1449
1450 selftest->type = type;
1451 list_add(&selftest->list, &postponed_selftests);
1452 return 0;
1453}
1454
1427static int run_tracer_selftest(struct tracer *type) 1455static int run_tracer_selftest(struct tracer *type)
1428{ 1456{
1429 struct trace_array *tr = &global_trace; 1457 struct trace_array *tr = &global_trace;
@@ -1434,6 +1462,14 @@ static int run_tracer_selftest(struct tracer *type)
1434 return 0; 1462 return 0;
1435 1463
1436 /* 1464 /*
1465 * If a tracer registers early in boot up (before scheduling is
1466 * initialized and such), then do not run its selftests yet.
1467 * Instead, run it a little later in the boot process.
1468 */
1469 if (!selftests_can_run)
1470 return save_selftest(type);
1471
1472 /*
1437 * Run a selftest on this tracer. 1473 * Run a selftest on this tracer.
1438 * Here we reset the trace buffer, and set the current 1474 * Here we reset the trace buffer, and set the current
1439 * tracer to be this tracer. The tracer can then run some 1475 * tracer to be this tracer. The tracer can then run some
@@ -1482,6 +1518,47 @@ static int run_tracer_selftest(struct tracer *type)
1482 printk(KERN_CONT "PASSED\n"); 1518 printk(KERN_CONT "PASSED\n");
1483 return 0; 1519 return 0;
1484} 1520}
1521
1522static __init int init_trace_selftests(void)
1523{
1524 struct trace_selftests *p, *n;
1525 struct tracer *t, **last;
1526 int ret;
1527
1528 selftests_can_run = true;
1529
1530 mutex_lock(&trace_types_lock);
1531
1532 if (list_empty(&postponed_selftests))
1533 goto out;
1534
1535 pr_info("Running postponed tracer tests:\n");
1536
1537 list_for_each_entry_safe(p, n, &postponed_selftests, list) {
1538 ret = run_tracer_selftest(p->type);
1539 /* If the test fails, then warn and remove from available_tracers */
1540 if (ret < 0) {
1541 WARN(1, "tracer: %s failed selftest, disabling\n",
1542 p->type->name);
1543 last = &trace_types;
1544 for (t = trace_types; t; t = t->next) {
1545 if (t == p->type) {
1546 *last = t->next;
1547 break;
1548 }
1549 last = &t->next;
1550 }
1551 }
1552 list_del(&p->list);
1553 kfree(p);
1554 }
1555
1556 out:
1557 mutex_unlock(&trace_types_lock);
1558
1559 return 0;
1560}
1561core_initcall(init_trace_selftests);
1485#else 1562#else
1486static inline int run_tracer_selftest(struct tracer *type) 1563static inline int run_tracer_selftest(struct tracer *type)
1487{ 1564{
@@ -1899,7 +1976,7 @@ static void __trace_find_cmdline(int pid, char comm[])
1899 1976
1900 map = savedcmd->map_pid_to_cmdline[pid]; 1977 map = savedcmd->map_pid_to_cmdline[pid];
1901 if (map != NO_CMDLINE_MAP) 1978 if (map != NO_CMDLINE_MAP)
1902 strcpy(comm, get_saved_cmdlines(map)); 1979 strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
1903 else 1980 else
1904 strcpy(comm, "<...>"); 1981 strcpy(comm, "<...>");
1905} 1982}
@@ -1927,6 +2004,18 @@ void tracing_record_cmdline(struct task_struct *tsk)
1927 __this_cpu_write(trace_cmdline_save, false); 2004 __this_cpu_write(trace_cmdline_save, false);
1928} 2005}
1929 2006
2007/*
2008 * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq
2009 * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function
2010 * simplifies those functions and keeps them in sync.
2011 */
2012enum print_line_t trace_handle_return(struct trace_seq *s)
2013{
2014 return trace_seq_has_overflowed(s) ?
2015 TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
2016}
2017EXPORT_SYMBOL_GPL(trace_handle_return);
2018
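[Editor's note, not part of the patch] The helper above centralizes the overflow check that event output callbacks used to open-code. A typical trace_event print callback would use it roughly as follows; my_event_output is a hypothetical name and the callback signature is the standard trace_event one, not something introduced by this patch.

/* Hypothetical trace_event output callback using trace_handle_return() */
static enum print_line_t my_event_output(struct trace_iterator *iter,
					 int flags, struct trace_event *event)
{
	trace_seq_printf(&iter->seq, "my_event fired\n");
	/* PARTIAL_LINE if the seq buffer overflowed, HANDLED otherwise */
	return trace_handle_return(&iter->seq);
}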
1930void 2019void
1931tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, 2020tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1932 int pc) 2021 int pc)
@@ -2479,7 +2568,36 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
2479void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 2568void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
2480 int pc) 2569 int pc)
2481{ 2570{
2482 __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); 2571 struct ring_buffer *buffer = tr->trace_buffer.buffer;
2572
2573 if (rcu_is_watching()) {
2574 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
2575 return;
2576 }
2577
2578 /*
2579 * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
2580 * but if the above rcu_is_watching() failed, then the NMI
2581 * triggered someplace critical, and rcu_irq_enter() should
2582 * not be called from NMI.
2583 */
2584 if (unlikely(in_nmi()))
2585 return;
2586
2587 /*
2588 * It is possible that a function is being traced in a
2589 * location that RCU is not watching. A call to
2590 * rcu_irq_enter() will make sure that it is, but there are
2591 * a few internal rcu functions that could be traced
2592 * where that won't work either. In those cases, we just
2593 * do nothing.
2594 */
2595 if (unlikely(rcu_irq_enter_disabled()))
2596 return;
2597
2598 rcu_irq_enter_irqson();
2599 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
2600 rcu_irq_exit_irqson();
2483} 2601}
2484 2602
2485/** 2603/**
@@ -3222,13 +3340,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
3222 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) 3340 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
3223 return; 3341 return;
3224 3342
3225 if (iter->started && cpumask_test_cpu(iter->cpu, iter->started)) 3343 if (cpumask_available(iter->started) &&
3344 cpumask_test_cpu(iter->cpu, iter->started))
3226 return; 3345 return;
3227 3346
3228 if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) 3347 if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
3229 return; 3348 return;
3230 3349
3231 if (iter->started) 3350 if (cpumask_available(iter->started))
3232 cpumask_set_cpu(iter->cpu, iter->started); 3351 cpumask_set_cpu(iter->cpu, iter->started);
3233 3352
3234 /* Don't print started cpu buffer for the first entry of the trace */ 3353 /* Don't print started cpu buffer for the first entry of the trace */
@@ -4122,6 +4241,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
4122 if (mask == TRACE_ITER_EVENT_FORK) 4241 if (mask == TRACE_ITER_EVENT_FORK)
4123 trace_event_follow_fork(tr, enabled); 4242 trace_event_follow_fork(tr, enabled);
4124 4243
4244 if (mask == TRACE_ITER_FUNC_FORK)
4245 ftrace_pid_follow_fork(tr, enabled);
4246
4125 if (mask == TRACE_ITER_OVERWRITE) { 4247 if (mask == TRACE_ITER_OVERWRITE) {
4126 ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); 4248 ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
4127#ifdef CONFIG_TRACER_MAX_TRACE 4249#ifdef CONFIG_TRACER_MAX_TRACE
@@ -4355,6 +4477,7 @@ static const char readme_msg[] =
4355 "\t -:[<group>/]<event>\n" 4477 "\t -:[<group>/]<event>\n"
4356#ifdef CONFIG_KPROBE_EVENTS 4478#ifdef CONFIG_KPROBE_EVENTS
4357 "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" 4479 "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
4480 "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
4358#endif 4481#endif
4359#ifdef CONFIG_UPROBE_EVENTS 4482#ifdef CONFIG_UPROBE_EVENTS
4360 "\t place: <path>:<offset>\n" 4483 "\t place: <path>:<offset>\n"
@@ -5529,7 +5652,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
5529 .partial = partial_def, 5652 .partial = partial_def,
5530 .nr_pages = 0, /* This gets updated below. */ 5653 .nr_pages = 0, /* This gets updated below. */
5531 .nr_pages_max = PIPE_DEF_BUFFERS, 5654 .nr_pages_max = PIPE_DEF_BUFFERS,
5532 .flags = flags,
5533 .ops = &tracing_pipe_buf_ops, 5655 .ops = &tracing_pipe_buf_ops,
5534 .spd_release = tracing_spd_release_pipe, 5656 .spd_release = tracing_spd_release_pipe,
5535 }; 5657 };
@@ -5962,6 +6084,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
5962struct ftrace_buffer_info { 6084struct ftrace_buffer_info {
5963 struct trace_iterator iter; 6085 struct trace_iterator iter;
5964 void *spare; 6086 void *spare;
6087 unsigned int spare_cpu;
5965 unsigned int read; 6088 unsigned int read;
5966}; 6089};
5967 6090
@@ -6291,9 +6414,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
6291 return -EBUSY; 6414 return -EBUSY;
6292#endif 6415#endif
6293 6416
6294 if (!info->spare) 6417 if (!info->spare) {
6295 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, 6418 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
6296 iter->cpu_file); 6419 iter->cpu_file);
6420 info->spare_cpu = iter->cpu_file;
6421 }
6297 if (!info->spare) 6422 if (!info->spare)
6298 return -ENOMEM; 6423 return -ENOMEM;
6299 6424
@@ -6353,7 +6478,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
6353 __trace_array_put(iter->tr); 6478 __trace_array_put(iter->tr);
6354 6479
6355 if (info->spare) 6480 if (info->spare)
6356 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); 6481 ring_buffer_free_read_page(iter->trace_buffer->buffer,
6482 info->spare_cpu, info->spare);
6357 kfree(info); 6483 kfree(info);
6358 6484
6359 mutex_unlock(&trace_types_lock); 6485 mutex_unlock(&trace_types_lock);
@@ -6364,6 +6490,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
6364struct buffer_ref { 6490struct buffer_ref {
6365 struct ring_buffer *buffer; 6491 struct ring_buffer *buffer;
6366 void *page; 6492 void *page;
6493 int cpu;
6367 int ref; 6494 int ref;
6368}; 6495};
6369 6496
@@ -6375,7 +6502,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
6375 if (--ref->ref) 6502 if (--ref->ref)
6376 return; 6503 return;
6377 6504
6378 ring_buffer_free_read_page(ref->buffer, ref->page); 6505 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
6379 kfree(ref); 6506 kfree(ref);
6380 buf->private = 0; 6507 buf->private = 0;
6381} 6508}
@@ -6409,7 +6536,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
6409 if (--ref->ref) 6536 if (--ref->ref)
6410 return; 6537 return;
6411 6538
6412 ring_buffer_free_read_page(ref->buffer, ref->page); 6539 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
6413 kfree(ref); 6540 kfree(ref);
6414 spd->partial[i].private = 0; 6541 spd->partial[i].private = 0;
6415} 6542}
@@ -6427,7 +6554,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6427 .pages = pages_def, 6554 .pages = pages_def,
6428 .partial = partial_def, 6555 .partial = partial_def,
6429 .nr_pages_max = PIPE_DEF_BUFFERS, 6556 .nr_pages_max = PIPE_DEF_BUFFERS,
6430 .flags = flags,
6431 .ops = &buffer_pipe_buf_ops, 6557 .ops = &buffer_pipe_buf_ops,
6432 .spd_release = buffer_spd_release, 6558 .spd_release = buffer_spd_release,
6433 }; 6559 };
@@ -6474,11 +6600,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6474 kfree(ref); 6600 kfree(ref);
6475 break; 6601 break;
6476 } 6602 }
6603 ref->cpu = iter->cpu_file;
6477 6604
6478 r = ring_buffer_read_page(ref->buffer, &ref->page, 6605 r = ring_buffer_read_page(ref->buffer, &ref->page,
6479 len, iter->cpu_file, 1); 6606 len, iter->cpu_file, 1);
6480 if (r < 0) { 6607 if (r < 0) {
6481 ring_buffer_free_read_page(ref->buffer, ref->page); 6608 ring_buffer_free_read_page(ref->buffer, ref->cpu,
6609 ref->page);
6482 kfree(ref); 6610 kfree(ref);
6483 break; 6611 break;
6484 } 6612 }
@@ -6649,43 +6777,89 @@ static const struct file_operations tracing_dyn_info_fops = {
6649 6777
6650#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) 6778#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
6651static void 6779static void
6652ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) 6780ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
6781 struct trace_array *tr, struct ftrace_probe_ops *ops,
6782 void *data)
6653{ 6783{
6654 tracing_snapshot(); 6784 tracing_snapshot_instance(tr);
6655} 6785}
6656 6786
6657static void 6787static void
6658ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) 6788ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
6789 struct trace_array *tr, struct ftrace_probe_ops *ops,
6790 void *data)
6659{ 6791{
6660 unsigned long *count = (long *)data; 6792 struct ftrace_func_mapper *mapper = data;
6793 long *count = NULL;
6661 6794
6662 if (!*count) 6795 if (mapper)
6663 return; 6796 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
6797
6798 if (count) {
6799
6800 if (*count <= 0)
6801 return;
6664 6802
6665 if (*count != -1)
6666 (*count)--; 6803 (*count)--;
6804 }
6667 6805
6668 tracing_snapshot(); 6806 tracing_snapshot_instance(tr);
6669} 6807}
6670 6808
6671static int 6809static int
6672ftrace_snapshot_print(struct seq_file *m, unsigned long ip, 6810ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
6673 struct ftrace_probe_ops *ops, void *data) 6811 struct ftrace_probe_ops *ops, void *data)
6674{ 6812{
6675 long count = (long)data; 6813 struct ftrace_func_mapper *mapper = data;
6814 long *count = NULL;
6676 6815
6677 seq_printf(m, "%ps:", (void *)ip); 6816 seq_printf(m, "%ps:", (void *)ip);
6678 6817
6679 seq_puts(m, "snapshot"); 6818 seq_puts(m, "snapshot");
6680 6819
6681 if (count == -1) 6820 if (mapper)
6682 seq_puts(m, ":unlimited\n"); 6821 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
6822
6823 if (count)
6824 seq_printf(m, ":count=%ld\n", *count);
6683 else 6825 else
6684 seq_printf(m, ":count=%ld\n", count); 6826 seq_puts(m, ":unlimited\n");
6685 6827
6686 return 0; 6828 return 0;
6687} 6829}
6688 6830
6831static int
6832ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
6833 unsigned long ip, void *init_data, void **data)
6834{
6835 struct ftrace_func_mapper *mapper = *data;
6836
6837 if (!mapper) {
6838 mapper = allocate_ftrace_func_mapper();
6839 if (!mapper)
6840 return -ENOMEM;
6841 *data = mapper;
6842 }
6843
6844 return ftrace_func_mapper_add_ip(mapper, ip, init_data);
6845}
6846
6847static void
6848ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
6849 unsigned long ip, void *data)
6850{
6851 struct ftrace_func_mapper *mapper = data;
6852
6853 if (!ip) {
6854 if (!mapper)
6855 return;
6856 free_ftrace_func_mapper(mapper, NULL);
6857 return;
6858 }
6859
6860 ftrace_func_mapper_remove_ip(mapper, ip);
6861}
6862
6689static struct ftrace_probe_ops snapshot_probe_ops = { 6863static struct ftrace_probe_ops snapshot_probe_ops = {
6690 .func = ftrace_snapshot, 6864 .func = ftrace_snapshot,
6691 .print = ftrace_snapshot_print, 6865 .print = ftrace_snapshot_print,
@@ -6694,10 +6868,12 @@ static struct ftrace_probe_ops snapshot_probe_ops = {
6694static struct ftrace_probe_ops snapshot_count_probe_ops = { 6868static struct ftrace_probe_ops snapshot_count_probe_ops = {
6695 .func = ftrace_count_snapshot, 6869 .func = ftrace_count_snapshot,
6696 .print = ftrace_snapshot_print, 6870 .print = ftrace_snapshot_print,
6871 .init = ftrace_snapshot_init,
6872 .free = ftrace_snapshot_free,
6697}; 6873};
6698 6874
6699static int 6875static int
6700ftrace_trace_snapshot_callback(struct ftrace_hash *hash, 6876ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
6701 char *glob, char *cmd, char *param, int enable) 6877 char *glob, char *cmd, char *param, int enable)
6702{ 6878{
6703 struct ftrace_probe_ops *ops; 6879 struct ftrace_probe_ops *ops;
@@ -6711,10 +6887,8 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
6711 6887
6712 ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; 6888 ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
6713 6889
6714 if (glob[0] == '!') { 6890 if (glob[0] == '!')
6715 unregister_ftrace_function_probe_func(glob+1, ops); 6891 return unregister_ftrace_function_probe_func(glob+1, tr, ops);
6716 return 0;
6717 }
6718 6892
6719 if (!param) 6893 if (!param)
6720 goto out_reg; 6894 goto out_reg;
@@ -6733,11 +6907,11 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
6733 return ret; 6907 return ret;
6734 6908
6735 out_reg: 6909 out_reg:
6736 ret = alloc_snapshot(&global_trace); 6910 ret = alloc_snapshot(tr);
6737 if (ret < 0) 6911 if (ret < 0)
6738 goto out; 6912 goto out;
6739 6913
6740 ret = register_ftrace_function_probe(glob, ops, count); 6914 ret = register_ftrace_function_probe(glob, tr, ops, count);
6741 6915
6742 out: 6916 out:
6743 return ret < 0 ? ret : 0; 6917 return ret < 0 ? ret : 0;
@@ -7348,6 +7522,8 @@ static int instance_mkdir(const char *name)
7348 goto out_free_tr; 7522 goto out_free_tr;
7349 } 7523 }
7350 7524
7525 ftrace_init_trace_array(tr);
7526
7351 init_tracer_tracefs(tr, tr->dir); 7527 init_tracer_tracefs(tr, tr->dir);
7352 init_trace_flags_index(tr); 7528 init_trace_flags_index(tr);
7353 __update_tracer_options(tr); 7529 __update_tracer_options(tr);
@@ -7403,6 +7579,7 @@ static int instance_rmdir(const char *name)
7403 } 7579 }
7404 7580
7405 tracing_set_nop(tr); 7581 tracing_set_nop(tr);
7582 clear_ftrace_function_probes(tr);
7406 event_trace_del_tracer(tr); 7583 event_trace_del_tracer(tr);
7407 ftrace_clear_pids(tr); 7584 ftrace_clear_pids(tr);
7408 ftrace_destroy_function_files(tr); 7585 ftrace_destroy_function_files(tr);
@@ -7968,6 +8145,9 @@ __init static int tracer_alloc_buffers(void)
7968 8145
7969 register_tracer(&nop_trace); 8146 register_tracer(&nop_trace);
7970 8147
8148 /* Function tracing may start here (via kernel command line) */
8149 init_function_trace();
8150
7971 /* All seems OK, enable tracing */ 8151 /* All seems OK, enable tracing */
7972 tracing_disabled = 0; 8152 tracing_disabled = 0;
7973 8153
@@ -8002,7 +8182,7 @@ out:
8002 return ret; 8182 return ret;
8003} 8183}
8004 8184
8005void __init trace_init(void) 8185void __init early_trace_init(void)
8006{ 8186{
8007 if (tracepoint_printk) { 8187 if (tracepoint_printk) {
8008 tracepoint_print_iter = 8188 tracepoint_print_iter =
@@ -8013,6 +8193,10 @@ void __init trace_init(void)
8013 static_key_enable(&tracepoint_printk_key.key); 8193 static_key_enable(&tracepoint_printk_key.key);
8014 } 8194 }
8015 tracer_alloc_buffers(); 8195 tracer_alloc_buffers();
8196}
8197
8198void __init trace_init(void)
8199{
8016 trace_event_init(); 8200 trace_event_init();
8017} 8201}
8018 8202
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d19d52d600d6..39fd77330aab 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -262,6 +262,9 @@ struct trace_array {
262#ifdef CONFIG_FUNCTION_TRACER 262#ifdef CONFIG_FUNCTION_TRACER
263 struct ftrace_ops *ops; 263 struct ftrace_ops *ops;
264 struct trace_pid_list __rcu *function_pids; 264 struct trace_pid_list __rcu *function_pids;
265#ifdef CONFIG_DYNAMIC_FTRACE
266 struct list_head func_probes;
267#endif
265 /* function tracing enabled */ 268 /* function tracing enabled */
266 int function_enabled; 269 int function_enabled;
267#endif 270#endif
@@ -579,6 +582,8 @@ void tracing_reset_all_online_cpus(void);
579int tracing_open_generic(struct inode *inode, struct file *filp); 582int tracing_open_generic(struct inode *inode, struct file *filp);
580bool tracing_is_disabled(void); 583bool tracing_is_disabled(void);
581int tracer_tracing_is_on(struct trace_array *tr); 584int tracer_tracing_is_on(struct trace_array *tr);
585void tracer_tracing_on(struct trace_array *tr);
586void tracer_tracing_off(struct trace_array *tr);
582struct dentry *trace_create_file(const char *name, 587struct dentry *trace_create_file(const char *name,
583 umode_t mode, 588 umode_t mode,
584 struct dentry *parent, 589 struct dentry *parent,
@@ -696,6 +701,9 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
696 701
697#ifdef CONFIG_DYNAMIC_FTRACE 702#ifdef CONFIG_DYNAMIC_FTRACE
698extern unsigned long ftrace_update_tot_cnt; 703extern unsigned long ftrace_update_tot_cnt;
704void ftrace_init_trace_array(struct trace_array *tr);
705#else
706static inline void ftrace_init_trace_array(struct trace_array *tr) { }
699#endif 707#endif
700#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 708#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
701extern int DYN_FTRACE_TEST_NAME(void); 709extern int DYN_FTRACE_TEST_NAME(void);
@@ -880,6 +888,14 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
880extern struct list_head ftrace_pids; 888extern struct list_head ftrace_pids;
881 889
882#ifdef CONFIG_FUNCTION_TRACER 890#ifdef CONFIG_FUNCTION_TRACER
891struct ftrace_func_command {
892 struct list_head list;
893 char *name;
894 int (*func)(struct trace_array *tr,
895 struct ftrace_hash *hash,
896 char *func, char *cmd,
897 char *params, int enable);
898};
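[Editor's note, not part of the patch] The command callback now receives the trace_array it was invoked on. Purely as an illustration of the new shape (the "mycmd" name and my_cmd_func are hypothetical), a command registered against this interface would look like:

/* Hypothetical set_ftrace_filter command using the tr-aware callback */
static int my_cmd_func(struct trace_array *tr, struct ftrace_hash *hash,
		       char *func, char *cmd, char *param, int enable)
{
	/* e.g. register a probe against 'tr' for functions matching 'func' */
	return 0;
}

static struct ftrace_func_command my_cmd = {
	.name	= "mycmd",
	.func	= my_cmd_func,
};

/* register_ftrace_command(&my_cmd) then makes "<func>:mycmd[:<param>]"
 * available in set_ftrace_filter, mirroring how the snapshot command
 * is wired up elsewhere in this patch. */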
883extern bool ftrace_filter_param __initdata; 899extern bool ftrace_filter_param __initdata;
884static inline int ftrace_trace_task(struct trace_array *tr) 900static inline int ftrace_trace_task(struct trace_array *tr)
885{ 901{
@@ -897,6 +913,8 @@ void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
897void ftrace_init_tracefs_toplevel(struct trace_array *tr, 913void ftrace_init_tracefs_toplevel(struct trace_array *tr,
898 struct dentry *d_tracer); 914 struct dentry *d_tracer);
899void ftrace_clear_pids(struct trace_array *tr); 915void ftrace_clear_pids(struct trace_array *tr);
916int init_function_trace(void);
917void ftrace_pid_follow_fork(struct trace_array *tr, bool enable);
900#else 918#else
901static inline int ftrace_trace_task(struct trace_array *tr) 919static inline int ftrace_trace_task(struct trace_array *tr)
902{ 920{
@@ -916,15 +934,75 @@ static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
916static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } 934static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
917static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } 935static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
918static inline void ftrace_clear_pids(struct trace_array *tr) { } 936static inline void ftrace_clear_pids(struct trace_array *tr) { }
937static inline int init_function_trace(void) { return 0; }
938static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { }
919/* ftrace_func_t type is not defined, use macro instead of static inline */ 939
920#define ftrace_init_array_ops(tr, func) do { } while (0) 940#define ftrace_init_array_ops(tr, func) do { } while (0)
921#endif /* CONFIG_FUNCTION_TRACER */ 941#endif /* CONFIG_FUNCTION_TRACER */
922 942
923#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) 943#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
944
945struct ftrace_probe_ops {
946 void (*func)(unsigned long ip,
947 unsigned long parent_ip,
948 struct trace_array *tr,
949 struct ftrace_probe_ops *ops,
950 void *data);
951 int (*init)(struct ftrace_probe_ops *ops,
952 struct trace_array *tr,
953 unsigned long ip, void *init_data,
954 void **data);
955 void (*free)(struct ftrace_probe_ops *ops,
956 struct trace_array *tr,
957 unsigned long ip, void *data);
958 int (*print)(struct seq_file *m,
959 unsigned long ip,
960 struct ftrace_probe_ops *ops,
961 void *data);
962};
963
964struct ftrace_func_mapper;
965typedef int (*ftrace_mapper_func)(void *data);
966
967struct ftrace_func_mapper *allocate_ftrace_func_mapper(void);
968void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
969 unsigned long ip);
970int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
971 unsigned long ip, void *data);
972void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper,
973 unsigned long ip);
974void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
975 ftrace_mapper_func free_func);
976
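[Editor's note, not part of the patch] The mapper helpers declared above carry the per-ip data that probes used to receive directly through void **data. A condensed sketch of a probe's .init callback using them follows; it mirrors ftrace_snapshot_init() elsewhere in this patch, with my_init as an illustrative name.

/* Sketch: stash per-ip data in a func_mapper from a probe's .init callback */
static int my_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
		   unsigned long ip, void *init_data, void **data)
{
	struct ftrace_func_mapper *mapper = *data;

	if (!mapper) {
		mapper = allocate_ftrace_func_mapper();
		if (!mapper)
			return -ENOMEM;
		*data = mapper;		/* shared by all ips of this probe */
	}
	/* remember init_data for this specific ip; looked up later in .func */
	return ftrace_func_mapper_add_ip(mapper, ip, init_data);
}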
977extern int
978register_ftrace_function_probe(char *glob, struct trace_array *tr,
979 struct ftrace_probe_ops *ops, void *data);
980extern int
981unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
982 struct ftrace_probe_ops *ops);
983extern void clear_ftrace_function_probes(struct trace_array *tr);
984
985int register_ftrace_command(struct ftrace_func_command *cmd);
986int unregister_ftrace_command(struct ftrace_func_command *cmd);
987
924void ftrace_create_filter_files(struct ftrace_ops *ops, 988void ftrace_create_filter_files(struct ftrace_ops *ops,
925 struct dentry *parent); 989 struct dentry *parent);
926void ftrace_destroy_filter_files(struct ftrace_ops *ops); 990void ftrace_destroy_filter_files(struct ftrace_ops *ops);
927#else 991#else
992struct ftrace_func_command;
993
994static inline __init int register_ftrace_command(struct ftrace_func_command *cmd)
995{
996 return -EINVAL;
997}
998static inline __init int unregister_ftrace_command(char *cmd_name)
999{
1000 return -EINVAL;
1001}
1002static inline void clear_ftrace_function_probes(struct trace_array *tr)
1003{
1004}
1005
928/* 1006/*
929 * The ops parameter passed in is usually undefined. 1007 * The ops parameter passed in is usually undefined.
930 * This must be a macro. 1008 * This must be a macro.
@@ -989,11 +1067,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
989 1067
990#ifdef CONFIG_FUNCTION_TRACER 1068#ifdef CONFIG_FUNCTION_TRACER
991# define FUNCTION_FLAGS \ 1069# define FUNCTION_FLAGS \
992 C(FUNCTION, "function-trace"), 1070 C(FUNCTION, "function-trace"), \
1071 C(FUNC_FORK, "function-fork"),
993# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION 1072# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION
994#else 1073#else
995# define FUNCTION_FLAGS 1074# define FUNCTION_FLAGS
996# define FUNCTION_DEFAULT_FLAGS 0UL 1075# define FUNCTION_DEFAULT_FLAGS 0UL
1076# define TRACE_ITER_FUNC_FORK 0UL
997#endif 1077#endif
998 1078
999#ifdef CONFIG_STACKTRACE 1079#ifdef CONFIG_STACKTRACE
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e49fbe901cfc..16a8cf02eee9 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -153,10 +153,18 @@ static int benchmark_event_kthread(void *arg)
153 trace_do_benchmark(); 153 trace_do_benchmark();
154 154
155 /* 155 /*
156 * We don't go to sleep, but let others 156 * We don't go to sleep, but let others run as well.
157 * run as well. 157 * This is basically a "yield()" to let any task that
158 * wants to run, schedule in, but if the CPU is idle,
159 * we'll keep burning cycles.
160 *
161 * Note the _rcu_qs() version of cond_resched() will
162 * notify synchronize_rcu_tasks() that this thread has
163 * passed a quiescent state for rcu_tasks. Otherwise
164 * this thread will never voluntarily schedule which would
165 * block synchronize_rcu_tasks() indefinitely.
158 */ 166 */
159 cond_resched(); 167 cond_resched_rcu_qs();
160 } 168 }
161 169
162 return 0; 170 return 0;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c203ac4df791..adcdbbeae010 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -348,14 +348,14 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
348 __field( u64, duration ) 348 __field( u64, duration )
349 __field( u64, outer_duration ) 349 __field( u64, outer_duration )
350 __field( u64, nmi_total_ts ) 350 __field( u64, nmi_total_ts )
351 __field_struct( struct timespec, timestamp ) 351 __field_struct( struct timespec64, timestamp )
352 __field_desc( long, timestamp, tv_sec ) 352 __field_desc( s64, timestamp, tv_sec )
353 __field_desc( long, timestamp, tv_nsec ) 353 __field_desc( long, timestamp, tv_nsec )
354 __field( unsigned int, nmi_count ) 354 __field( unsigned int, nmi_count )
355 __field( unsigned int, seqnum ) 355 __field( unsigned int, seqnum )
356 ), 356 ),
357 357
358 F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", 358 F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
359 __entry->seqnum, 359 __entry->seqnum,
360 __entry->tv_sec, 360 __entry->tv_sec,
361 __entry->tv_nsec, 361 __entry->tv_nsec,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 93116549a284..e7973e10398c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2460,15 +2460,8 @@ struct event_probe_data {
2460 bool enable; 2460 bool enable;
2461}; 2461};
2462 2462
2463static void 2463static void update_event_probe(struct event_probe_data *data)
2464event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
2465{ 2464{
2466 struct event_probe_data **pdata = (struct event_probe_data **)_data;
2467 struct event_probe_data *data = *pdata;
2468
2469 if (!data)
2470 return;
2471
2472 if (data->enable) 2465 if (data->enable)
2473 clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); 2466 clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
2474 else 2467 else
@@ -2476,77 +2469,141 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
2476} 2469}
2477 2470
2478static void 2471static void
2479event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) 2472event_enable_probe(unsigned long ip, unsigned long parent_ip,
2473 struct trace_array *tr, struct ftrace_probe_ops *ops,
2474 void *data)
2480{ 2475{
2481 struct event_probe_data **pdata = (struct event_probe_data **)_data; 2476 struct ftrace_func_mapper *mapper = data;
2482 struct event_probe_data *data = *pdata; 2477 struct event_probe_data *edata;
2478 void **pdata;
2483 2479
2484 if (!data) 2480 pdata = ftrace_func_mapper_find_ip(mapper, ip);
2481 if (!pdata || !*pdata)
2482 return;
2483
2484 edata = *pdata;
2485 update_event_probe(edata);
2486}
2487
2488static void
2489event_enable_count_probe(unsigned long ip, unsigned long parent_ip,
2490 struct trace_array *tr, struct ftrace_probe_ops *ops,
2491 void *data)
2492{
2493 struct ftrace_func_mapper *mapper = data;
2494 struct event_probe_data *edata;
2495 void **pdata;
2496
2497 pdata = ftrace_func_mapper_find_ip(mapper, ip);
2498 if (!pdata || !*pdata)
2485 return; 2499 return;
2486 2500
2487 if (!data->count) 2501 edata = *pdata;
2502
2503 if (!edata->count)
2488 return; 2504 return;
2489 2505
2490 /* Skip if the event is in a state we want to switch to */ 2506 /* Skip if the event is in a state we want to switch to */
2491 if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) 2507 if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
2492 return; 2508 return;
2493 2509
2494 if (data->count != -1) 2510 if (edata->count != -1)
2495 (data->count)--; 2511 (edata->count)--;
2496 2512
2497 event_enable_probe(ip, parent_ip, _data); 2513 update_event_probe(edata);
2498} 2514}
2499 2515
2500static int 2516static int
2501event_enable_print(struct seq_file *m, unsigned long ip, 2517event_enable_print(struct seq_file *m, unsigned long ip,
2502 struct ftrace_probe_ops *ops, void *_data) 2518 struct ftrace_probe_ops *ops, void *data)
2503{ 2519{
2504 struct event_probe_data *data = _data; 2520 struct ftrace_func_mapper *mapper = data;
2521 struct event_probe_data *edata;
2522 void **pdata;
2523
2524 pdata = ftrace_func_mapper_find_ip(mapper, ip);
2525
2526 if (WARN_ON_ONCE(!pdata || !*pdata))
2527 return 0;
2528
2529 edata = *pdata;
2505 2530
2506 seq_printf(m, "%ps:", (void *)ip); 2531 seq_printf(m, "%ps:", (void *)ip);
2507 2532
2508 seq_printf(m, "%s:%s:%s", 2533 seq_printf(m, "%s:%s:%s",
2509 data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, 2534 edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
2510 data->file->event_call->class->system, 2535 edata->file->event_call->class->system,
2511 trace_event_name(data->file->event_call)); 2536 trace_event_name(edata->file->event_call));
2512 2537
2513 if (data->count == -1) 2538 if (edata->count == -1)
2514 seq_puts(m, ":unlimited\n"); 2539 seq_puts(m, ":unlimited\n");
2515 else 2540 else
2516 seq_printf(m, ":count=%ld\n", data->count); 2541 seq_printf(m, ":count=%ld\n", edata->count);
2517 2542
2518 return 0; 2543 return 0;
2519} 2544}
2520 2545
2521static int 2546static int
2522event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, 2547event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
2523 void **_data) 2548 unsigned long ip, void *init_data, void **data)
2524{ 2549{
2525 struct event_probe_data **pdata = (struct event_probe_data **)_data; 2550 struct ftrace_func_mapper *mapper = *data;
2526 struct event_probe_data *data = *pdata; 2551 struct event_probe_data *edata = init_data;
2552 int ret;
2553
2554 if (!mapper) {
2555 mapper = allocate_ftrace_func_mapper();
2556 if (!mapper)
2557 return -ENODEV;
2558 *data = mapper;
2559 }
2560
2561 ret = ftrace_func_mapper_add_ip(mapper, ip, edata);
2562 if (ret < 0)
2563 return ret;
2564
2565 edata->ref++;
2527 2566
2528 data->ref++; 2567 return 0;
2568}
2569
2570static int free_probe_data(void *data)
2571{
2572 struct event_probe_data *edata = data;
2573
2574 edata->ref--;
2575 if (!edata->ref) {
2576 /* Remove the SOFT_MODE flag */
2577 __ftrace_event_enable_disable(edata->file, 0, 1);
2578 module_put(edata->file->event_call->mod);
2579 kfree(edata);
2580 }
2529 return 0; 2581 return 0;
2530} 2582}
2531 2583
2532static void 2584static void
2533event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, 2585event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
2534 void **_data) 2586 unsigned long ip, void *data)
2535{ 2587{
2536 struct event_probe_data **pdata = (struct event_probe_data **)_data; 2588 struct ftrace_func_mapper *mapper = data;
2537 struct event_probe_data *data = *pdata; 2589 struct event_probe_data *edata;
2538 2590
2539 if (WARN_ON_ONCE(data->ref <= 0)) 2591 if (!ip) {
2592 if (!mapper)
2593 return;
2594 free_ftrace_func_mapper(mapper, free_probe_data);
2540 return; 2595 return;
2541
2542 data->ref--;
2543 if (!data->ref) {
2544 /* Remove the SOFT_MODE flag */
2545 __ftrace_event_enable_disable(data->file, 0, 1);
2546 module_put(data->file->event_call->mod);
2547 kfree(data);
2548 } 2596 }
2549 *pdata = NULL; 2597
2598 edata = ftrace_func_mapper_remove_ip(mapper, ip);
2599
2600 if (WARN_ON_ONCE(!edata))
2601 return;
2602
2603 if (WARN_ON_ONCE(edata->ref <= 0))
2604 return;
2605
2606 free_probe_data(edata);
2550} 2607}
2551 2608
2552static struct ftrace_probe_ops event_enable_probe_ops = { 2609static struct ftrace_probe_ops event_enable_probe_ops = {
@@ -2578,10 +2635,9 @@ static struct ftrace_probe_ops event_disable_count_probe_ops = {
2578}; 2635};
2579 2636
2580static int 2637static int
2581event_enable_func(struct ftrace_hash *hash, 2638event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
2582 char *glob, char *cmd, char *param, int enabled) 2639 char *glob, char *cmd, char *param, int enabled)
2583{ 2640{
2584 struct trace_array *tr = top_trace_array();
2585 struct trace_event_file *file; 2641 struct trace_event_file *file;
2586 struct ftrace_probe_ops *ops; 2642 struct ftrace_probe_ops *ops;
2587 struct event_probe_data *data; 2643 struct event_probe_data *data;
@@ -2619,12 +2675,12 @@ event_enable_func(struct ftrace_hash *hash,
2619 ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; 2675 ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
2620 2676
2621 if (glob[0] == '!') { 2677 if (glob[0] == '!') {
2622 unregister_ftrace_function_probe_func(glob+1, ops); 2678 ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
2623 ret = 0;
2624 goto out; 2679 goto out;
2625 } 2680 }
2626 2681
2627 ret = -ENOMEM; 2682 ret = -ENOMEM;
2683
2628 data = kzalloc(sizeof(*data), GFP_KERNEL); 2684 data = kzalloc(sizeof(*data), GFP_KERNEL);
2629 if (!data) 2685 if (!data)
2630 goto out; 2686 goto out;
@@ -2661,7 +2717,8 @@ event_enable_func(struct ftrace_hash *hash,
2661 ret = __ftrace_event_enable_disable(file, 1, 1); 2717 ret = __ftrace_event_enable_disable(file, 1, 1);
2662 if (ret < 0) 2718 if (ret < 0)
2663 goto out_put; 2719 goto out_put;
2664 ret = register_ftrace_function_probe(glob, ops, data); 2720
2721 ret = register_ftrace_function_probe(glob, tr, ops, data);
2665 /* 2722 /*
2666 * The above returns on success the # of functions enabled, 2723 * The above returns on success the # of functions enabled,
2667 * but if it didn't find any functions it returns zero. 2724 * but if it didn't find any functions it returns zero.
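
Editor's note: the rework above moves per-probe state out of the old void **data callback argument and into an ftrace_func_mapper keyed by instruction pointer. A hedged sketch of the resulting lifecycle, using only the mapper calls visible in this diff (the probe names and do_something() are hypothetical):

	static int my_probe_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
				 unsigned long ip, void *init_data, void **data)
	{
		struct ftrace_func_mapper *mapper = *data;

		if (!mapper) {			/* first ip for this probe */
			mapper = allocate_ftrace_func_mapper();
			if (!mapper)
				return -ENOMEM;
			*data = mapper;
		}
		/* remember init_data for this ip */
		return ftrace_func_mapper_add_ip(mapper, ip, init_data);
	}

	static void my_probe_func(unsigned long ip, unsigned long parent_ip,
				  struct trace_array *tr, struct ftrace_probe_ops *ops,
				  void *data)
	{
		struct ftrace_func_mapper *mapper = data;
		void **slot = ftrace_func_mapper_find_ip(mapper, ip);

		if (slot && *slot)
			do_something(*slot);	/* hypothetical per-ip action */
	}

	static void my_probe_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
				  unsigned long ip, void *data)
	{
		struct ftrace_func_mapper *mapper = data;

		if (!mapper)
			return;
		if (!ip)			/* whole probe going away */
			free_ftrace_func_mapper(mapper, NULL);
		else
			ftrace_func_mapper_remove_ip(mapper, ip);
	}
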
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0efa00d80623..a3bddbfd0874 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -267,10 +267,14 @@ static struct tracer function_trace __tracer_data =
267}; 267};
268 268
269#ifdef CONFIG_DYNAMIC_FTRACE 269#ifdef CONFIG_DYNAMIC_FTRACE
270static void update_traceon_count(void **data, bool on) 270static void update_traceon_count(struct ftrace_probe_ops *ops,
271 unsigned long ip,
272 struct trace_array *tr, bool on,
273 void *data)
271{ 274{
272 long *count = (long *)data; 275 struct ftrace_func_mapper *mapper = data;
273 long old_count = *count; 276 long *count;
277 long old_count;
274 278
275 /* 279 /*
276 * Tracing gets disabled (or enabled) once per count. 280 * Tracing gets disabled (or enabled) once per count.
@@ -301,23 +305,22 @@ static void update_traceon_count(void **data, bool on)
301 * setting the tracing_on file. But we currently don't care 305 * setting the tracing_on file. But we currently don't care
302 * about that. 306 * about that.
303 */ 307 */
304 if (!old_count) 308 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
309 old_count = *count;
310
311 if (old_count <= 0)
305 return; 312 return;
306 313
307 /* Make sure we see count before checking tracing state */ 314 /* Make sure we see count before checking tracing state */
308 smp_rmb(); 315 smp_rmb();
309 316
310 if (on == !!tracing_is_on()) 317 if (on == !!tracer_tracing_is_on(tr))
311 return; 318 return;
312 319
313 if (on) 320 if (on)
314 tracing_on(); 321 tracer_tracing_on(tr);
315 else 322 else
316 tracing_off(); 323 tracer_tracing_off(tr);
317
318 /* unlimited? */
319 if (old_count == -1)
320 return;
321 324
322 /* Make sure tracing state is visible before updating count */ 325 /* Make sure tracing state is visible before updating count */
323 smp_wmb(); 326 smp_wmb();
@@ -326,33 +329,41 @@ static void update_traceon_count(void **data, bool on)
326} 329}
327 330
328static void 331static void
329ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) 332ftrace_traceon_count(unsigned long ip, unsigned long parent_ip,
333 struct trace_array *tr, struct ftrace_probe_ops *ops,
334 void *data)
330{ 335{
331 update_traceon_count(data, 1); 336 update_traceon_count(ops, ip, tr, 1, data);
332} 337}
333 338
334static void 339static void
335ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) 340ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip,
341 struct trace_array *tr, struct ftrace_probe_ops *ops,
342 void *data)
336{ 343{
337 update_traceon_count(data, 0); 344 update_traceon_count(ops, ip, tr, 0, data);
338} 345}
339 346
340static void 347static void
341ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) 348ftrace_traceon(unsigned long ip, unsigned long parent_ip,
349 struct trace_array *tr, struct ftrace_probe_ops *ops,
350 void *data)
342{ 351{
343 if (tracing_is_on()) 352 if (tracer_tracing_is_on(tr))
344 return; 353 return;
345 354
346 tracing_on(); 355 tracer_tracing_on(tr);
347} 356}
348 357
349static void 358static void
350ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) 359ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
360 struct trace_array *tr, struct ftrace_probe_ops *ops,
361 void *data)
351{ 362{
352 if (!tracing_is_on()) 363 if (!tracer_tracing_is_on(tr))
353 return; 364 return;
354 365
355 tracing_off(); 366 tracer_tracing_off(tr);
356} 367}
357 368
358/* 369/*
@@ -364,144 +375,218 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
364 */ 375 */
365#define STACK_SKIP 4 376#define STACK_SKIP 4
366 377
378static __always_inline void trace_stack(struct trace_array *tr)
379{
380 unsigned long flags;
381 int pc;
382
383 local_save_flags(flags);
384 pc = preempt_count();
385
386 __trace_stack(tr, flags, STACK_SKIP, pc);
387}
388
367static void 389static void
368ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) 390ftrace_stacktrace(unsigned long ip, unsigned long parent_ip,
391 struct trace_array *tr, struct ftrace_probe_ops *ops,
392 void *data)
369{ 393{
370 trace_dump_stack(STACK_SKIP); 394 trace_stack(tr);
371} 395}
372 396
373static void 397static void
374ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) 398ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip,
399 struct trace_array *tr, struct ftrace_probe_ops *ops,
400 void *data)
375{ 401{
376 long *count = (long *)data; 402 struct ftrace_func_mapper *mapper = data;
403 long *count;
377 long old_count; 404 long old_count;
378 long new_count; 405 long new_count;
379 406
407 if (!tracing_is_on())
408 return;
409
410 /* unlimited? */
411 if (!mapper) {
412 trace_stack(tr);
413 return;
414 }
415
416 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
417
380 /* 418 /*
381 * Stack traces should only execute the number of times the 419 * Stack traces should only execute the number of times the
382 * user specified in the counter. 420 * user specified in the counter.
383 */ 421 */
384 do { 422 do {
385
386 if (!tracing_is_on())
387 return;
388
389 old_count = *count; 423 old_count = *count;
390 424
391 if (!old_count) 425 if (!old_count)
392 return; 426 return;
393 427
394 /* unlimited? */
395 if (old_count == -1) {
396 trace_dump_stack(STACK_SKIP);
397 return;
398 }
399
400 new_count = old_count - 1; 428 new_count = old_count - 1;
401 new_count = cmpxchg(count, old_count, new_count); 429 new_count = cmpxchg(count, old_count, new_count);
402 if (new_count == old_count) 430 if (new_count == old_count)
403 trace_dump_stack(STACK_SKIP); 431 trace_stack(tr);
432
433 if (!tracing_is_on())
434 return;
404 435
405 } while (new_count != old_count); 436 } while (new_count != old_count);
406} 437}
407 438
408static int update_count(void **data) 439static int update_count(struct ftrace_probe_ops *ops, unsigned long ip,
440 void *data)
409{ 441{
410 unsigned long *count = (long *)data; 442 struct ftrace_func_mapper *mapper = data;
443 long *count = NULL;
411 444
412 if (!*count) 445 if (mapper)
413 return 0; 446 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
414 447
415 if (*count != -1) 448 if (count) {
449 if (*count <= 0)
450 return 0;
416 (*count)--; 451 (*count)--;
452 }
417 453
418 return 1; 454 return 1;
419} 455}
420 456
421static void 457static void
422ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) 458ftrace_dump_probe(unsigned long ip, unsigned long parent_ip,
459 struct trace_array *tr, struct ftrace_probe_ops *ops,
460 void *data)
423{ 461{
424 if (update_count(data)) 462 if (update_count(ops, ip, data))
425 ftrace_dump(DUMP_ALL); 463 ftrace_dump(DUMP_ALL);
426} 464}
427 465
428/* Only dump the current CPU buffer. */ 466/* Only dump the current CPU buffer. */
429static void 467static void
430ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) 468ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip,
469 struct trace_array *tr, struct ftrace_probe_ops *ops,
470 void *data)
431{ 471{
432 if (update_count(data)) 472 if (update_count(ops, ip, data))
433 ftrace_dump(DUMP_ORIG); 473 ftrace_dump(DUMP_ORIG);
434} 474}
435 475
436static int 476static int
437ftrace_probe_print(const char *name, struct seq_file *m, 477ftrace_probe_print(const char *name, struct seq_file *m,
438 unsigned long ip, void *data) 478 unsigned long ip, struct ftrace_probe_ops *ops,
479 void *data)
439{ 480{
440 long count = (long)data; 481 struct ftrace_func_mapper *mapper = data;
482 long *count = NULL;
441 483
442 seq_printf(m, "%ps:%s", (void *)ip, name); 484 seq_printf(m, "%ps:%s", (void *)ip, name);
443 485
444 if (count == -1) 486 if (mapper)
445 seq_puts(m, ":unlimited\n"); 487 count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
488
489 if (count)
490 seq_printf(m, ":count=%ld\n", *count);
446 else 491 else
447 seq_printf(m, ":count=%ld\n", count); 492 seq_puts(m, ":unlimited\n");
448 493
449 return 0; 494 return 0;
450} 495}
451 496
452static int 497static int
453ftrace_traceon_print(struct seq_file *m, unsigned long ip, 498ftrace_traceon_print(struct seq_file *m, unsigned long ip,
454 struct ftrace_probe_ops *ops, void *data) 499 struct ftrace_probe_ops *ops,
500 void *data)
455{ 501{
456 return ftrace_probe_print("traceon", m, ip, data); 502 return ftrace_probe_print("traceon", m, ip, ops, data);
457} 503}
458 504
459static int 505static int
460ftrace_traceoff_print(struct seq_file *m, unsigned long ip, 506ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
461 struct ftrace_probe_ops *ops, void *data) 507 struct ftrace_probe_ops *ops, void *data)
462{ 508{
463 return ftrace_probe_print("traceoff", m, ip, data); 509 return ftrace_probe_print("traceoff", m, ip, ops, data);
464} 510}
465 511
466static int 512static int
467ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, 513ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
468 struct ftrace_probe_ops *ops, void *data) 514 struct ftrace_probe_ops *ops, void *data)
469{ 515{
470 return ftrace_probe_print("stacktrace", m, ip, data); 516 return ftrace_probe_print("stacktrace", m, ip, ops, data);
471} 517}
472 518
473static int 519static int
474ftrace_dump_print(struct seq_file *m, unsigned long ip, 520ftrace_dump_print(struct seq_file *m, unsigned long ip,
475 struct ftrace_probe_ops *ops, void *data) 521 struct ftrace_probe_ops *ops, void *data)
476{ 522{
477 return ftrace_probe_print("dump", m, ip, data); 523 return ftrace_probe_print("dump", m, ip, ops, data);
478} 524}
479 525
480static int 526static int
481ftrace_cpudump_print(struct seq_file *m, unsigned long ip, 527ftrace_cpudump_print(struct seq_file *m, unsigned long ip,
482 struct ftrace_probe_ops *ops, void *data) 528 struct ftrace_probe_ops *ops, void *data)
483{ 529{
484 return ftrace_probe_print("cpudump", m, ip, data); 530 return ftrace_probe_print("cpudump", m, ip, ops, data);
531}
532
533
534static int
535ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
536 unsigned long ip, void *init_data, void **data)
537{
538 struct ftrace_func_mapper *mapper = *data;
539
540 if (!mapper) {
541 mapper = allocate_ftrace_func_mapper();
542 if (!mapper)
543 return -ENOMEM;
544 *data = mapper;
545 }
546
547 return ftrace_func_mapper_add_ip(mapper, ip, init_data);
548}
549
550static void
551ftrace_count_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
552 unsigned long ip, void *data)
553{
554 struct ftrace_func_mapper *mapper = data;
555
556 if (!ip) {
557 free_ftrace_func_mapper(mapper, NULL);
558 return;
559 }
560
561 ftrace_func_mapper_remove_ip(mapper, ip);
485} 562}
486 563
487static struct ftrace_probe_ops traceon_count_probe_ops = { 564static struct ftrace_probe_ops traceon_count_probe_ops = {
488 .func = ftrace_traceon_count, 565 .func = ftrace_traceon_count,
489 .print = ftrace_traceon_print, 566 .print = ftrace_traceon_print,
567 .init = ftrace_count_init,
568 .free = ftrace_count_free,
490}; 569};
491 570
492static struct ftrace_probe_ops traceoff_count_probe_ops = { 571static struct ftrace_probe_ops traceoff_count_probe_ops = {
493 .func = ftrace_traceoff_count, 572 .func = ftrace_traceoff_count,
494 .print = ftrace_traceoff_print, 573 .print = ftrace_traceoff_print,
574 .init = ftrace_count_init,
575 .free = ftrace_count_free,
495}; 576};
496 577
497static struct ftrace_probe_ops stacktrace_count_probe_ops = { 578static struct ftrace_probe_ops stacktrace_count_probe_ops = {
498 .func = ftrace_stacktrace_count, 579 .func = ftrace_stacktrace_count,
499 .print = ftrace_stacktrace_print, 580 .print = ftrace_stacktrace_print,
581 .init = ftrace_count_init,
582 .free = ftrace_count_free,
500}; 583};
501 584
502static struct ftrace_probe_ops dump_probe_ops = { 585static struct ftrace_probe_ops dump_probe_ops = {
503 .func = ftrace_dump_probe, 586 .func = ftrace_dump_probe,
504 .print = ftrace_dump_print, 587 .print = ftrace_dump_print,
588 .init = ftrace_count_init,
589 .free = ftrace_count_free,
505}; 590};
506 591
507static struct ftrace_probe_ops cpudump_probe_ops = { 592static struct ftrace_probe_ops cpudump_probe_ops = {
@@ -525,7 +610,8 @@ static struct ftrace_probe_ops stacktrace_probe_ops = {
525}; 610};
526 611
527static int 612static int
528ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, 613ftrace_trace_probe_callback(struct trace_array *tr,
614 struct ftrace_probe_ops *ops,
529 struct ftrace_hash *hash, char *glob, 615 struct ftrace_hash *hash, char *glob,
530 char *cmd, char *param, int enable) 616 char *cmd, char *param, int enable)
531{ 617{
@@ -537,10 +623,8 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
537 if (!enable) 623 if (!enable)
538 return -EINVAL; 624 return -EINVAL;
539 625
540 if (glob[0] == '!') { 626 if (glob[0] == '!')
541 unregister_ftrace_function_probe_func(glob+1, ops); 627 return unregister_ftrace_function_probe_func(glob+1, tr, ops);
542 return 0;
543 }
544 628
545 if (!param) 629 if (!param)
546 goto out_reg; 630 goto out_reg;
@@ -559,13 +643,13 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
559 return ret; 643 return ret;
560 644
561 out_reg: 645 out_reg:
562 ret = register_ftrace_function_probe(glob, ops, count); 646 ret = register_ftrace_function_probe(glob, tr, ops, count);
563 647
564 return ret < 0 ? ret : 0; 648 return ret < 0 ? ret : 0;
565} 649}
566 650
567static int 651static int
568ftrace_trace_onoff_callback(struct ftrace_hash *hash, 652ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash,
569 char *glob, char *cmd, char *param, int enable) 653 char *glob, char *cmd, char *param, int enable)
570{ 654{
571 struct ftrace_probe_ops *ops; 655 struct ftrace_probe_ops *ops;
@@ -576,24 +660,24 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
576 else 660 else
577 ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; 661 ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
578 662
579 return ftrace_trace_probe_callback(ops, hash, glob, cmd, 663 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
580 param, enable); 664 param, enable);
581} 665}
582 666
583static int 667static int
584ftrace_stacktrace_callback(struct ftrace_hash *hash, 668ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash,
585 char *glob, char *cmd, char *param, int enable) 669 char *glob, char *cmd, char *param, int enable)
586{ 670{
587 struct ftrace_probe_ops *ops; 671 struct ftrace_probe_ops *ops;
588 672
589 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; 673 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
590 674
591 return ftrace_trace_probe_callback(ops, hash, glob, cmd, 675 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
592 param, enable); 676 param, enable);
593} 677}
594 678
595static int 679static int
596ftrace_dump_callback(struct ftrace_hash *hash, 680ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash,
597 char *glob, char *cmd, char *param, int enable) 681 char *glob, char *cmd, char *param, int enable)
598{ 682{
599 struct ftrace_probe_ops *ops; 683 struct ftrace_probe_ops *ops;
@@ -601,12 +685,12 @@ ftrace_dump_callback(struct ftrace_hash *hash,
601 ops = &dump_probe_ops; 685 ops = &dump_probe_ops;
602 686
603 /* Only dump once. */ 687 /* Only dump once. */
604 return ftrace_trace_probe_callback(ops, hash, glob, cmd, 688 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
605 "1", enable); 689 "1", enable);
606} 690}
607 691
608static int 692static int
609ftrace_cpudump_callback(struct ftrace_hash *hash, 693ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash,
610 char *glob, char *cmd, char *param, int enable) 694 char *glob, char *cmd, char *param, int enable)
611{ 695{
612 struct ftrace_probe_ops *ops; 696 struct ftrace_probe_ops *ops;
@@ -614,7 +698,7 @@ ftrace_cpudump_callback(struct ftrace_hash *hash,
614 ops = &cpudump_probe_ops; 698 ops = &cpudump_probe_ops;
615 699
616 /* Only dump once. */ 700 /* Only dump once. */
617 return ftrace_trace_probe_callback(ops, hash, glob, cmd, 701 return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
618 "1", enable); 702 "1", enable);
619} 703}
620 704
@@ -687,9 +771,8 @@ static inline int init_func_cmd_traceon(void)
687} 771}
688#endif /* CONFIG_DYNAMIC_FTRACE */ 772#endif /* CONFIG_DYNAMIC_FTRACE */
689 773
690static __init int init_function_trace(void) 774__init int init_function_trace(void)
691{ 775{
692 init_func_cmd_traceon(); 776 init_func_cmd_traceon();
693 return register_tracer(&function_trace); 777 return register_tracer(&function_trace);
694} 778}
695core_initcall(init_function_trace);
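
Editor's note: one subtlety in the counted probes above is that the parsed count is stored as the void * value itself (ftrace_func_mapper_add_ip(mapper, ip, init_data)), and the callbacks then treat the slot returned by ftrace_func_mapper_find_ip() as a long so it can be decremented in place. A small standalone illustration of that trick, assuming sizeof(long) == sizeof(void *) as on the kernel's supported 64-bit/32-bit ABIs:

	#include <stdio.h>

	int main(void)
	{
		void *slot = (void *)5L;	/* count stored as the pointer value */
		long *count = (long *)&slot;	/* view the slot as a long */

		while (*count > 0)
			(*count)--;

		printf("remaining: %ld\n", *count);	/* prints 0 */
		return 0;
	}
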
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 21ea6ae77d93..d7c8e4ec3d9d 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -79,12 +79,12 @@ static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;
79 79
80/* Individual latency samples are stored here when detected. */ 80/* Individual latency samples are stored here when detected. */
81struct hwlat_sample { 81struct hwlat_sample {
82 u64 seqnum; /* unique sequence */ 82 u64 seqnum; /* unique sequence */
83 u64 duration; /* delta */ 83 u64 duration; /* delta */
84 u64 outer_duration; /* delta (outer loop) */ 84 u64 outer_duration; /* delta (outer loop) */
85 u64 nmi_total_ts; /* Total time spent in NMIs */ 85 u64 nmi_total_ts; /* Total time spent in NMIs */
86 struct timespec timestamp; /* wall time */ 86 struct timespec64 timestamp; /* wall time */
87 int nmi_count; /* # NMIs during this sample */ 87 int nmi_count; /* # NMIs during this sample */
88}; 88};
89 89
90/* keep the global state somewhere. */ 90/* keep the global state somewhere. */
@@ -250,7 +250,7 @@ static int get_sample(void)
250 s.seqnum = hwlat_data.count; 250 s.seqnum = hwlat_data.count;
251 s.duration = sample; 251 s.duration = sample;
252 s.outer_duration = outer_sample; 252 s.outer_duration = outer_sample;
253 s.timestamp = CURRENT_TIME; 253 ktime_get_real_ts64(&s.timestamp);
254 s.nmi_total_ts = nmi_total_ts; 254 s.nmi_total_ts = nmi_total_ts;
255 s.nmi_count = nmi_count; 255 s.nmi_count = nmi_count;
256 trace_hwlat_sample(&s); 256 trace_hwlat_sample(&s);
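
Editor's note: replacing CURRENT_TIME with a timespec64 filled by ktime_get_real_ts64() makes the hwlat timestamp y2038-safe; tv_sec becomes 64-bit, which is why the trace_output.c hunks further down cast it to long long when printing. A minimal sketch of the pattern, assuming kernel context:

	struct timespec64 ts;

	ktime_get_real_ts64(&ts);	/* wall time, 64-bit seconds */
	pr_info("ts:%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
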
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5f688cc724f0..c129fca6ec99 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -25,6 +25,7 @@
25#include "trace_probe.h" 25#include "trace_probe.h"
26 26
27#define KPROBE_EVENT_SYSTEM "kprobes" 27#define KPROBE_EVENT_SYSTEM "kprobes"
28#define KRETPROBE_MAXACTIVE_MAX 4096
28 29
29/** 30/**
30 * Kprobe event core functions 31 * Kprobe event core functions
@@ -282,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
282 void *addr, 283 void *addr,
283 const char *symbol, 284 const char *symbol,
284 unsigned long offs, 285 unsigned long offs,
286 int maxactive,
285 int nargs, bool is_return) 287 int nargs, bool is_return)
286{ 288{
287 struct trace_kprobe *tk; 289 struct trace_kprobe *tk;
@@ -309,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
309 else 311 else
310 tk->rp.kp.pre_handler = kprobe_dispatcher; 312 tk->rp.kp.pre_handler = kprobe_dispatcher;
311 313
314 tk->rp.maxactive = maxactive;
315
312 if (!event || !is_good_name(event)) { 316 if (!event || !is_good_name(event)) {
313 ret = -EINVAL; 317 ret = -EINVAL;
314 goto error; 318 goto error;
@@ -598,8 +602,10 @@ static int create_trace_kprobe(int argc, char **argv)
598{ 602{
599 /* 603 /*
600 * Argument syntax: 604 * Argument syntax:
601 * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] 605 * - Add kprobe:
602 * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] 606 * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
607 * - Add kretprobe:
608 * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
603 * Fetch args: 609 * Fetch args:
604 * $retval : fetch return value 610 * $retval : fetch return value
605 * $stack : fetch stack address 611 * $stack : fetch stack address
@@ -619,6 +625,7 @@ static int create_trace_kprobe(int argc, char **argv)
619 int i, ret = 0; 625 int i, ret = 0;
620 bool is_return = false, is_delete = false; 626 bool is_return = false, is_delete = false;
621 char *symbol = NULL, *event = NULL, *group = NULL; 627 char *symbol = NULL, *event = NULL, *group = NULL;
628 int maxactive = 0;
622 char *arg; 629 char *arg;
623 unsigned long offset = 0; 630 unsigned long offset = 0;
624 void *addr = NULL; 631 void *addr = NULL;
@@ -637,8 +644,28 @@ static int create_trace_kprobe(int argc, char **argv)
637 return -EINVAL; 644 return -EINVAL;
638 } 645 }
639 646
640 if (argv[0][1] == ':') { 647 event = strchr(&argv[0][1], ':');
641 event = &argv[0][2]; 648 if (event) {
649 event[0] = '\0';
650 event++;
651 }
652 if (is_return && isdigit(argv[0][1])) {
653 ret = kstrtouint(&argv[0][1], 0, &maxactive);
654 if (ret) {
655 pr_info("Failed to parse maxactive.\n");
656 return ret;
657 }
658 /* kretprobes instances are iterated over via a list. The
659 * maximum should stay reasonable.
660 */
661 if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
662 pr_info("Maxactive is too big (%d > %d).\n",
663 maxactive, KRETPROBE_MAXACTIVE_MAX);
664 return -E2BIG;
665 }
666 }
667
668 if (event) {
642 if (strchr(event, '/')) { 669 if (strchr(event, '/')) {
643 group = event; 670 group = event;
644 event = strchr(group, '/') + 1; 671 event = strchr(group, '/') + 1;
@@ -681,10 +708,6 @@ static int create_trace_kprobe(int argc, char **argv)
681 return -EINVAL; 708 return -EINVAL;
682 } 709 }
683 if (isdigit(argv[1][0])) { 710 if (isdigit(argv[1][0])) {
684 if (is_return) {
685 pr_info("Return probe point must be a symbol.\n");
686 return -EINVAL;
687 }
688 /* an address specified */ 711 /* an address specified */
689 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); 712 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
690 if (ret) { 713 if (ret) {
@@ -700,8 +723,9 @@ static int create_trace_kprobe(int argc, char **argv)
700 pr_info("Failed to parse symbol.\n"); 723 pr_info("Failed to parse symbol.\n");
701 return ret; 724 return ret;
702 } 725 }
703 if (offset && is_return) { 726 if (offset && is_return &&
704 pr_info("Return probe must be used without offset.\n"); 727 !function_offset_within_entry(NULL, symbol, offset)) {
728 pr_info("Given offset is not valid for return probe.\n");
705 return -EINVAL; 729 return -EINVAL;
706 } 730 }
707 } 731 }
@@ -718,8 +742,8 @@ static int create_trace_kprobe(int argc, char **argv)
718 is_return ? 'r' : 'p', addr); 742 is_return ? 'r' : 'p', addr);
719 event = buf; 743 event = buf;
720 } 744 }
721 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, 745 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
722 is_return); 746 argc, is_return);
723 if (IS_ERR(tk)) { 747 if (IS_ERR(tk)) {
724 pr_info("Failed to allocate trace_probe.(%d)\n", 748 pr_info("Failed to allocate trace_probe.(%d)\n",
725 (int)PTR_ERR(tk)); 749 (int)PTR_ERR(tk));
@@ -1511,6 +1535,11 @@ static __init int kprobe_trace_self_tests_init(void)
1511 1535
1512end: 1536end:
1513 release_all_trace_kprobes(); 1537 release_all_trace_kprobes();
1538 /*
1539 * Wait for the optimizer work to finish. Otherwise it might fiddle
1540 * with probes in already freed __init text.
1541 */
1542 wait_for_kprobe_optimizer();
1514 if (warn) 1543 if (warn)
1515 pr_cont("NG: Some tests are failed. Please check them.\n"); 1544 pr_cont("NG: Some tests are failed. Please check them.\n");
1516 else 1545 else
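
Editor's note: with the syntax change above, a kretprobe definition can carry an explicit maxactive, e.g. "r16:myretprobe do_sys_open" tracks up to 16 concurrent return instances. A hedged userspace sketch of writing such a definition; the tracefs path, event name and probed symbol are illustrative assumptions:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* tracefs is commonly mounted here; adjust if needed */
		const char *path = "/sys/kernel/debug/tracing/kprobe_events";
		const char *cmd = "r16:myretprobe do_sys_open\n";
		int fd = open(path, O_WRONLY | O_APPEND);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, cmd, strlen(cmd)) < 0)
			perror("write");
		close(fd);
		return 0;
	}
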
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02a4aeb22c47..08f9bab8089e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 * 5 *
6 */ 6 */
7
8#include <linux/module.h> 7#include <linux/module.h>
9#include <linux/mutex.h> 8#include <linux/mutex.h>
10#include <linux/ftrace.h> 9#include <linux/ftrace.h>
@@ -1161,11 +1160,11 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
1161 1160
1162 trace_assign_type(field, entry); 1161 trace_assign_type(field, entry);
1163 1162
1164 trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", 1163 trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld",
1165 field->seqnum, 1164 field->seqnum,
1166 field->duration, 1165 field->duration,
1167 field->outer_duration, 1166 field->outer_duration,
1168 field->timestamp.tv_sec, 1167 (long long)field->timestamp.tv_sec,
1169 field->timestamp.tv_nsec); 1168 field->timestamp.tv_nsec);
1170 1169
1171 if (field->nmi_count) { 1170 if (field->nmi_count) {
@@ -1195,10 +1194,10 @@ trace_hwlat_raw(struct trace_iterator *iter, int flags,
1195 1194
1196 trace_assign_type(field, iter->ent); 1195 trace_assign_type(field, iter->ent);
1197 1196
1198 trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", 1197 trace_seq_printf(s, "%llu %lld %lld %09ld %u\n",
1199 field->duration, 1198 field->duration,
1200 field->outer_duration, 1199 field->outer_duration,
1201 field->timestamp.tv_sec, 1200 (long long)field->timestamp.tv_sec,
1202 field->timestamp.tv_nsec, 1201 field->timestamp.tv_nsec,
1203 field->seqnum); 1202 field->seqnum);
1204 1203
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5fb1f2c87e6b..76aa04d4c925 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -35,7 +35,7 @@ unsigned long stack_trace_max_size;
35arch_spinlock_t stack_trace_max_lock = 35arch_spinlock_t stack_trace_max_lock =
36 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 36 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
37 37
38static DEFINE_PER_CPU(int, trace_active); 38DEFINE_PER_CPU(int, disable_stack_tracer);
39static DEFINE_MUTEX(stack_sysctl_mutex); 39static DEFINE_MUTEX(stack_sysctl_mutex);
40 40
41int stack_tracer_enabled; 41int stack_tracer_enabled;
@@ -96,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack)
96 if (in_nmi()) 96 if (in_nmi())
97 return; 97 return;
98 98
99 /*
100 * There's a slight chance that we are tracing inside the
101 * RCU infrastructure, and rcu_irq_enter() will not work
102 * as expected.
103 */
104 if (unlikely(rcu_irq_enter_disabled()))
105 return;
106
99 local_irq_save(flags); 107 local_irq_save(flags);
100 arch_spin_lock(&stack_trace_max_lock); 108 arch_spin_lock(&stack_trace_max_lock);
101 109
@@ -207,13 +215,12 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
207 struct ftrace_ops *op, struct pt_regs *pt_regs) 215 struct ftrace_ops *op, struct pt_regs *pt_regs)
208{ 216{
209 unsigned long stack; 217 unsigned long stack;
210 int cpu;
211 218
212 preempt_disable_notrace(); 219 preempt_disable_notrace();
213 220
214 cpu = raw_smp_processor_id();
215 /* no atomic needed, we only modify this variable by this cpu */ 221 /* no atomic needed, we only modify this variable by this cpu */
216 if (per_cpu(trace_active, cpu)++ != 0) 222 __this_cpu_inc(disable_stack_tracer);
223 if (__this_cpu_read(disable_stack_tracer) != 1)
217 goto out; 224 goto out;
218 225
219 ip += MCOUNT_INSN_SIZE; 226 ip += MCOUNT_INSN_SIZE;
@@ -221,7 +228,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
221 check_stack(ip, &stack); 228 check_stack(ip, &stack);
222 229
223 out: 230 out:
224 per_cpu(trace_active, cpu)--; 231 __this_cpu_dec(disable_stack_tracer);
225 /* prevent recursion in schedule */ 232 /* prevent recursion in schedule */
226 preempt_enable_notrace(); 233 preempt_enable_notrace();
227} 234}
@@ -253,7 +260,6 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
253 long *ptr = filp->private_data; 260 long *ptr = filp->private_data;
254 unsigned long val, flags; 261 unsigned long val, flags;
255 int ret; 262 int ret;
256 int cpu;
257 263
258 ret = kstrtoul_from_user(ubuf, count, 10, &val); 264 ret = kstrtoul_from_user(ubuf, count, 10, &val);
259 if (ret) 265 if (ret)
@@ -264,16 +270,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
264 /* 270 /*
265 * In case we trace inside arch_spin_lock() or after (NMI), 271 * In case we trace inside arch_spin_lock() or after (NMI),
266 * we will cause circular lock, so we also need to increase 272 * we will cause circular lock, so we also need to increase
267 * the percpu trace_active here. 273 * the percpu disable_stack_tracer here.
268 */ 274 */
269 cpu = smp_processor_id(); 275 __this_cpu_inc(disable_stack_tracer);
270 per_cpu(trace_active, cpu)++;
271 276
272 arch_spin_lock(&stack_trace_max_lock); 277 arch_spin_lock(&stack_trace_max_lock);
273 *ptr = val; 278 *ptr = val;
274 arch_spin_unlock(&stack_trace_max_lock); 279 arch_spin_unlock(&stack_trace_max_lock);
275 280
276 per_cpu(trace_active, cpu)--; 281 __this_cpu_dec(disable_stack_tracer);
277 local_irq_restore(flags); 282 local_irq_restore(flags);
278 283
279 return count; 284 return count;
@@ -307,12 +312,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
307 312
308static void *t_start(struct seq_file *m, loff_t *pos) 313static void *t_start(struct seq_file *m, loff_t *pos)
309{ 314{
310 int cpu;
311
312 local_irq_disable(); 315 local_irq_disable();
313 316
314 cpu = smp_processor_id(); 317 __this_cpu_inc(disable_stack_tracer);
315 per_cpu(trace_active, cpu)++;
316 318
317 arch_spin_lock(&stack_trace_max_lock); 319 arch_spin_lock(&stack_trace_max_lock);
318 320
@@ -324,12 +326,9 @@ static void *t_start(struct seq_file *m, loff_t *pos)
324 326
325static void t_stop(struct seq_file *m, void *p) 327static void t_stop(struct seq_file *m, void *p)
326{ 328{
327 int cpu;
328
329 arch_spin_unlock(&stack_trace_max_lock); 329 arch_spin_unlock(&stack_trace_max_lock);
330 330
331 cpu = smp_processor_id(); 331 __this_cpu_dec(disable_stack_tracer);
332 per_cpu(trace_active, cpu)--;
333 332
334 local_irq_enable(); 333 local_irq_enable();
335} 334}
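
Editor's note: the stack tracer now guards against recursion with a per-cpu counter manipulated through __this_cpu_inc()/__this_cpu_dec() instead of per_cpu() indexed by smp_processor_id(). The shape of that guard, as a hedged sketch with hypothetical names:

	static DEFINE_PER_CPU(int, my_tracer_disabled);

	static void my_trace_callback(void)
	{
		preempt_disable_notrace();

		/* Only the first (non-nested) entry on this CPU does real work */
		__this_cpu_inc(my_tracer_disabled);
		if (__this_cpu_read(my_tracer_disabled) != 1)
			goto out;

		do_trace_work();	/* hypothetical */
	out:
		__this_cpu_dec(my_tracer_disabled);
		preempt_enable_notrace();
	}
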
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c0168b7da1ea..c74bf39ef764 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool)
3209 INIT_LIST_HEAD(&pool->idle_list); 3209 INIT_LIST_HEAD(&pool->idle_list);
3210 hash_init(pool->busy_hash); 3210 hash_init(pool->busy_hash);
3211 3211
3212 init_timer_deferrable(&pool->idle_timer); 3212 setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
3213 pool->idle_timer.function = idle_worker_timeout; 3213 (unsigned long)pool);
3214 pool->idle_timer.data = (unsigned long)pool;
3215 3214
3216 setup_timer(&pool->mayday_timer, pool_mayday_timeout, 3215 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3217 (unsigned long)pool); 3216 (unsigned long)pool);
@@ -4735,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4735 return wfc.ret; 4734 return wfc.ret;
4736} 4735}
4737EXPORT_SYMBOL_GPL(work_on_cpu); 4736EXPORT_SYMBOL_GPL(work_on_cpu);
4737
4738/**
4739 * work_on_cpu_safe - run a function in thread context on a particular cpu
4740 * @cpu: the cpu to run on
4741 * @fn: the function to run
4742 * @arg: the function argument
4743 *
4744 * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
4745 * any locks which would prevent @fn from completing.
4746 *
4747 * Return: The value @fn returns.
4748 */
4749long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
4750{
4751 long ret = -ENODEV;
4752
4753 get_online_cpus();
4754 if (cpu_online(cpu))
4755 ret = work_on_cpu(cpu, fn, arg);
4756 put_online_cpus();
4757 return ret;
4758}
4759EXPORT_SYMBOL_GPL(work_on_cpu_safe);
4738#endif /* CONFIG_SMP */ 4760#endif /* CONFIG_SMP */
4739 4761
4740#ifdef CONFIG_FREEZER 4762#ifdef CONFIG_FREEZER
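
Editor's note: work_on_cpu_safe() wraps work_on_cpu() in get_online_cpus()/put_online_cpus() so the target CPU cannot be unplugged while the work runs, and returns -ENODEV if that CPU is already offline. A hedged usage sketch; the callback and CPU number are illustrative:

	static long read_some_state(void *arg)	/* hypothetical per-cpu work */
	{
		/* runs in thread context, pinned to the requested CPU */
		return 0;
	}

	static int example(void)
	{
		long ret = work_on_cpu_safe(3, read_some_state, NULL);

		if (ret == -ENODEV)
			pr_warn("CPU 3 is offline\n");
		return ret;
	}
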