Diffstat (limited to 'kernel')
-rw-r--r-- kernel/.gitignore | 1
-rw-r--r-- kernel/Makefile | 7
-rw-r--r-- kernel/audit.c | 365
-rw-r--r-- kernel/audit.h | 15
-rw-r--r-- kernel/audit_tree.c | 20
-rw-r--r-- kernel/audit_watch.c | 24
-rw-r--r-- kernel/auditfilter.c | 93
-rw-r--r-- kernel/auditsc.c | 44
-rw-r--r-- kernel/bounds.c | 2
-rw-r--r-- kernel/capability.c | 2
-rw-r--r-- kernel/cgroup.c | 1239
-rw-r--r-- kernel/cgroup_freezer.c | 7
-rw-r--r-- kernel/context_tracking.c | 8
-rw-r--r-- kernel/cpu/idle.c | 17
-rw-r--r-- kernel/cpuset.c | 79
-rw-r--r-- kernel/events/core.c | 60
-rw-r--r-- kernel/events/ring_buffer.c | 42
-rw-r--r-- kernel/events/uprobes.c | 64
-rw-r--r-- kernel/exit.c | 1
-rw-r--r-- kernel/extable.c | 4
-rw-r--r-- kernel/fork.c | 22
-rw-r--r-- kernel/freezer.c | 6
-rw-r--r-- kernel/futex.c | 210
-rw-r--r-- kernel/hrtimer.c | 3
-rw-r--r-- kernel/irq/pm.c | 2
-rw-r--r-- kernel/kexec.c | 5
-rw-r--r-- kernel/locking/lockdep.c | 4
-rw-r--r-- kernel/locking/mutex-debug.c | 7
-rw-r--r-- kernel/locking/rtmutex-debug.c | 8
-rw-r--r-- kernel/locking/rtmutex.c | 166
-rw-r--r-- kernel/locking/rtmutex_common.h | 23
-rw-r--r-- kernel/module.c | 6
-rw-r--r-- kernel/padata.c | 11
-rw-r--r-- kernel/panic.c | 2
-rw-r--r-- kernel/params.c | 25
-rw-r--r-- kernel/posix-cpu-timers.c | 327
-rw-r--r-- kernel/power/console.c | 1
-rw-r--r-- kernel/power/snapshot.c | 2
-rw-r--r-- kernel/printk/printk.c | 10
-rw-r--r-- kernel/rcu/rcu.h | 5
-rw-r--r-- kernel/rcu/srcu.c | 57
-rw-r--r-- kernel/rcu/torture.c | 75
-rw-r--r-- kernel/rcu/tree.c | 97
-rw-r--r-- kernel/rcu/tree.h | 12
-rw-r--r-- kernel/rcu/tree_plugin.h | 106
-rw-r--r-- kernel/rcu/tree_trace.c | 3
-rw-r--r-- kernel/rcu/update.c | 5
-rw-r--r-- kernel/reboot.c | 2
-rw-r--r-- kernel/sched/Makefile | 5
-rw-r--r-- kernel/sched/clock.c | 78
-rw-r--r-- kernel/sched/core.c | 847
-rw-r--r-- kernel/sched/cpuacct.c | 18
-rw-r--r-- kernel/sched/cpudeadline.c | 216
-rw-r--r-- kernel/sched/cpudeadline.h | 33
-rw-r--r-- kernel/sched/deadline.c | 1640
-rw-r--r-- kernel/sched/debug.c | 4
-rw-r--r-- kernel/sched/fair.c | 269
-rw-r--r-- kernel/sched/rt.c | 16
-rw-r--r-- kernel/sched/sched.h | 146
-rw-r--r-- kernel/sched/stop_task.c | 2
-rw-r--r-- kernel/softirq.c | 92
-rw-r--r-- kernel/sysctl.c | 18
-rw-r--r-- kernel/system_certificates.S | 14
-rw-r--r-- kernel/system_keyring.c | 4
-rw-r--r-- kernel/time/sched_clock.c | 6
-rw-r--r-- kernel/time/tick-broadcast.c | 6
-rw-r--r-- kernel/time/tick-common.c | 16
-rw-r--r-- kernel/time/tick-internal.h | 5
-rw-r--r-- kernel/time/tick-sched.c | 67
-rw-r--r-- kernel/time/timekeeping.c | 55
-rw-r--r-- kernel/timer.c | 5
-rw-r--r-- kernel/trace/Makefile | 1
-rw-r--r-- kernel/trace/ftrace.c | 281
-rw-r--r-- kernel/trace/ring_buffer.c | 2
-rw-r--r-- kernel/trace/trace.c | 57
-rw-r--r-- kernel/trace/trace.h | 193
-rw-r--r-- kernel/trace/trace_event_perf.c | 8
-rw-r--r-- kernel/trace/trace_events.c | 52
-rw-r--r-- kernel/trace/trace_events_filter.c | 12
-rw-r--r-- kernel/trace/trace_events_trigger.c | 1437
-rw-r--r-- kernel/trace/trace_kprobe.c | 838
-rw-r--r-- kernel/trace/trace_probe.c | 440
-rw-r--r-- kernel/trace/trace_probe.h | 224
-rw-r--r-- kernel/trace/trace_sched_wakeup.c | 65
-rw-r--r-- kernel/trace/trace_selftest.c | 33
-rw-r--r-- kernel/trace/trace_stack.c | 2
-rw-r--r-- kernel/trace/trace_syscalls.c | 24
-rw-r--r-- kernel/trace/trace_uprobe.c | 487
-rw-r--r-- kernel/user.c | 6
-rw-r--r-- kernel/workqueue.c | 84
90 files changed, 8186 insertions(+), 2918 deletions(-)
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..790d83c7d160 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,3 +5,4 @@ config_data.h
5config_data.gz 5config_data.gz
6timeconst.h 6timeconst.h
7hz.bc 7hz.bc
8x509_certificate_list
diff --git a/kernel/Makefile b/kernel/Makefile
index bbaf7d59c1bb..bc010ee272b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -137,9 +137,10 @@ $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
137############################################################################### 137###############################################################################
138ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) 138ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
139X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) 139X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
140X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 140X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
141X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ 141X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
142 $(or $(realpath $(CERT)),$(CERT)))) 142 $(or $(realpath $(CERT)),$(CERT))))
143X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
143 144
144ifeq ($(X509_CERTIFICATES),) 145ifeq ($(X509_CERTIFICATES),)
145$(warning *** No X.509 certificates found ***) 146$(warning *** No X.509 certificates found ***)
@@ -164,9 +165,9 @@ $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
164targets += $(obj)/.x509.list 165targets += $(obj)/.x509.list
165$(obj)/.x509.list: 166$(obj)/.x509.list:
166 @echo $(X509_CERTIFICATES) >$@ 167 @echo $(X509_CERTIFICATES) >$@
168endif
167 169
168clean-files := x509_certificate_list .x509.list 170clean-files := x509_certificate_list .x509.list
169endif
170 171
171ifeq ($(CONFIG_MODULE_SIG),y) 172ifeq ($(CONFIG_MODULE_SIG),y)
172############################################################################### 173###############################################################################
diff --git a/kernel/audit.c b/kernel/audit.c
index 906ae5a0233a..34c5a2310fbf 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -41,6 +41,8 @@
41 * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ 41 * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
42 */ 42 */
43 43
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45
44#include <linux/init.h> 46#include <linux/init.h>
45#include <asm/types.h> 47#include <asm/types.h>
46#include <linux/atomic.h> 48#include <linux/atomic.h>
@@ -63,6 +65,7 @@
63#include <linux/freezer.h> 65#include <linux/freezer.h>
64#include <linux/tty.h> 66#include <linux/tty.h>
65#include <linux/pid_namespace.h> 67#include <linux/pid_namespace.h>
68#include <net/netns/generic.h>
66 69
67#include "audit.h" 70#include "audit.h"
68 71
@@ -76,16 +79,16 @@ static int audit_initialized;
76#define AUDIT_OFF 0 79#define AUDIT_OFF 0
77#define AUDIT_ON 1 80#define AUDIT_ON 1
78#define AUDIT_LOCKED 2 81#define AUDIT_LOCKED 2
79int audit_enabled; 82u32 audit_enabled;
80int audit_ever_enabled; 83u32 audit_ever_enabled;
81 84
82EXPORT_SYMBOL_GPL(audit_enabled); 85EXPORT_SYMBOL_GPL(audit_enabled);
83 86
84/* Default state when kernel boots without any parameters. */ 87/* Default state when kernel boots without any parameters. */
85static int audit_default; 88static u32 audit_default;
86 89
87/* If auditing cannot proceed, audit_failure selects what happens. */ 90/* If auditing cannot proceed, audit_failure selects what happens. */
88static int audit_failure = AUDIT_FAIL_PRINTK; 91static u32 audit_failure = AUDIT_FAIL_PRINTK;
89 92
90/* 93/*
91 * If audit records are to be written to the netlink socket, audit_pid 94 * If audit records are to be written to the netlink socket, audit_pid
@@ -93,17 +96,19 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
93 * the portid to use to send netlink messages to that process. 96 * the portid to use to send netlink messages to that process.
94 */ 97 */
95int audit_pid; 98int audit_pid;
96static int audit_nlk_portid; 99static __u32 audit_nlk_portid;
97 100
98/* If audit_rate_limit is non-zero, limit the rate of sending audit records 101/* If audit_rate_limit is non-zero, limit the rate of sending audit records
99 * to that number per second. This prevents DoS attacks, but results in 102 * to that number per second. This prevents DoS attacks, but results in
100 * audit records being dropped. */ 103 * audit records being dropped. */
101static int audit_rate_limit; 104static u32 audit_rate_limit;
102 105
103/* Number of outstanding audit_buffers allowed. */ 106/* Number of outstanding audit_buffers allowed.
104static int audit_backlog_limit = 64; 107 * When set to zero, this means unlimited. */
105static int audit_backlog_wait_time = 60 * HZ; 108static u32 audit_backlog_limit = 64;
106static int audit_backlog_wait_overflow = 0; 109#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
110static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
111static u32 audit_backlog_wait_overflow = 0;
107 112
108/* The identity of the user shutting down the audit system. */ 113/* The identity of the user shutting down the audit system. */
109kuid_t audit_sig_uid = INVALID_UID; 114kuid_t audit_sig_uid = INVALID_UID;
@@ -121,6 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
121 126
122/* The netlink socket. */ 127/* The netlink socket. */
123static struct sock *audit_sock; 128static struct sock *audit_sock;
129int audit_net_id;
124 130
125/* Hash for inode-based rules */ 131/* Hash for inode-based rules */
126struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 132struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -175,27 +181,27 @@ struct audit_buffer {
175}; 181};
176 182
177struct audit_reply { 183struct audit_reply {
178 int pid; 184 __u32 portid;
185 pid_t pid;
179 struct sk_buff *skb; 186 struct sk_buff *skb;
180}; 187};
181 188
182static void audit_set_pid(struct audit_buffer *ab, pid_t pid) 189static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
183{ 190{
184 if (ab) { 191 if (ab) {
185 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 192 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
186 nlh->nlmsg_pid = pid; 193 nlh->nlmsg_pid = portid;
187 } 194 }
188} 195}
189 196
190void audit_panic(const char *message) 197void audit_panic(const char *message)
191{ 198{
192 switch (audit_failure) 199 switch (audit_failure) {
193 {
194 case AUDIT_FAIL_SILENT: 200 case AUDIT_FAIL_SILENT:
195 break; 201 break;
196 case AUDIT_FAIL_PRINTK: 202 case AUDIT_FAIL_PRINTK:
197 if (printk_ratelimit()) 203 if (printk_ratelimit())
198 printk(KERN_ERR "audit: %s\n", message); 204 pr_err("%s\n", message);
199 break; 205 break;
200 case AUDIT_FAIL_PANIC: 206 case AUDIT_FAIL_PANIC:
201 /* test audit_pid since printk is always losey, why bother? */ 207 /* test audit_pid since printk is always losey, why bother? */
@@ -266,9 +272,7 @@ void audit_log_lost(const char *message)
266 272
267 if (print) { 273 if (print) {
268 if (printk_ratelimit()) 274 if (printk_ratelimit())
269 printk(KERN_WARNING 275 pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
270 "audit: audit_lost=%d audit_rate_limit=%d "
271 "audit_backlog_limit=%d\n",
272 atomic_read(&audit_lost), 276 atomic_read(&audit_lost),
273 audit_rate_limit, 277 audit_rate_limit,
274 audit_backlog_limit); 278 audit_backlog_limit);
@@ -276,7 +280,7 @@ void audit_log_lost(const char *message)
276 } 280 }
277} 281}
278 282
279static int audit_log_config_change(char *function_name, int new, int old, 283static int audit_log_config_change(char *function_name, u32 new, u32 old,
280 int allow_changes) 284 int allow_changes)
281{ 285{
282 struct audit_buffer *ab; 286 struct audit_buffer *ab;
@@ -285,7 +289,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
285 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 289 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
286 if (unlikely(!ab)) 290 if (unlikely(!ab))
287 return rc; 291 return rc;
288 audit_log_format(ab, "%s=%d old=%d", function_name, new, old); 292 audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
289 audit_log_session_info(ab); 293 audit_log_session_info(ab);
290 rc = audit_log_task_context(ab); 294 rc = audit_log_task_context(ab);
291 if (rc) 295 if (rc)
@@ -295,9 +299,10 @@ static int audit_log_config_change(char *function_name, int new, int old,
295 return rc; 299 return rc;
296} 300}
297 301
298static int audit_do_config_change(char *function_name, int *to_change, int new) 302static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
299{ 303{
300 int allow_changes, rc = 0, old = *to_change; 304 int allow_changes, rc = 0;
305 u32 old = *to_change;
301 306
302 /* check if we are locked */ 307 /* check if we are locked */
303 if (audit_enabled == AUDIT_LOCKED) 308 if (audit_enabled == AUDIT_LOCKED)
@@ -320,17 +325,23 @@ static int audit_do_config_change(char *function_name, int *to_change, int new)
320 return rc; 325 return rc;
321} 326}
322 327
323static int audit_set_rate_limit(int limit) 328static int audit_set_rate_limit(u32 limit)
324{ 329{
325 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); 330 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
326} 331}
327 332
328static int audit_set_backlog_limit(int limit) 333static int audit_set_backlog_limit(u32 limit)
329{ 334{
330 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); 335 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
331} 336}
332 337
333static int audit_set_enabled(int state) 338static int audit_set_backlog_wait_time(u32 timeout)
339{
340 return audit_do_config_change("audit_backlog_wait_time",
341 &audit_backlog_wait_time, timeout);
342}
343
344static int audit_set_enabled(u32 state)
334{ 345{
335 int rc; 346 int rc;
336 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 347 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
@@ -343,7 +354,7 @@ static int audit_set_enabled(int state)
343 return rc; 354 return rc;
344} 355}
345 356
346static int audit_set_failure(int state) 357static int audit_set_failure(u32 state)
347{ 358{
348 if (state != AUDIT_FAIL_SILENT 359 if (state != AUDIT_FAIL_SILENT
349 && state != AUDIT_FAIL_PRINTK 360 && state != AUDIT_FAIL_PRINTK
@@ -365,7 +376,8 @@ static int audit_set_failure(int state)
365static void audit_hold_skb(struct sk_buff *skb) 376static void audit_hold_skb(struct sk_buff *skb)
366{ 377{
367 if (audit_default && 378 if (audit_default &&
368 skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) 379 (!audit_backlog_limit ||
380 skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
369 skb_queue_tail(&audit_skb_hold_queue, skb); 381 skb_queue_tail(&audit_skb_hold_queue, skb);
370 else 382 else
371 kfree_skb(skb); 383 kfree_skb(skb);
@@ -382,7 +394,7 @@ static void audit_printk_skb(struct sk_buff *skb)
382 394
383 if (nlh->nlmsg_type != AUDIT_EOE) { 395 if (nlh->nlmsg_type != AUDIT_EOE) {
384 if (printk_ratelimit()) 396 if (printk_ratelimit())
385 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data); 397 pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
386 else 398 else
387 audit_log_lost("printk limit exceeded\n"); 399 audit_log_lost("printk limit exceeded\n");
388 } 400 }
@@ -398,9 +410,12 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); 410 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
399 if (err < 0) { 411 if (err < 0) {
400 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 412 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
401 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 413 if (audit_pid) {
402 audit_log_lost("auditd disappeared\n"); 414 pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_pid = 0; 415 audit_log_lost("auditd disappeared\n");
416 audit_pid = 0;
417 audit_sock = NULL;
418 }
404 /* we might get lucky and get this in the next auditd */ 419 /* we might get lucky and get this in the next auditd */
405 audit_hold_skb(skb); 420 audit_hold_skb(skb);
406 } else 421 } else
@@ -457,8 +472,10 @@ static int kauditd_thread(void *dummy)
457 flush_hold_queue(); 472 flush_hold_queue();
458 473
459 skb = skb_dequeue(&audit_skb_queue); 474 skb = skb_dequeue(&audit_skb_queue);
460 wake_up(&audit_backlog_wait); 475
461 if (skb) { 476 if (skb) {
477 if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
478 wake_up(&audit_backlog_wait);
462 if (audit_pid) 479 if (audit_pid)
463 kauditd_send_skb(skb); 480 kauditd_send_skb(skb);
464 else 481 else
@@ -482,22 +499,23 @@ static int kauditd_thread(void *dummy)
482int audit_send_list(void *_dest) 499int audit_send_list(void *_dest)
483{ 500{
484 struct audit_netlink_list *dest = _dest; 501 struct audit_netlink_list *dest = _dest;
485 int pid = dest->pid;
486 struct sk_buff *skb; 502 struct sk_buff *skb;
503 struct net *net = get_net_ns_by_pid(dest->pid);
504 struct audit_net *aunet = net_generic(net, audit_net_id);
487 505
488 /* wait for parent to finish and send an ACK */ 506 /* wait for parent to finish and send an ACK */
489 mutex_lock(&audit_cmd_mutex); 507 mutex_lock(&audit_cmd_mutex);
490 mutex_unlock(&audit_cmd_mutex); 508 mutex_unlock(&audit_cmd_mutex);
491 509
492 while ((skb = __skb_dequeue(&dest->q)) != NULL) 510 while ((skb = __skb_dequeue(&dest->q)) != NULL)
493 netlink_unicast(audit_sock, skb, pid, 0); 511 netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
494 512
495 kfree(dest); 513 kfree(dest);
496 514
497 return 0; 515 return 0;
498} 516}
499 517
500struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 518struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
501 int multi, const void *payload, int size) 519 int multi, const void *payload, int size)
502{ 520{
503 struct sk_buff *skb; 521 struct sk_buff *skb;
@@ -510,7 +528,7 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
510 if (!skb) 528 if (!skb)
511 return NULL; 529 return NULL;
512 530
513 nlh = nlmsg_put(skb, pid, seq, t, size, flags); 531 nlh = nlmsg_put(skb, portid, seq, t, size, flags);
514 if (!nlh) 532 if (!nlh)
515 goto out_kfree_skb; 533 goto out_kfree_skb;
516 data = nlmsg_data(nlh); 534 data = nlmsg_data(nlh);
@@ -525,19 +543,21 @@ out_kfree_skb:
525static int audit_send_reply_thread(void *arg) 543static int audit_send_reply_thread(void *arg)
526{ 544{
527 struct audit_reply *reply = (struct audit_reply *)arg; 545 struct audit_reply *reply = (struct audit_reply *)arg;
546 struct net *net = get_net_ns_by_pid(reply->pid);
547 struct audit_net *aunet = net_generic(net, audit_net_id);
528 548
529 mutex_lock(&audit_cmd_mutex); 549 mutex_lock(&audit_cmd_mutex);
530 mutex_unlock(&audit_cmd_mutex); 550 mutex_unlock(&audit_cmd_mutex);
531 551
532 /* Ignore failure. It'll only happen if the sender goes away, 552 /* Ignore failure. It'll only happen if the sender goes away,
533 because our timeout is set to infinite. */ 553 because our timeout is set to infinite. */
534 netlink_unicast(audit_sock, reply->skb, reply->pid, 0); 554 netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
535 kfree(reply); 555 kfree(reply);
536 return 0; 556 return 0;
537} 557}
538/** 558/**
539 * audit_send_reply - send an audit reply message via netlink 559 * audit_send_reply - send an audit reply message via netlink
540 * @pid: process id to send reply to 560 * @portid: netlink port to which to send reply
541 * @seq: sequence number 561 * @seq: sequence number
542 * @type: audit message type 562 * @type: audit message type
543 * @done: done (last) flag 563 * @done: done (last) flag
@@ -545,11 +565,11 @@ static int audit_send_reply_thread(void *arg)
545 * @payload: payload data 565 * @payload: payload data
546 * @size: payload size 566 * @size: payload size
547 * 567 *
548 * Allocates an skb, builds the netlink message, and sends it to the pid. 568 * Allocates an skb, builds the netlink message, and sends it to the port id.
549 * No failure notifications. 569 * No failure notifications.
550 */ 570 */
551static void audit_send_reply(int pid, int seq, int type, int done, int multi, 571static void audit_send_reply(__u32 portid, int seq, int type, int done,
552 const void *payload, int size) 572 int multi, const void *payload, int size)
553{ 573{
554 struct sk_buff *skb; 574 struct sk_buff *skb;
555 struct task_struct *tsk; 575 struct task_struct *tsk;
@@ -559,11 +579,12 @@ static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 if (!reply) 579 if (!reply)
560 return; 580 return;
561 581
562 skb = audit_make_reply(pid, seq, type, done, multi, payload, size); 582 skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
563 if (!skb) 583 if (!skb)
564 goto out; 584 goto out;
565 585
566 reply->pid = pid; 586 reply->portid = portid;
587 reply->pid = task_pid_vnr(current);
567 reply->skb = skb; 588 reply->skb = skb;
568 589
569 tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); 590 tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -663,8 +684,12 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
663{ 684{
664 struct audit_buffer *ab; 685 struct audit_buffer *ab;
665 686
687 if (audit_enabled == AUDIT_OFF)
688 return;
689
666 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); 690 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
667 audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", 691 audit_log_task_info(ab, current);
692 audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
668 audit_feature_names[which], !!old_feature, !!new_feature, 693 audit_feature_names[which], !!old_feature, !!new_feature,
669 !!old_lock, !!new_lock, res); 694 !!old_lock, !!new_lock, res);
670 audit_log_end(ab); 695 audit_log_end(ab);
@@ -694,7 +719,7 @@ static int audit_set_feature(struct sk_buff *skb)
694 old_lock = af.lock & feature; 719 old_lock = af.lock & feature;
695 720
696 /* are we changing a locked feature? */ 721 /* are we changing a locked feature? */
697 if ((af.lock & feature) && (new_feature != old_feature)) { 722 if (old_lock && (new_feature != old_feature)) {
698 audit_log_feature_change(i, old_feature, new_feature, 723 audit_log_feature_change(i, old_feature, new_feature,
699 old_lock, new_lock, 0); 724 old_lock, new_lock, 0);
700 return -EPERM; 725 return -EPERM;
@@ -732,7 +757,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
732{ 757{
733 u32 seq; 758 u32 seq;
734 void *data; 759 void *data;
735 struct audit_status *status_get, status_set;
736 int err; 760 int err;
737 struct audit_buffer *ab; 761 struct audit_buffer *ab;
738 u16 msg_type = nlh->nlmsg_type; 762 u16 msg_type = nlh->nlmsg_type;
@@ -758,48 +782,70 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
758 data = nlmsg_data(nlh); 782 data = nlmsg_data(nlh);
759 783
760 switch (msg_type) { 784 switch (msg_type) {
761 case AUDIT_GET: 785 case AUDIT_GET: {
762 memset(&status_set, 0, sizeof(status_set)); 786 struct audit_status s;
763 status_set.enabled = audit_enabled; 787 memset(&s, 0, sizeof(s));
764 status_set.failure = audit_failure; 788 s.enabled = audit_enabled;
765 status_set.pid = audit_pid; 789 s.failure = audit_failure;
766 status_set.rate_limit = audit_rate_limit; 790 s.pid = audit_pid;
767 status_set.backlog_limit = audit_backlog_limit; 791 s.rate_limit = audit_rate_limit;
768 status_set.lost = atomic_read(&audit_lost); 792 s.backlog_limit = audit_backlog_limit;
769 status_set.backlog = skb_queue_len(&audit_skb_queue); 793 s.lost = atomic_read(&audit_lost);
794 s.backlog = skb_queue_len(&audit_skb_queue);
795 s.version = AUDIT_VERSION_LATEST;
796 s.backlog_wait_time = audit_backlog_wait_time;
770 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, 797 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
771 &status_set, sizeof(status_set)); 798 &s, sizeof(s));
772 break; 799 break;
773 case AUDIT_SET: 800 }
774 if (nlmsg_len(nlh) < sizeof(struct audit_status)) 801 case AUDIT_SET: {
775 return -EINVAL; 802 struct audit_status s;
776 status_get = (struct audit_status *)data; 803 memset(&s, 0, sizeof(s));
777 if (status_get->mask & AUDIT_STATUS_ENABLED) { 804 /* guard against past and future API changes */
778 err = audit_set_enabled(status_get->enabled); 805 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
806 if (s.mask & AUDIT_STATUS_ENABLED) {
807 err = audit_set_enabled(s.enabled);
779 if (err < 0) 808 if (err < 0)
780 return err; 809 return err;
781 } 810 }
782 if (status_get->mask & AUDIT_STATUS_FAILURE) { 811 if (s.mask & AUDIT_STATUS_FAILURE) {
783 err = audit_set_failure(status_get->failure); 812 err = audit_set_failure(s.failure);
784 if (err < 0) 813 if (err < 0)
785 return err; 814 return err;
786 } 815 }
787 if (status_get->mask & AUDIT_STATUS_PID) { 816 if (s.mask & AUDIT_STATUS_PID) {
788 int new_pid = status_get->pid; 817 int new_pid = s.pid;
789 818
819 if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
820 return -EACCES;
790 if (audit_enabled != AUDIT_OFF) 821 if (audit_enabled != AUDIT_OFF)
791 audit_log_config_change("audit_pid", new_pid, audit_pid, 1); 822 audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
792 audit_pid = new_pid; 823 audit_pid = new_pid;
793 audit_nlk_portid = NETLINK_CB(skb).portid; 824 audit_nlk_portid = NETLINK_CB(skb).portid;
825 audit_sock = skb->sk;
794 } 826 }
795 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { 827 if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
796 err = audit_set_rate_limit(status_get->rate_limit); 828 err = audit_set_rate_limit(s.rate_limit);
829 if (err < 0)
830 return err;
831 }
832 if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
833 err = audit_set_backlog_limit(s.backlog_limit);
834 if (err < 0)
835 return err;
836 }
837 if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
838 if (sizeof(s) > (size_t)nlh->nlmsg_len)
839 return -EINVAL;
840 if (s.backlog_wait_time < 0 ||
841 s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
842 return -EINVAL;
843 err = audit_set_backlog_wait_time(s.backlog_wait_time);
797 if (err < 0) 844 if (err < 0)
798 return err; 845 return err;
799 } 846 }
800 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
801 err = audit_set_backlog_limit(status_get->backlog_limit);
802 break; 847 break;
848 }
803 case AUDIT_GET_FEATURE: 849 case AUDIT_GET_FEATURE:
804 err = audit_get_feature(skb); 850 err = audit_get_feature(skb);
805 if (err) 851 if (err)
@@ -817,13 +863,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
817 return 0; 863 return 0;
818 864
819 err = audit_filter_user(msg_type); 865 err = audit_filter_user(msg_type);
820 if (err == 1) { 866 if (err == 1) { /* match or error */
821 err = 0; 867 err = 0;
822 if (msg_type == AUDIT_USER_TTY) { 868 if (msg_type == AUDIT_USER_TTY) {
823 err = tty_audit_push_current(); 869 err = tty_audit_push_current();
824 if (err) 870 if (err)
825 break; 871 break;
826 } 872 }
873 mutex_unlock(&audit_cmd_mutex);
827 audit_log_common_recv_msg(&ab, msg_type); 874 audit_log_common_recv_msg(&ab, msg_type);
828 if (msg_type != AUDIT_USER_TTY) 875 if (msg_type != AUDIT_USER_TTY)
829 audit_log_format(ab, " msg='%.*s'", 876 audit_log_format(ab, " msg='%.*s'",
@@ -839,8 +886,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
839 size--; 886 size--;
840 audit_log_n_untrustedstring(ab, data, size); 887 audit_log_n_untrustedstring(ab, data, size);
841 } 888 }
842 audit_set_pid(ab, NETLINK_CB(skb).portid); 889 audit_set_portid(ab, NETLINK_CB(skb).portid);
843 audit_log_end(ab); 890 audit_log_end(ab);
891 mutex_lock(&audit_cmd_mutex);
844 } 892 }
845 break; 893 break;
846 case AUDIT_ADD_RULE: 894 case AUDIT_ADD_RULE:
@@ -853,11 +901,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
853 audit_log_end(ab); 901 audit_log_end(ab);
854 return -EPERM; 902 return -EPERM;
855 } 903 }
856 /* fallthrough */ 904 err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
857 case AUDIT_LIST_RULES:
858 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
859 seq, data, nlmsg_len(nlh)); 905 seq, data, nlmsg_len(nlh));
860 break; 906 break;
907 case AUDIT_LIST_RULES:
908 err = audit_list_rules_send(NETLINK_CB(skb).portid, seq);
909 break;
861 case AUDIT_TRIM: 910 case AUDIT_TRIM:
862 audit_trim_trees(); 911 audit_trim_trees();
863 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); 912 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
@@ -939,20 +988,33 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
939 break; 988 break;
940 } 989 }
941 case AUDIT_TTY_SET: { 990 case AUDIT_TTY_SET: {
942 struct audit_tty_status s; 991 struct audit_tty_status s, old;
943 struct task_struct *tsk = current; 992 struct task_struct *tsk = current;
993 struct audit_buffer *ab;
944 994
945 memset(&s, 0, sizeof(s)); 995 memset(&s, 0, sizeof(s));
946 /* guard against past and future API changes */ 996 /* guard against past and future API changes */
947 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); 997 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
998 /* check if new data is valid */
948 if ((s.enabled != 0 && s.enabled != 1) || 999 if ((s.enabled != 0 && s.enabled != 1) ||
949 (s.log_passwd != 0 && s.log_passwd != 1)) 1000 (s.log_passwd != 0 && s.log_passwd != 1))
950 return -EINVAL; 1001 err = -EINVAL;
951 1002
952 spin_lock(&tsk->sighand->siglock); 1003 spin_lock(&tsk->sighand->siglock);
953 tsk->signal->audit_tty = s.enabled; 1004 old.enabled = tsk->signal->audit_tty;
954 tsk->signal->audit_tty_log_passwd = s.log_passwd; 1005 old.log_passwd = tsk->signal->audit_tty_log_passwd;
1006 if (!err) {
1007 tsk->signal->audit_tty = s.enabled;
1008 tsk->signal->audit_tty_log_passwd = s.log_passwd;
1009 }
955 spin_unlock(&tsk->sighand->siglock); 1010 spin_unlock(&tsk->sighand->siglock);
1011
1012 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
1013 audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
1014 " old-log_passwd=%d new-log_passwd=%d res=%d",
1015 old.enabled, s.enabled, old.log_passwd,
1016 s.log_passwd, !err);
1017 audit_log_end(ab);
956 break; 1018 break;
957 } 1019 }
958 default: 1020 default:
@@ -998,24 +1060,55 @@ static void audit_receive(struct sk_buff *skb)
998 mutex_unlock(&audit_cmd_mutex); 1060 mutex_unlock(&audit_cmd_mutex);
999} 1061}
1000 1062
1001/* Initialize audit support at boot time. */ 1063static int __net_init audit_net_init(struct net *net)
1002static int __init audit_init(void)
1003{ 1064{
1004 int i;
1005 struct netlink_kernel_cfg cfg = { 1065 struct netlink_kernel_cfg cfg = {
1006 .input = audit_receive, 1066 .input = audit_receive,
1007 }; 1067 };
1008 1068
1069 struct audit_net *aunet = net_generic(net, audit_net_id);
1070
1071 aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
1072 if (aunet->nlsk == NULL) {
1073 audit_panic("cannot initialize netlink socket in namespace");
1074 return -ENOMEM;
1075 }
1076 aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1077 return 0;
1078}
1079
1080static void __net_exit audit_net_exit(struct net *net)
1081{
1082 struct audit_net *aunet = net_generic(net, audit_net_id);
1083 struct sock *sock = aunet->nlsk;
1084 if (sock == audit_sock) {
1085 audit_pid = 0;
1086 audit_sock = NULL;
1087 }
1088
1089 rcu_assign_pointer(aunet->nlsk, NULL);
1090 synchronize_net();
1091 netlink_kernel_release(sock);
1092}
1093
1094static struct pernet_operations audit_net_ops __net_initdata = {
1095 .init = audit_net_init,
1096 .exit = audit_net_exit,
1097 .id = &audit_net_id,
1098 .size = sizeof(struct audit_net),
1099};
1100
1101/* Initialize audit support at boot time. */
1102static int __init audit_init(void)
1103{
1104 int i;
1105
1009 if (audit_initialized == AUDIT_DISABLED) 1106 if (audit_initialized == AUDIT_DISABLED)
1010 return 0; 1107 return 0;
1011 1108
1012 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 1109 pr_info("initializing netlink subsys (%s)\n",
1013 audit_default ? "enabled" : "disabled"); 1110 audit_default ? "enabled" : "disabled");
1014 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); 1111 register_pernet_subsys(&audit_net_ops);
1015 if (!audit_sock)
1016 audit_panic("cannot initialize netlink socket");
1017 else
1018 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1019 1112
1020 skb_queue_head_init(&audit_skb_queue); 1113 skb_queue_head_init(&audit_skb_queue);
1021 skb_queue_head_init(&audit_skb_hold_queue); 1114 skb_queue_head_init(&audit_skb_hold_queue);
@@ -1039,22 +1132,32 @@ static int __init audit_enable(char *str)
1039 if (!audit_default) 1132 if (!audit_default)
1040 audit_initialized = AUDIT_DISABLED; 1133 audit_initialized = AUDIT_DISABLED;
1041 1134
1042 printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled"); 1135 pr_info("%s\n", audit_default ?
1136 "enabled (after initialization)" : "disabled (until reboot)");
1043 1137
1044 if (audit_initialized == AUDIT_INITIALIZED) { 1138 return 1;
1045 audit_enabled = audit_default; 1139}
1046 audit_ever_enabled |= !!audit_default; 1140__setup("audit=", audit_enable);
1047 } else if (audit_initialized == AUDIT_UNINITIALIZED) { 1141
1048 printk(" (after initialization)"); 1142/* Process kernel command-line parameter at boot time.
1049 } else { 1143 * audit_backlog_limit=<n> */
1050 printk(" (until reboot)"); 1144static int __init audit_backlog_limit_set(char *str)
1145{
1146 u32 audit_backlog_limit_arg;
1147
1148 pr_info("audit_backlog_limit: ");
1149 if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
1150 pr_cont("using default of %u, unable to parse %s\n",
1151 audit_backlog_limit, str);
1152 return 1;
1051 } 1153 }
1052 printk("\n"); 1154
1155 audit_backlog_limit = audit_backlog_limit_arg;
1156 pr_cont("%d\n", audit_backlog_limit);
1053 1157
1054 return 1; 1158 return 1;
1055} 1159}
1056 1160__setup("audit_backlog_limit=", audit_backlog_limit_set);
1057__setup("audit=", audit_enable);
1058 1161
1059static void audit_buffer_free(struct audit_buffer *ab) 1162static void audit_buffer_free(struct audit_buffer *ab)
1060{ 1163{
@@ -1165,18 +1268,20 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1165/* 1268/*
1166 * Wait for auditd to drain the queue a little 1269 * Wait for auditd to drain the queue a little
1167 */ 1270 */
1168static void wait_for_auditd(unsigned long sleep_time) 1271static long wait_for_auditd(long sleep_time)
1169{ 1272{
1170 DECLARE_WAITQUEUE(wait, current); 1273 DECLARE_WAITQUEUE(wait, current);
1171 set_current_state(TASK_UNINTERRUPTIBLE); 1274 set_current_state(TASK_UNINTERRUPTIBLE);
1172 add_wait_queue(&audit_backlog_wait, &wait); 1275 add_wait_queue_exclusive(&audit_backlog_wait, &wait);
1173 1276
1174 if (audit_backlog_limit && 1277 if (audit_backlog_limit &&
1175 skb_queue_len(&audit_skb_queue) > audit_backlog_limit) 1278 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1176 schedule_timeout(sleep_time); 1279 sleep_time = schedule_timeout(sleep_time);
1177 1280
1178 __set_current_state(TASK_RUNNING); 1281 __set_current_state(TASK_RUNNING);
1179 remove_wait_queue(&audit_backlog_wait, &wait); 1282 remove_wait_queue(&audit_backlog_wait, &wait);
1283
1284 return sleep_time;
1180} 1285}
1181 1286
1182/** 1287/**
@@ -1200,7 +1305,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1200 struct audit_buffer *ab = NULL; 1305 struct audit_buffer *ab = NULL;
1201 struct timespec t; 1306 struct timespec t;
1202 unsigned int uninitialized_var(serial); 1307 unsigned int uninitialized_var(serial);
1203 int reserve; 1308 int reserve = 5; /* Allow atomic callers to go up to five
1309 entries over the normal backlog limit */
1204 unsigned long timeout_start = jiffies; 1310 unsigned long timeout_start = jiffies;
1205 1311
1206 if (audit_initialized != AUDIT_INITIALIZED) 1312 if (audit_initialized != AUDIT_INITIALIZED)
@@ -1209,36 +1315,37 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1209 if (unlikely(audit_filter_type(type))) 1315 if (unlikely(audit_filter_type(type)))
1210 return NULL; 1316 return NULL;
1211 1317
1212 if (gfp_mask & __GFP_WAIT) 1318 if (gfp_mask & __GFP_WAIT) {
1213 reserve = 0; 1319 if (audit_pid && audit_pid == current->pid)
1214 else 1320 gfp_mask &= ~__GFP_WAIT;
1215 reserve = 5; /* Allow atomic callers to go up to five 1321 else
1216 entries over the normal backlog limit */ 1322 reserve = 0;
1323 }
1217 1324
1218 while (audit_backlog_limit 1325 while (audit_backlog_limit
1219 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1326 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1220 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { 1327 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
1221 unsigned long sleep_time; 1328 long sleep_time;
1222 1329
1223 sleep_time = timeout_start + audit_backlog_wait_time - 1330 sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
1224 jiffies; 1331 if (sleep_time > 0) {
1225 if ((long)sleep_time > 0) { 1332 sleep_time = wait_for_auditd(sleep_time);
1226 wait_for_auditd(sleep_time); 1333 if (sleep_time > 0)
1227 continue; 1334 continue;
1228 } 1335 }
1229 } 1336 }
1230 if (audit_rate_check() && printk_ratelimit()) 1337 if (audit_rate_check() && printk_ratelimit())
1231 printk(KERN_WARNING 1338 pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
1232 "audit: audit_backlog=%d > " 1339 skb_queue_len(&audit_skb_queue),
1233 "audit_backlog_limit=%d\n", 1340 audit_backlog_limit);
1234 skb_queue_len(&audit_skb_queue),
1235 audit_backlog_limit);
1236 audit_log_lost("backlog limit exceeded"); 1341 audit_log_lost("backlog limit exceeded");
1237 audit_backlog_wait_time = audit_backlog_wait_overflow; 1342 audit_backlog_wait_time = audit_backlog_wait_overflow;
1238 wake_up(&audit_backlog_wait); 1343 wake_up(&audit_backlog_wait);
1239 return NULL; 1344 return NULL;
1240 } 1345 }
1241 1346
1347 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
1348
1242 ab = audit_buffer_alloc(ctx, gfp_mask, type); 1349 ab = audit_buffer_alloc(ctx, gfp_mask, type);
1243 if (!ab) { 1350 if (!ab) {
1244 audit_log_lost("out of memory in audit_log_start"); 1351 audit_log_lost("out of memory in audit_log_start");
@@ -1356,7 +1463,6 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
1356 int i, avail, new_len; 1463 int i, avail, new_len;
1357 unsigned char *ptr; 1464 unsigned char *ptr;
1358 struct sk_buff *skb; 1465 struct sk_buff *skb;
1359 static const unsigned char *hex = "0123456789ABCDEF";
1360 1466
1361 if (!ab) 1467 if (!ab)
1362 return; 1468 return;
@@ -1374,10 +1480,8 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
1374 } 1480 }
1375 1481
1376 ptr = skb_tail_pointer(skb); 1482 ptr = skb_tail_pointer(skb);
1377 for (i=0; i<len; i++) { 1483 for (i = 0; i < len; i++)
1378 *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ 1484 ptr = hex_byte_pack_upper(ptr, buf[i]);
1379 *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
1380 }
1381 *ptr = 0; 1485 *ptr = 0;
1382 skb_put(skb, len << 1); /* new string is twice the old string */ 1486 skb_put(skb, len << 1); /* new string is twice the old string */
1383} 1487}
@@ -1491,7 +1595,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1491 1595
1492void audit_log_session_info(struct audit_buffer *ab) 1596void audit_log_session_info(struct audit_buffer *ab)
1493{ 1597{
1494 u32 sessionid = audit_get_sessionid(current); 1598 unsigned int sessionid = audit_get_sessionid(current);
1495 uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); 1599 uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
1496 1600
1497 audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); 1601 audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
@@ -1716,7 +1820,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1716 audit_log_format(ab, 1820 audit_log_format(ab,
1717 " ppid=%ld pid=%d auid=%u uid=%u gid=%u" 1821 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1718 " euid=%u suid=%u fsuid=%u" 1822 " euid=%u suid=%u fsuid=%u"
1719 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", 1823 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
1720 sys_getppid(), 1824 sys_getppid(),
1721 tsk->pid, 1825 tsk->pid,
1722 from_kuid(&init_user_ns, audit_get_loginuid(tsk)), 1826 from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
@@ -1728,7 +1832,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1728 from_kgid(&init_user_ns, cred->egid), 1832 from_kgid(&init_user_ns, cred->egid),
1729 from_kgid(&init_user_ns, cred->sgid), 1833 from_kgid(&init_user_ns, cred->sgid),
1730 from_kgid(&init_user_ns, cred->fsgid), 1834 from_kgid(&init_user_ns, cred->fsgid),
1731 audit_get_sessionid(tsk), tty); 1835 tty, audit_get_sessionid(tsk));
1732 1836
1733 get_task_comm(name, tsk); 1837 get_task_comm(name, tsk);
1734 audit_log_format(ab, " comm="); 1838 audit_log_format(ab, " comm=");
@@ -1739,7 +1843,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1739 if (mm->exe_file) 1843 if (mm->exe_file)
1740 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); 1844 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1741 up_read(&mm->mmap_sem); 1845 up_read(&mm->mmap_sem);
1742 } 1846 } else
1847 audit_log_format(ab, " exe=(null)");
1743 audit_log_task_context(ab); 1848 audit_log_task_context(ab);
1744} 1849}
1745EXPORT_SYMBOL(audit_log_task_info); 1850EXPORT_SYMBOL(audit_log_task_info);
diff --git a/kernel/audit.h b/kernel/audit.h
index b779642b29af..57cc64d67718 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -209,7 +209,7 @@ struct audit_context {
209#endif 209#endif
210}; 210};
211 211
212extern int audit_ever_enabled; 212extern u32 audit_ever_enabled;
213 213
214extern void audit_copy_inode(struct audit_names *name, 214extern void audit_copy_inode(struct audit_names *name,
215 const struct dentry *dentry, 215 const struct dentry *dentry,
@@ -240,18 +240,23 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
240extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); 240extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
241extern int parent_len(const char *path); 241extern int parent_len(const char *path);
242extern int audit_compare_dname_path(const char *dname, const char *path, int plen); 242extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
243extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 243extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
244 int done, int multi, 244 int done, int multi,
245 const void *payload, int size); 245 const void *payload, int size);
246extern void audit_panic(const char *message); 246extern void audit_panic(const char *message);
247 247
248struct audit_netlink_list { 248struct audit_netlink_list {
249 int pid; 249 __u32 portid;
250 pid_t pid;
250 struct sk_buff_head q; 251 struct sk_buff_head q;
251}; 252};
252 253
253int audit_send_list(void *); 254int audit_send_list(void *);
254 255
256struct audit_net {
257 struct sock *nlsk;
258};
259
255extern int selinux_audit_rule_update(void); 260extern int selinux_audit_rule_update(void);
256 261
257extern struct mutex audit_filter_mutex; 262extern struct mutex audit_filter_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 43c307dc9453..67ccf0e7cca9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk)
912} 912}
913 913
914static int audit_tree_handle_event(struct fsnotify_group *group, 914static int audit_tree_handle_event(struct fsnotify_group *group,
915 struct inode *to_tell,
915 struct fsnotify_mark *inode_mark, 916 struct fsnotify_mark *inode_mark,
916 struct fsnotify_mark *vfsmonut_mark, 917 struct fsnotify_mark *vfsmount_mark,
917 struct fsnotify_event *event) 918 u32 mask, void *data, int data_type,
919 const unsigned char *file_name)
918{ 920{
919 BUG(); 921 return 0;
920 return -EOPNOTSUPP;
921} 922}
922 923
923static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) 924static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
@@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
933 BUG_ON(atomic_read(&entry->refcnt) < 1); 934 BUG_ON(atomic_read(&entry->refcnt) < 1);
934} 935}
935 936
936static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
937 struct fsnotify_mark *inode_mark,
938 struct fsnotify_mark *vfsmount_mark,
939 __u32 mask, void *data, int data_type)
940{
941 return false;
942}
943
944static const struct fsnotify_ops audit_tree_ops = { 937static const struct fsnotify_ops audit_tree_ops = {
945 .handle_event = audit_tree_handle_event, 938 .handle_event = audit_tree_handle_event,
946 .should_send_event = audit_tree_send_event,
947 .free_group_priv = NULL,
948 .free_event_priv = NULL,
949 .freeing_mark = audit_tree_freeing_mark, 939 .freeing_mark = audit_tree_freeing_mark,
950}; 940};
951 941
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 22831c4d369c..2596fac5dcb4 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule)
465 } 465 }
466} 466}
467 467
468static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
469 struct fsnotify_mark *inode_mark,
470 struct fsnotify_mark *vfsmount_mark,
471 __u32 mask, void *data, int data_type)
472{
473 return true;
474}
475
476/* Update watch data in audit rules based on fsnotify events. */ 468/* Update watch data in audit rules based on fsnotify events. */
477static int audit_watch_handle_event(struct fsnotify_group *group, 469static int audit_watch_handle_event(struct fsnotify_group *group,
470 struct inode *to_tell,
478 struct fsnotify_mark *inode_mark, 471 struct fsnotify_mark *inode_mark,
479 struct fsnotify_mark *vfsmount_mark, 472 struct fsnotify_mark *vfsmount_mark,
480 struct fsnotify_event *event) 473 u32 mask, void *data, int data_type,
474 const unsigned char *dname)
481{ 475{
482 struct inode *inode; 476 struct inode *inode;
483 __u32 mask = event->mask;
484 const char *dname = event->file_name;
485 struct audit_parent *parent; 477 struct audit_parent *parent;
486 478
487 parent = container_of(inode_mark, struct audit_parent, mark); 479 parent = container_of(inode_mark, struct audit_parent, mark);
488 480
489 BUG_ON(group != audit_watch_group); 481 BUG_ON(group != audit_watch_group);
490 482
491 switch (event->data_type) { 483 switch (data_type) {
492 case (FSNOTIFY_EVENT_PATH): 484 case (FSNOTIFY_EVENT_PATH):
493 inode = event->path.dentry->d_inode; 485 inode = ((struct path *)data)->dentry->d_inode;
494 break; 486 break;
495 case (FSNOTIFY_EVENT_INODE): 487 case (FSNOTIFY_EVENT_INODE):
496 inode = event->inode; 488 inode = (struct inode *)data;
497 break; 489 break;
498 default: 490 default:
499 BUG(); 491 BUG();
@@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
512} 504}
513 505
514static const struct fsnotify_ops audit_watch_fsnotify_ops = { 506static const struct fsnotify_ops audit_watch_fsnotify_ops = {
515 .should_send_event = audit_watch_should_send_event,
516 .handle_event = audit_watch_handle_event, 507 .handle_event = audit_watch_handle_event,
517 .free_group_priv = NULL,
518 .freeing_mark = NULL,
519 .free_event_priv = NULL,
520}; 508};
521 509
522static int __init audit_watch_init(void) 510static int __init audit_watch_init(void)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 51f3fd4c1ed3..14a78cca384e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -972,7 +972,7 @@ out:
972} 972}
973 973
974/* List rules using struct audit_rule_data. */ 974/* List rules using struct audit_rule_data. */
975static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) 975static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
976{ 976{
977 struct sk_buff *skb; 977 struct sk_buff *skb;
978 struct audit_krule *r; 978 struct audit_krule *r;
@@ -987,14 +987,15 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
987 data = audit_krule_to_data(r); 987 data = audit_krule_to_data(r);
988 if (unlikely(!data)) 988 if (unlikely(!data))
989 break; 989 break;
990 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 990 skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
991 data, sizeof(*data) + data->buflen); 991 0, 1, data,
992 sizeof(*data) + data->buflen);
992 if (skb) 993 if (skb)
993 skb_queue_tail(q, skb); 994 skb_queue_tail(q, skb);
994 kfree(data); 995 kfree(data);
995 } 996 }
996 } 997 }
997 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); 998 skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
998 if (skb) 999 if (skb)
999 skb_queue_tail(q, skb); 1000 skb_queue_tail(q, skb);
1000} 1001}
@@ -1004,7 +1005,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
1004{ 1005{
1005 struct audit_buffer *ab; 1006 struct audit_buffer *ab;
1006 uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); 1007 uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
1007 u32 sessionid = audit_get_sessionid(current); 1008 unsigned int sessionid = audit_get_sessionid(current);
1008 1009
1009 if (!audit_enabled) 1010 if (!audit_enabled)
1010 return; 1011 return;
@@ -1022,45 +1023,20 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
1022} 1023}
1023 1024
1024/** 1025/**
1025 * audit_receive_filter - apply all rules to the specified message type 1026 * audit_rule_change - apply all rules to the specified message type
1026 * @type: audit message type 1027 * @type: audit message type
1027 * @pid: target pid for netlink audit messages 1028 * @portid: target port id for netlink audit messages
1028 * @seq: netlink audit message sequence (serial) number 1029 * @seq: netlink audit message sequence (serial) number
1029 * @data: payload data 1030 * @data: payload data
1030 * @datasz: size of payload data 1031 * @datasz: size of payload data
1031 */ 1032 */
1032int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) 1033int audit_rule_change(int type, __u32 portid, int seq, void *data,
1034 size_t datasz)
1033{ 1035{
1034 struct task_struct *tsk;
1035 struct audit_netlink_list *dest;
1036 int err = 0; 1036 int err = 0;
1037 struct audit_entry *entry; 1037 struct audit_entry *entry;
1038 1038
1039 switch (type) { 1039 switch (type) {
1040 case AUDIT_LIST_RULES:
1041 /* We can't just spew out the rules here because we might fill
1042 * the available socket buffer space and deadlock waiting for
1043 * auditctl to read from it... which isn't ever going to
1044 * happen if we're actually running in the context of auditctl
1045 * trying to _send_ the stuff */
1046
1047 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
1048 if (!dest)
1049 return -ENOMEM;
1050 dest->pid = pid;
1051 skb_queue_head_init(&dest->q);
1052
1053 mutex_lock(&audit_filter_mutex);
1054 audit_list_rules(pid, seq, &dest->q);
1055 mutex_unlock(&audit_filter_mutex);
1056
1057 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
1058 if (IS_ERR(tsk)) {
1059 skb_queue_purge(&dest->q);
1060 kfree(dest);
1061 err = PTR_ERR(tsk);
1062 }
1063 break;
1064 case AUDIT_ADD_RULE: 1040 case AUDIT_ADD_RULE:
1065 entry = audit_data_to_entry(data, datasz); 1041 entry = audit_data_to_entry(data, datasz);
1066 if (IS_ERR(entry)) 1042 if (IS_ERR(entry))
@@ -1087,6 +1063,44 @@ int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz)
1087 return err; 1063 return err;
1088} 1064}
1089 1065
1066/**
1067 * audit_list_rules_send - list the audit rules
1068 * @portid: target portid for netlink audit messages
1069 * @seq: netlink audit message sequence (serial) number
1070 */
1071int audit_list_rules_send(__u32 portid, int seq)
1072{
1073 struct task_struct *tsk;
1074 struct audit_netlink_list *dest;
1075 int err = 0;
1076
1077 /* We can't just spew out the rules here because we might fill
1078 * the available socket buffer space and deadlock waiting for
1079 * auditctl to read from it... which isn't ever going to
1080 * happen if we're actually running in the context of auditctl
1081 * trying to _send_ the stuff */
1082
1083 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
1084 if (!dest)
1085 return -ENOMEM;
1086 dest->portid = portid;
1087 dest->pid = task_pid_vnr(current);
1088 skb_queue_head_init(&dest->q);
1089
1090 mutex_lock(&audit_filter_mutex);
1091 audit_list_rules(portid, seq, &dest->q);
1092 mutex_unlock(&audit_filter_mutex);
1093
1094 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
1095 if (IS_ERR(tsk)) {
1096 skb_queue_purge(&dest->q);
1097 kfree(dest);
1098 err = PTR_ERR(tsk);
1099 }
1100
1101 return err;
1102}
1103
1090int audit_comparator(u32 left, u32 op, u32 right) 1104int audit_comparator(u32 left, u32 op, u32 right)
1091{ 1105{
1092 switch (op) { 1106 switch (op) {
@@ -1276,19 +1290,22 @@ int audit_filter_user(int type)
1276{ 1290{
1277 enum audit_state state = AUDIT_DISABLED; 1291 enum audit_state state = AUDIT_DISABLED;
1278 struct audit_entry *e; 1292 struct audit_entry *e;
1279 int ret = 1; 1293 int rc, ret;
1294
1295 ret = 1; /* Audit by default */
1280 1296
1281 rcu_read_lock(); 1297 rcu_read_lock();
1282 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { 1298 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
1283 if (audit_filter_user_rules(&e->rule, type, &state)) { 1299 rc = audit_filter_user_rules(&e->rule, type, &state);
1284 if (state == AUDIT_DISABLED) 1300 if (rc) {
1301 if (rc > 0 && state == AUDIT_DISABLED)
1285 ret = 0; 1302 ret = 0;
1286 break; 1303 break;
1287 } 1304 }
1288 } 1305 }
1289 rcu_read_unlock(); 1306 rcu_read_unlock();
1290 1307
1291 return ret; /* Audit by default */ 1308 return ret;
1292} 1309}
1293 1310
1294int audit_filter_type(int type) 1311int audit_filter_type(int type)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 90594c9f7552..10176cd5956a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1969,18 +1969,24 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
1969 int rc) 1969 int rc)
1970{ 1970{
1971 struct audit_buffer *ab; 1971 struct audit_buffer *ab;
1972 uid_t uid, ologinuid, nloginuid; 1972 uid_t uid, oldloginuid, loginuid;
1973
1974 if (!audit_enabled)
1975 return;
1973 1976
1974 uid = from_kuid(&init_user_ns, task_uid(current)); 1977 uid = from_kuid(&init_user_ns, task_uid(current));
1975 ologinuid = from_kuid(&init_user_ns, koldloginuid); 1978 oldloginuid = from_kuid(&init_user_ns, koldloginuid);
1976 nloginuid = from_kuid(&init_user_ns, kloginuid), 1979 loginuid = from_kuid(&init_user_ns, kloginuid),
1977 1980
1978 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 1981 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1979 if (!ab) 1982 if (!ab)
1980 return; 1983 return;
1981 audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " 1984 audit_log_format(ab, "pid=%d uid=%u"
1982 "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, 1985 " old-auid=%u new-auid=%u old-ses=%u new-ses=%u"
1983 nloginuid, oldsessionid, sessionid, !rc); 1986 " res=%d",
1987 current->pid, uid,
1988 oldloginuid, loginuid, oldsessionid, sessionid,
1989 !rc);
1984 audit_log_end(ab); 1990 audit_log_end(ab);
1985} 1991}
1986 1992
@@ -2008,7 +2014,7 @@ int audit_set_loginuid(kuid_t loginuid)
2008 2014
2009 /* are we setting or clearing? */ 2015 /* are we setting or clearing? */
2010 if (uid_valid(loginuid)) 2016 if (uid_valid(loginuid))
2011 sessionid = atomic_inc_return(&session_id); 2017 sessionid = (unsigned int)atomic_inc_return(&session_id);
2012 2018
2013 task->sessionid = sessionid; 2019 task->sessionid = sessionid;
2014 task->loginuid = loginuid; 2020 task->loginuid = loginuid;
@@ -2321,18 +2327,16 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2321 2327
2322/** 2328/**
2323 * __audit_log_capset - store information about the arguments to the capset syscall 2329 * __audit_log_capset - store information about the arguments to the capset syscall
2324 * @pid: target pid of the capset call
2325 * @new: the new credentials 2330 * @new: the new credentials
2326 * @old: the old (current) credentials 2331 * @old: the old (current) credentials
2327 * 2332 *
2328 * Record the aguments userspace sent to sys_capset for later printing by the 2333 * Record the aguments userspace sent to sys_capset for later printing by the
2329 * audit system if applicable 2334 * audit system if applicable
2330 */ 2335 */
2331void __audit_log_capset(pid_t pid, 2336void __audit_log_capset(const struct cred *new, const struct cred *old)
2332 const struct cred *new, const struct cred *old)
2333{ 2337{
2334 struct audit_context *context = current->audit_context; 2338 struct audit_context *context = current->audit_context;
2335 context->capset.pid = pid; 2339 context->capset.pid = task_pid_nr(current);
2336 context->capset.cap.effective = new->cap_effective; 2340 context->capset.cap.effective = new->cap_effective;
2337 context->capset.cap.inheritable = new->cap_effective; 2341 context->capset.cap.inheritable = new->cap_effective;
2338 context->capset.cap.permitted = new->cap_permitted; 2342 context->capset.cap.permitted = new->cap_permitted;
@@ -2352,6 +2356,7 @@ static void audit_log_task(struct audit_buffer *ab)
2352 kuid_t auid, uid; 2356 kuid_t auid, uid;
2353 kgid_t gid; 2357 kgid_t gid;
2354 unsigned int sessionid; 2358 unsigned int sessionid;
2359 struct mm_struct *mm = current->mm;
2355 2360
2356 auid = audit_get_loginuid(current); 2361 auid = audit_get_loginuid(current);
2357 sessionid = audit_get_sessionid(current); 2362 sessionid = audit_get_sessionid(current);
@@ -2365,15 +2370,15 @@ static void audit_log_task(struct audit_buffer *ab)
2365 audit_log_task_context(ab); 2370 audit_log_task_context(ab);
2366 audit_log_format(ab, " pid=%d comm=", current->pid); 2371 audit_log_format(ab, " pid=%d comm=", current->pid);
2367 audit_log_untrustedstring(ab, current->comm); 2372 audit_log_untrustedstring(ab, current->comm);
2373 if (mm) {
2374 down_read(&mm->mmap_sem);
2375 if (mm->exe_file)
2376 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
2377 up_read(&mm->mmap_sem);
2378 } else
2379 audit_log_format(ab, " exe=(null)");
2368} 2380}
2369 2381
2370static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2371{
2372 audit_log_task(ab);
2373 audit_log_format(ab, " reason=");
2374 audit_log_string(ab, reason);
2375 audit_log_format(ab, " sig=%ld", signr);
2376}
2377/** 2382/**
2378 * audit_core_dumps - record information about processes that end abnormally 2383 * audit_core_dumps - record information about processes that end abnormally
2379 * @signr: signal value 2384 * @signr: signal value
@@ -2394,7 +2399,8 @@ void audit_core_dumps(long signr)
2394 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2399 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2395 if (unlikely(!ab)) 2400 if (unlikely(!ab))
2396 return; 2401 return;
2397 audit_log_abend(ab, "memory violation", signr); 2402 audit_log_task(ab);
2403 audit_log_format(ab, " sig=%ld", signr);
2398 audit_log_end(ab); 2404 audit_log_end(ab);
2399} 2405}
2400 2406
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 5253204afdca..9fd4246b04b8 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,6 @@ void foo(void)
22#ifdef CONFIG_SMP 22#ifdef CONFIG_SMP
23 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); 23 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
24#endif 24#endif
25 DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int)); 25 DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
26 /* End of constants */ 26 /* End of constants */
27} 27}
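bounds.c relies on the asm-offsets trick: each DEFINE() plants a marker string in the generated assembly, and the build turns those markers into #define lines in a header, which is how SPINLOCK_SIZE becomes a plain constant. A rough userspace approximation of the mechanism, assuming GCC/Clang extended asm (the real macro lives in include/linux/kbuild.h):

/* Compile with:  cc -S bounds_demo.c -o - | grep -F -- '->'
 * Each DEFINE() emits "->NAME <value> <expr>" into the assembly output;
 * a build script then rewrites such lines into "#define NAME <value>".
 */
#define DEFINE(sym, val) \
        asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))

struct demo_lock { int owner; int waiters; };

void foo(void)
{
        DEFINE(DEMO_LOCK_SIZE, sizeof(struct demo_lock));
}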
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e66bf9275b0..34019c57888d 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
277 if (ret < 0) 277 if (ret < 0)
278 goto error; 278 goto error;
279 279
280 audit_log_capset(pid, new, current_cred()); 280 audit_log_capset(new, current_cred());
281 281
282 return commit_creds(new); 282 return commit_creds(new);
283 283
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4c62513fe19f..e2f46ba37f72 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -41,7 +41,6 @@
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/seq_file.h>
45#include <linux/slab.h> 44#include <linux/slab.h>
46#include <linux/magic.h> 45#include <linux/magic.h>
47#include <linux/spinlock.h> 46#include <linux/spinlock.h>
@@ -56,15 +55,20 @@
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 58#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 59#include <linux/kthread.h>
63#include <linux/file.h>
64 60
65#include <linux/atomic.h> 61#include <linux/atomic.h>
66 62
67/* 63/*
64 * pidlists linger the following amount before being destroyed. The goal
65 * is avoiding frequent destruction in the middle of consecutive read calls
66 * Expiring in the middle is a performance problem not a correctness one.
67 * 1 sec should be enough.
68 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70
71/*
68 * cgroup_mutex is the master lock. Any modification to cgroup or its 72 * cgroup_mutex is the master lock. Any modification to cgroup or its
69 * hierarchy must be performed while holding it. 73 * hierarchy must be performed while holding it.
70 * 74 *
@@ -89,6 +93,33 @@ static DEFINE_MUTEX(cgroup_mutex);
89 93
90static DEFINE_MUTEX(cgroup_root_mutex); 94static DEFINE_MUTEX(cgroup_root_mutex);
91 95
96#define cgroup_assert_mutex_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \
98 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108
109/*
110 * cgroup destruction makes heavy use of work items and there can be a lot
111 * of concurrent destructions. Use a separate workqueue so that cgroup
112 * destruction work items don't end up filling up max_active of system_wq
113 * which may lead to deadlock.
114 */
115static struct workqueue_struct *cgroup_destroy_wq;
116
117/*
118 * pidlist destructions need to be flushed on cgroup destruction. Use a
119 * separate workqueue as flush domain.
120 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122
92/* 123/*
93 * Generate an array of cgroup subsystem pointers. At boot time, this is 124 * Generate an array of cgroup subsystem pointers. At boot time, this is
94 * populated with the built in subsystems, and modular subsystems are 125 * populated with the built in subsystems, and modular subsystems are
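The hunk above dedicates separate workqueues to cgroup and pidlist destruction so that a burst of teardown work cannot occupy all of system_wq's max_active slots and deadlock against other queued work. A minimal module-style sketch of that pattern, assuming the standard workqueue API of this kernel generation (alloc_workqueue/queue_work/destroy_workqueue); every name below is a placeholder:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_destroy_wq;

static void demo_destroy_fn(struct work_struct *work)
{
        /* potentially slow teardown runs here without tying up system_wq */
}

static DECLARE_WORK(demo_destroy_work, demo_destroy_fn);

static int __init demo_init(void)
{
        /* max_active = 1: destruction items are serialized on their own
         * queue instead of competing for system_wq's shared limit. */
        demo_destroy_wq = alloc_workqueue("demo_destroy", 0, 1);
        if (!demo_destroy_wq)
                return -ENOMEM;
        queue_work(demo_destroy_wq, &demo_destroy_work);
        return 0;
}

static void __exit demo_exit(void)
{
        destroy_workqueue(demo_destroy_wq);     /* drains pending work first */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");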
@@ -111,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
111/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
112static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
113 144
114/*
115 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
116 */
117struct cfent {
118 struct list_head node;
119 struct dentry *dentry;
120 struct cftype *type;
121 struct cgroup_subsys_state *css;
122
123 /* file xattrs */
124 struct simple_xattrs xattrs;
125};
126
127/*
128 * cgroup_event represents events which userspace want to receive.
129 */
130struct cgroup_event {
131 /*
132 * css which the event belongs to.
133 */
134 struct cgroup_subsys_state *css;
135 /*
136 * Control file which the event associated.
137 */
138 struct cftype *cft;
139 /*
140 * eventfd to signal userspace about the event.
141 */
142 struct eventfd_ctx *eventfd;
143 /*
144 * Each of these stored in a list by the cgroup.
145 */
146 struct list_head list;
147 /*
148 * All fields below needed to unregister event when
149 * userspace closes eventfd.
150 */
151 poll_table pt;
152 wait_queue_head_t *wqh;
153 wait_queue_t wait;
154 struct work_struct remove;
155};
156
157/* The list of hierarchy roots */ 145/* The list of hierarchy roots */
158 146
159static LIST_HEAD(cgroup_roots); 147static LIST_HEAD(cgroup_roots);
@@ -191,6 +179,8 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
191static int cgroup_destroy_locked(struct cgroup *cgrp); 179static int cgroup_destroy_locked(struct cgroup *cgrp);
192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
193 bool is_add); 181 bool is_add);
182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
194 184
195/** 185/**
196 * cgroup_css - obtain a cgroup's css for the specified subsystem 186 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -253,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp)
253} 243}
254 244
255/** 245/**
246 * for_each_css - iterate all css's of a cgroup
247 * @css: the iteration cursor
248 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
249 * @cgrp: the target cgroup to iterate css's of
250 *
251 * Should be called under cgroup_mutex.
252 */
253#define for_each_css(css, ssid, cgrp) \
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \
257 lockdep_is_held(&cgroup_mutex)))) { } \
258 else
259
260/**
256 * for_each_subsys - iterate all loaded cgroup subsystems 261 * for_each_subsys - iterate all loaded cgroup subsystems
257 * @ss: the iteration cursor 262 * @ss: the iteration cursor
258 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
259 * 264 *
260 * Should be called under cgroup_mutex. 265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
261 */ 267 */
262#define for_each_subsys(ss, i) \ 268#define for_each_subsys(ss, ssid) \
263 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ 269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
264 if (({ lockdep_assert_held(&cgroup_mutex); \ 270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
265 !((ss) = cgroup_subsys[i]); })) { } \ 271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
266 else 272 else
267 273
268/** 274/**
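for_each_css() above replaces the old per-root subsystem walk with an iteration over the css's actually attached to one cgroup, with the rcu_dereference_check() keyed to cgroup_mutex. A hypothetical caller, kept deliberately trivial (assumes cgroup_mutex is held, as the macro's comment requires):

/* sketch only: not part of the patch */
static void demo_list_css(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *css;
        int ssid;

        for_each_css(css, ssid, cgrp)
                pr_info("cgroup has a css for subsystem %s\n", css->ss->name);
}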
@@ -277,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
277 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ 283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
278 (((ss) = cgroup_subsys[i]) || true); (i)++) 284 (((ss) = cgroup_subsys[i]) || true); (i)++)
279 285
280/* iterate each subsystem attached to a hierarchy */
281#define for_each_root_subsys(root, ss) \
282 list_for_each_entry((ss), &(root)->subsys_list, sibling)
283
284/* iterate across the active hierarchies */ 286/* iterate across the active hierarchies */
285#define for_each_active_root(root) \ 287#define for_each_active_root(root) \
286 list_for_each_entry((root), &cgroup_roots, root_list) 288 list_for_each_entry((root), &cgroup_roots, root_list)
@@ -854,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work)
854 */ 856 */
855 deactivate_super(cgrp->root->sb); 857 deactivate_super(cgrp->root->sb);
856 858
857 /* 859 cgroup_pidlist_destroy_all(cgrp);
858 * if we're getting rid of the cgroup, refcount should ensure
859 * that there are no pidlists left.
860 */
861 BUG_ON(!list_empty(&cgrp->pidlists));
862 860
863 simple_xattrs_free(&cgrp->xattrs); 861 simple_xattrs_free(&cgrp->xattrs);
864 862
@@ -871,7 +869,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
871 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 869 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
872 870
873 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); 871 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
874 schedule_work(&cgrp->destroy_work); 872 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
875} 873}
876 874
877static void cgroup_diput(struct dentry *dentry, struct inode *inode) 875static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -881,6 +879,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
881 struct cgroup *cgrp = dentry->d_fsdata; 879 struct cgroup *cgrp = dentry->d_fsdata;
882 880
883 BUG_ON(!(cgroup_is_dead(cgrp))); 881 BUG_ON(!(cgroup_is_dead(cgrp)));
882
883 /*
884 * XXX: cgrp->id is only used to look up css's. As cgroup
885 * and css's lifetimes will be decoupled, it should be made
886 * per-subsystem and moved to css->id so that lookups are
887 * successful until the target css is released.
888 */
889 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
890 cgrp->id = -1;
891
884 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 892 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
885 } else { 893 } else {
886 struct cfent *cfe = __d_cfe(dentry); 894 struct cfent *cfe = __d_cfe(dentry);
@@ -1031,7 +1039,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1031 cgroup_css(cgroup_dummy_top, ss)); 1039 cgroup_css(cgroup_dummy_top, ss));
1032 cgroup_css(cgrp, ss)->cgroup = cgrp; 1040 cgroup_css(cgrp, ss)->cgroup = cgrp;
1033 1041
1034 list_move(&ss->sibling, &root->subsys_list);
1035 ss->root = root; 1042 ss->root = root;
1036 if (ss->bind) 1043 if (ss->bind)
1037 ss->bind(cgroup_css(cgrp, ss)); 1044 ss->bind(cgroup_css(cgrp, ss));
@@ -1050,7 +1057,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1050 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1057 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1051 1058
1052 cgroup_subsys[i]->root = &cgroup_dummy_root; 1059 cgroup_subsys[i]->root = &cgroup_dummy_root;
1053 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1054 1060
1055 /* subsystem is now free - drop reference on module */ 1061 /* subsystem is now free - drop reference on module */
1056 module_put(ss->module); 1062 module_put(ss->module);
@@ -1077,10 +1083,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1077{ 1083{
1078 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1084 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1079 struct cgroup_subsys *ss; 1085 struct cgroup_subsys *ss;
1086 int ssid;
1080 1087
1081 mutex_lock(&cgroup_root_mutex); 1088 mutex_lock(&cgroup_root_mutex);
1082 for_each_root_subsys(root, ss) 1089 for_each_subsys(ss, ssid)
1083 seq_printf(seq, ",%s", ss->name); 1090 if (root->subsys_mask & (1 << ssid))
1091 seq_printf(seq, ",%s", ss->name);
1084 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1092 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1085 seq_puts(seq, ",sane_behavior"); 1093 seq_puts(seq, ",sane_behavior");
1086 if (root->flags & CGRP_ROOT_NOPREFIX) 1094 if (root->flags & CGRP_ROOT_NOPREFIX)
@@ -1343,8 +1351,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1343 INIT_LIST_HEAD(&cgrp->pidlists); 1351 INIT_LIST_HEAD(&cgrp->pidlists);
1344 mutex_init(&cgrp->pidlist_mutex); 1352 mutex_init(&cgrp->pidlist_mutex);
1345 cgrp->dummy_css.cgroup = cgrp; 1353 cgrp->dummy_css.cgroup = cgrp;
1346 INIT_LIST_HEAD(&cgrp->event_list);
1347 spin_lock_init(&cgrp->event_list_lock);
1348 simple_xattrs_init(&cgrp->xattrs); 1354 simple_xattrs_init(&cgrp->xattrs);
1349} 1355}
1350 1356
@@ -1352,7 +1358,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1352{ 1358{
1353 struct cgroup *cgrp = &root->top_cgroup; 1359 struct cgroup *cgrp = &root->top_cgroup;
1354 1360
1355 INIT_LIST_HEAD(&root->subsys_list);
1356 INIT_LIST_HEAD(&root->root_list); 1361 INIT_LIST_HEAD(&root->root_list);
1357 root->number_of_cgroups = 1; 1362 root->number_of_cgroups = 1;
1358 cgrp->root = root; 1363 cgrp->root = root;
@@ -1674,7 +1679,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1674 return ERR_PTR(ret); 1679 return ERR_PTR(ret);
1675} 1680}
1676 1681
1677static void cgroup_kill_sb(struct super_block *sb) { 1682static void cgroup_kill_sb(struct super_block *sb)
1683{
1678 struct cgroupfs_root *root = sb->s_fs_info; 1684 struct cgroupfs_root *root = sb->s_fs_info;
1679 struct cgroup *cgrp = &root->top_cgroup; 1685 struct cgroup *cgrp = &root->top_cgroup;
1680 struct cgrp_cset_link *link, *tmp_link; 1686 struct cgrp_cset_link *link, *tmp_link;
@@ -1957,8 +1963,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1957 bool threadgroup) 1963 bool threadgroup)
1958{ 1964{
1959 int retval, i, group_size; 1965 int retval, i, group_size;
1960 struct cgroup_subsys *ss, *failed_ss = NULL;
1961 struct cgroupfs_root *root = cgrp->root; 1966 struct cgroupfs_root *root = cgrp->root;
1967 struct cgroup_subsys_state *css, *failed_css = NULL;
1962 /* threadgroup list cursor and array */ 1968 /* threadgroup list cursor and array */
1963 struct task_struct *leader = tsk; 1969 struct task_struct *leader = tsk;
1964 struct task_and_cgroup *tc; 1970 struct task_and_cgroup *tc;
@@ -2031,13 +2037,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2031 /* 2037 /*
2032 * step 1: check that we can legitimately attach to the cgroup. 2038 * step 1: check that we can legitimately attach to the cgroup.
2033 */ 2039 */
2034 for_each_root_subsys(root, ss) { 2040 for_each_css(css, i, cgrp) {
2035 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2041 if (css->ss->can_attach) {
2036 2042 retval = css->ss->can_attach(css, &tset);
2037 if (ss->can_attach) {
2038 retval = ss->can_attach(css, &tset);
2039 if (retval) { 2043 if (retval) {
2040 failed_ss = ss; 2044 failed_css = css;
2041 goto out_cancel_attach; 2045 goto out_cancel_attach;
2042 } 2046 }
2043 } 2047 }
@@ -2073,12 +2077,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2073 /* 2077 /*
2074 * step 4: do subsystem attach callbacks. 2078 * step 4: do subsystem attach callbacks.
2075 */ 2079 */
2076 for_each_root_subsys(root, ss) { 2080 for_each_css(css, i, cgrp)
2077 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2081 if (css->ss->attach)
2078 2082 css->ss->attach(css, &tset);
2079 if (ss->attach)
2080 ss->attach(css, &tset);
2081 }
2082 2083
2083 /* 2084 /*
2084 * step 5: success! and cleanup 2085 * step 5: success! and cleanup
@@ -2095,13 +2096,11 @@ out_put_css_set_refs:
2095 } 2096 }
2096out_cancel_attach: 2097out_cancel_attach:
2097 if (retval) { 2098 if (retval) {
2098 for_each_root_subsys(root, ss) { 2099 for_each_css(css, i, cgrp) {
2099 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2100 if (css == failed_css)
2100
2101 if (ss == failed_ss)
2102 break; 2101 break;
2103 if (ss->cancel_attach) 2102 if (css->ss->cancel_attach)
2104 ss->cancel_attach(css, &tset); 2103 css->ss->cancel_attach(css, &tset);
2105 } 2104 }
2106 } 2105 }
2107out_free_group_list: 2106out_free_group_list:
@@ -2129,7 +2128,7 @@ retry_find_task:
2129 tsk = find_task_by_vpid(pid); 2128 tsk = find_task_by_vpid(pid);
2130 if (!tsk) { 2129 if (!tsk) {
2131 rcu_read_unlock(); 2130 rcu_read_unlock();
2132 ret= -ESRCH; 2131 ret = -ESRCH;
2133 goto out_unlock_cgroup; 2132 goto out_unlock_cgroup;
2134 } 2133 }
2135 /* 2134 /*
@@ -2241,10 +2240,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2241 return 0; 2240 return 0;
2242} 2241}
2243 2242
2244static int cgroup_release_agent_show(struct cgroup_subsys_state *css, 2243static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2245 struct cftype *cft, struct seq_file *seq)
2246{ 2244{
2247 struct cgroup *cgrp = css->cgroup; 2245 struct cgroup *cgrp = seq_css(seq)->cgroup;
2248 2246
2249 if (!cgroup_lock_live_group(cgrp)) 2247 if (!cgroup_lock_live_group(cgrp))
2250 return -ENODEV; 2248 return -ENODEV;
@@ -2254,174 +2252,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2254 return 0; 2252 return 0;
2255} 2253}
2256 2254
2257static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, 2255static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2258 struct cftype *cft, struct seq_file *seq)
2259{ 2256{
2260 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); 2257 struct cgroup *cgrp = seq_css(seq)->cgroup;
2258
2259 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2261 return 0; 2260 return 0;
2262} 2261}
2263 2262
2264/* A buffer size big enough for numbers or short strings */ 2263/* A buffer size big enough for numbers or short strings */
2265#define CGROUP_LOCAL_BUFFER_SIZE 64 2264#define CGROUP_LOCAL_BUFFER_SIZE 64
2266 2265
2267static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, 2266static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2268 struct cftype *cft, struct file *file, 2267 size_t nbytes, loff_t *ppos)
2269 const char __user *userbuf, size_t nbytes,
2270 loff_t *unused_ppos)
2271{ 2268{
2272 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2269 struct cfent *cfe = __d_cfe(file->f_dentry);
2273 int retval = 0; 2270 struct cftype *cft = __d_cft(file->f_dentry);
2274 char *end; 2271 struct cgroup_subsys_state *css = cfe->css;
2272 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2273 char *buf;
2274 int ret;
2275 2275
2276 if (!nbytes) 2276 if (nbytes >= max_bytes)
2277 return -EINVAL;
2278 if (nbytes >= sizeof(buffer))
2279 return -E2BIG; 2277 return -E2BIG;
2280 if (copy_from_user(buffer, userbuf, nbytes))
2281 return -EFAULT;
2282 2278
2283 buffer[nbytes] = 0; /* nul-terminate */ 2279 buf = kmalloc(nbytes + 1, GFP_KERNEL);
2284 if (cft->write_u64) { 2280 if (!buf)
2285 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2281 return -ENOMEM;
2286 if (*end) 2282
2287 return -EINVAL; 2283 if (copy_from_user(buf, userbuf, nbytes)) {
2288 retval = cft->write_u64(css, cft, val); 2284 ret = -EFAULT;
2285 goto out_free;
2286 }
2287
2288 buf[nbytes] = '\0';
2289
2290 if (cft->write_string) {
2291 ret = cft->write_string(css, cft, strstrip(buf));
2292 } else if (cft->write_u64) {
2293 unsigned long long v;
2294 ret = kstrtoull(buf, 0, &v);
2295 if (!ret)
2296 ret = cft->write_u64(css, cft, v);
2297 } else if (cft->write_s64) {
2298 long long v;
2299 ret = kstrtoll(buf, 0, &v);
2300 if (!ret)
2301 ret = cft->write_s64(css, cft, v);
2302 } else if (cft->trigger) {
2303 ret = cft->trigger(css, (unsigned int)cft->private);
2289 } else { 2304 } else {
2290 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2305 ret = -EINVAL;
2291 if (*end)
2292 return -EINVAL;
2293 retval = cft->write_s64(css, cft, val);
2294 } 2306 }
2295 if (!retval) 2307out_free:
2296 retval = nbytes; 2308 kfree(buf);
2297 return retval; 2309 return ret ?: nbytes;
2298} 2310}
2299 2311
2300static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, 2312/*
2301 struct cftype *cft, struct file *file, 2313 * seqfile ops/methods for returning structured data. Currently just
2302 const char __user *userbuf, size_t nbytes, 2314 * supports string->u64 maps, but can be extended in future.
2303 loff_t *unused_ppos) 2315 */
2316
2317static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2304{ 2318{
2305 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2319 struct cftype *cft = seq_cft(seq);
2306 int retval = 0;
2307 size_t max_bytes = cft->max_write_len;
2308 char *buffer = local_buffer;
2309 2320
2310 if (!max_bytes) 2321 if (cft->seq_start) {
2311 max_bytes = sizeof(local_buffer) - 1; 2322 return cft->seq_start(seq, ppos);
2312 if (nbytes >= max_bytes) 2323 } else {
2313 return -E2BIG; 2324 /*
2314 /* Allocate a dynamic buffer if we need one */ 2325 * The same behavior and code as single_open(). Returns
2315 if (nbytes >= sizeof(local_buffer)) { 2326 * !NULL if pos is at the beginning; otherwise, NULL.
2316 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2327 */
2317 if (buffer == NULL) 2328 return NULL + !*ppos;
2318 return -ENOMEM;
2319 }
2320 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2321 retval = -EFAULT;
2322 goto out;
2323 } 2329 }
2324
2325 buffer[nbytes] = 0; /* nul-terminate */
2326 retval = cft->write_string(css, cft, strstrip(buffer));
2327 if (!retval)
2328 retval = nbytes;
2329out:
2330 if (buffer != local_buffer)
2331 kfree(buffer);
2332 return retval;
2333} 2330}
2334 2331
2335static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2332static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2336 size_t nbytes, loff_t *ppos)
2337{ 2333{
2338 struct cfent *cfe = __d_cfe(file->f_dentry); 2334 struct cftype *cft = seq_cft(seq);
2339 struct cftype *cft = __d_cft(file->f_dentry);
2340 struct cgroup_subsys_state *css = cfe->css;
2341 2335
2342 if (cft->write) 2336 if (cft->seq_next) {
2343 return cft->write(css, cft, file, buf, nbytes, ppos); 2337 return cft->seq_next(seq, v, ppos);
2344 if (cft->write_u64 || cft->write_s64) 2338 } else {
2345 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); 2339 /*
2346 if (cft->write_string) 2340 * The same behavior and code as single_open(), always
2347 return cgroup_write_string(css, cft, file, buf, nbytes, ppos); 2341 * terminate after the initial read.
2348 if (cft->trigger) { 2342 */
2349 int ret = cft->trigger(css, (unsigned int)cft->private); 2343 ++*ppos;
2350 return ret ? ret : nbytes; 2344 return NULL;
2351 } 2345 }
2352 return -EINVAL;
2353} 2346}
2354 2347
2355static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, 2348static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2356 struct cftype *cft, struct file *file,
2357 char __user *buf, size_t nbytes, loff_t *ppos)
2358{ 2349{
2359 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2350 struct cftype *cft = seq_cft(seq);
2360 u64 val = cft->read_u64(css, cft);
2361 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2362 2351
2363 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2352 if (cft->seq_stop)
2353 cft->seq_stop(seq, v);
2364} 2354}
2365 2355
2366static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, 2356static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2367 struct cftype *cft, struct file *file,
2368 char __user *buf, size_t nbytes, loff_t *ppos)
2369{ 2357{
2370 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2358 struct cftype *cft = seq_cft(m);
2371 s64 val = cft->read_s64(css, cft); 2359 struct cgroup_subsys_state *css = seq_css(m);
2372 int len = sprintf(tmp, "%lld\n", (long long) val);
2373 2360
2374 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2361 if (cft->seq_show)
2375} 2362 return cft->seq_show(m, arg);
2376 2363
2377static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2378 size_t nbytes, loff_t *ppos)
2379{
2380 struct cfent *cfe = __d_cfe(file->f_dentry);
2381 struct cftype *cft = __d_cft(file->f_dentry);
2382 struct cgroup_subsys_state *css = cfe->css;
2383
2384 if (cft->read)
2385 return cft->read(css, cft, file, buf, nbytes, ppos);
2386 if (cft->read_u64) 2364 if (cft->read_u64)
2387 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); 2365 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2388 if (cft->read_s64) 2366 else if (cft->read_s64)
2389 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); 2367 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2390 return -EINVAL; 2368 else
2391} 2369 return -EINVAL;
2392 2370 return 0;
2393/*
2394 * seqfile ops/methods for returning structured data. Currently just
2395 * supports string->u64 maps, but can be extended in future.
2396 */
2397
2398static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2399{
2400 struct seq_file *sf = cb->state;
2401 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2402}
2403
2404static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2405{
2406 struct cfent *cfe = m->private;
2407 struct cftype *cft = cfe->type;
2408 struct cgroup_subsys_state *css = cfe->css;
2409
2410 if (cft->read_map) {
2411 struct cgroup_map_cb cb = {
2412 .fill = cgroup_map_add,
2413 .state = m,
2414 };
2415 return cft->read_map(css, cft, &cb);
2416 }
2417 return cft->read_seq_string(css, cft, m);
2418} 2371}
2419 2372
2420static const struct file_operations cgroup_seqfile_operations = { 2373static struct seq_operations cgroup_seq_operations = {
2421 .read = seq_read, 2374 .start = cgroup_seqfile_start,
2422 .write = cgroup_file_write, 2375 .next = cgroup_seqfile_next,
2423 .llseek = seq_lseek, 2376 .stop = cgroup_seqfile_stop,
2424 .release = single_release, 2377 .show = cgroup_seqfile_show,
2425}; 2378};
2426 2379
2427static int cgroup_file_open(struct inode *inode, struct file *file) 2380static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2430,6 +2383,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2430 struct cftype *cft = __d_cft(file->f_dentry); 2383 struct cftype *cft = __d_cft(file->f_dentry);
2431 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2384 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2432 struct cgroup_subsys_state *css; 2385 struct cgroup_subsys_state *css;
2386 struct cgroup_open_file *of;
2433 int err; 2387 int err;
2434 2388
2435 err = generic_file_open(inode, file); 2389 err = generic_file_open(inode, file);
@@ -2459,30 +2413,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2459 WARN_ON_ONCE(cfe->css && cfe->css != css); 2413 WARN_ON_ONCE(cfe->css && cfe->css != css);
2460 cfe->css = css; 2414 cfe->css = css;
2461 2415
2462 if (cft->read_map || cft->read_seq_string) { 2416 of = __seq_open_private(file, &cgroup_seq_operations,
2463 file->f_op = &cgroup_seqfile_operations; 2417 sizeof(struct cgroup_open_file));
2464 err = single_open(file, cgroup_seqfile_show, cfe); 2418 if (of) {
2465 } else if (cft->open) { 2419 of->cfe = cfe;
2466 err = cft->open(inode, file); 2420 return 0;
2467 } 2421 }
2468 2422
2469 if (css->ss && err) 2423 if (css->ss)
2470 css_put(css); 2424 css_put(css);
2471 return err; 2425 return -ENOMEM;
2472} 2426}
2473 2427
2474static int cgroup_file_release(struct inode *inode, struct file *file) 2428static int cgroup_file_release(struct inode *inode, struct file *file)
2475{ 2429{
2476 struct cfent *cfe = __d_cfe(file->f_dentry); 2430 struct cfent *cfe = __d_cfe(file->f_dentry);
2477 struct cftype *cft = __d_cft(file->f_dentry);
2478 struct cgroup_subsys_state *css = cfe->css; 2431 struct cgroup_subsys_state *css = cfe->css;
2479 int ret = 0;
2480 2432
2481 if (cft->release)
2482 ret = cft->release(inode, file);
2483 if (css->ss) 2433 if (css->ss)
2484 css_put(css); 2434 css_put(css);
2485 return ret; 2435 return seq_release_private(inode, file);
2486} 2436}
2487 2437
2488/* 2438/*
@@ -2593,7 +2543,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2593} 2543}
2594 2544
2595static const struct file_operations cgroup_file_operations = { 2545static const struct file_operations cgroup_file_operations = {
2596 .read = cgroup_file_read, 2546 .read = seq_read,
2597 .write = cgroup_file_write, 2547 .write = cgroup_file_write,
2598 .llseek = generic_file_llseek, 2548 .llseek = generic_file_llseek,
2599 .open = cgroup_file_open, 2549 .open = cgroup_file_open,
@@ -2618,16 +2568,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2618 .removexattr = cgroup_removexattr, 2568 .removexattr = cgroup_removexattr,
2619}; 2569};
2620 2570
2621/*
2622 * Check if a file is a control file
2623 */
2624static inline struct cftype *__file_cft(struct file *file)
2625{
2626 if (file_inode(file)->i_fop != &cgroup_file_operations)
2627 return ERR_PTR(-EINVAL);
2628 return __d_cft(file->f_dentry);
2629}
2630
2631static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2571static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2632 struct super_block *sb) 2572 struct super_block *sb)
2633{ 2573{
@@ -2685,12 +2625,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2685 if (cft->mode) 2625 if (cft->mode)
2686 return cft->mode; 2626 return cft->mode;
2687 2627
2688 if (cft->read || cft->read_u64 || cft->read_s64 || 2628 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
2689 cft->read_map || cft->read_seq_string)
2690 mode |= S_IRUGO; 2629 mode |= S_IRUGO;
2691 2630
2692 if (cft->write || cft->write_u64 || cft->write_s64 || 2631 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
2693 cft->write_string || cft->trigger) 2632 cft->trigger)
2694 mode |= S_IWUSR; 2633 mode |= S_IWUSR;
2695 2634
2696 return mode; 2635 return mode;
@@ -2986,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void)
2986 * @parent_css: css whose children to walk 2925 * @parent_css: css whose children to walk
2987 * 2926 *
2988 * This function returns the next child of @parent_css and should be called 2927 * This function returns the next child of @parent_css and should be called
2989 * under RCU read lock. The only requirement is that @parent_css and 2928 * under either cgroup_mutex or RCU read lock. The only requirement is
2990 * @pos_css are accessible. The next sibling is guaranteed to be returned 2929 * that @parent_css and @pos_css are accessible. The next sibling is
2991 * regardless of their states. 2930 * guaranteed to be returned regardless of their states.
2992 */ 2931 */
2993struct cgroup_subsys_state * 2932struct cgroup_subsys_state *
2994css_next_child(struct cgroup_subsys_state *pos_css, 2933css_next_child(struct cgroup_subsys_state *pos_css,
@@ -2998,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2998 struct cgroup *cgrp = parent_css->cgroup; 2937 struct cgroup *cgrp = parent_css->cgroup;
2999 struct cgroup *next; 2938 struct cgroup *next;
3000 2939
3001 WARN_ON_ONCE(!rcu_read_lock_held()); 2940 cgroup_assert_mutex_or_rcu_locked();
3002 2941
3003 /* 2942 /*
3004 * @pos could already have been removed. Once a cgroup is removed, 2943 * @pos could already have been removed. Once a cgroup is removed,
@@ -3045,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child);
3045 * to visit for pre-order traversal of @root's descendants. @root is 2984 * to visit for pre-order traversal of @root's descendants. @root is
3046 * included in the iteration and the first node to be visited. 2985 * included in the iteration and the first node to be visited.
3047 * 2986 *
3048 * While this function requires RCU read locking, it doesn't require the 2987 * While this function requires cgroup_mutex or RCU read locking, it
3049 * whole traversal to be contained in a single RCU critical section. This 2988 * doesn't require the whole traversal to be contained in a single critical
3050 * function will return the correct next descendant as long as both @pos 2989 * section. This function will return the correct next descendant as long
3051 * and @root are accessible and @pos is a descendant of @root. 2990 * as both @pos and @root are accessible and @pos is a descendant of @root.
3052 */ 2991 */
3053struct cgroup_subsys_state * 2992struct cgroup_subsys_state *
3054css_next_descendant_pre(struct cgroup_subsys_state *pos, 2993css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -3056,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3056{ 2995{
3057 struct cgroup_subsys_state *next; 2996 struct cgroup_subsys_state *next;
3058 2997
3059 WARN_ON_ONCE(!rcu_read_lock_held()); 2998 cgroup_assert_mutex_or_rcu_locked();
3060 2999
3061 /* if first iteration, visit @root */ 3000 /* if first iteration, visit @root */
3062 if (!pos) 3001 if (!pos)
@@ -3087,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3087 * is returned. This can be used during pre-order traversal to skip 3026 * is returned. This can be used during pre-order traversal to skip
3088 * subtree of @pos. 3027 * subtree of @pos.
3089 * 3028 *
3090 * While this function requires RCU read locking, it doesn't require the 3029 * While this function requires cgroup_mutex or RCU read locking, it
3091 * whole traversal to be contained in a single RCU critical section. This 3030 * doesn't require the whole traversal to be contained in a single critical
3092 * function will return the correct rightmost descendant as long as @pos is 3031 * section. This function will return the correct rightmost descendant as
3093 * accessible. 3032 * long as @pos is accessible.
3094 */ 3033 */
3095struct cgroup_subsys_state * 3034struct cgroup_subsys_state *
3096css_rightmost_descendant(struct cgroup_subsys_state *pos) 3035css_rightmost_descendant(struct cgroup_subsys_state *pos)
3097{ 3036{
3098 struct cgroup_subsys_state *last, *tmp; 3037 struct cgroup_subsys_state *last, *tmp;
3099 3038
3100 WARN_ON_ONCE(!rcu_read_lock_held()); 3039 cgroup_assert_mutex_or_rcu_locked();
3101 3040
3102 do { 3041 do {
3103 last = pos; 3042 last = pos;
@@ -3133,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
3133 * to visit for post-order traversal of @root's descendants. @root is 3072 * to visit for post-order traversal of @root's descendants. @root is
3134 * included in the iteration and the last node to be visited. 3073 * included in the iteration and the last node to be visited.
3135 * 3074 *
3136 * While this function requires RCU read locking, it doesn't require the 3075 * While this function requires cgroup_mutex or RCU read locking, it
3137 * whole traversal to be contained in a single RCU critical section. This 3076 * doesn't require the whole traversal to be contained in a single critical
3138 * function will return the correct next descendant as long as both @pos 3077 * section. This function will return the correct next descendant as long
3139 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3078 * as both @pos and @cgroup are accessible and @pos is a descendant of
3079 * @cgroup.
3140 */ 3080 */
3141struct cgroup_subsys_state * 3081struct cgroup_subsys_state *
3142css_next_descendant_post(struct cgroup_subsys_state *pos, 3082css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -3144,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3144{ 3084{
3145 struct cgroup_subsys_state *next; 3085 struct cgroup_subsys_state *next;
3146 3086
3147 WARN_ON_ONCE(!rcu_read_lock_held()); 3087 cgroup_assert_mutex_or_rcu_locked();
3148 3088
3149 /* if first iteration, visit leftmost descendant which may be @root */ 3089 /* if first iteration, visit leftmost descendant which may be @root */
3150 if (!pos) 3090 if (!pos)
@@ -3483,14 +3423,12 @@ struct cgroup_pidlist {
3483 pid_t *list; 3423 pid_t *list;
3484 /* how many elements the above list has */ 3424 /* how many elements the above list has */
3485 int length; 3425 int length;
3486 /* how many files are using the current array */
3487 int use_count;
3488 /* each of these stored in a list by its cgroup */ 3426 /* each of these stored in a list by its cgroup */
3489 struct list_head links; 3427 struct list_head links;
3490 /* pointer to the cgroup we belong to, for list removal purposes */ 3428 /* pointer to the cgroup we belong to, for list removal purposes */
3491 struct cgroup *owner; 3429 struct cgroup *owner;
3492 /* protects the other fields */ 3430 /* for delayed destruction */
3493 struct rw_semaphore rwsem; 3431 struct delayed_work destroy_dwork;
3494}; 3432};
3495 3433
3496/* 3434/*
@@ -3506,6 +3444,7 @@ static void *pidlist_allocate(int count)
3506 else 3444 else
3507 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3445 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3508} 3446}
3447
3509static void pidlist_free(void *p) 3448static void pidlist_free(void *p)
3510{ 3449{
3511 if (is_vmalloc_addr(p)) 3450 if (is_vmalloc_addr(p))
@@ -3515,6 +3454,47 @@ static void pidlist_free(void *p)
3515} 3454}
3516 3455
3517/* 3456/*
3457 * Used to destroy all pidlists lingering waiting for destroy timer. None
3458 * should be left afterwards.
3459 */
3460static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3461{
3462 struct cgroup_pidlist *l, *tmp_l;
3463
3464 mutex_lock(&cgrp->pidlist_mutex);
3465 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3466 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3467 mutex_unlock(&cgrp->pidlist_mutex);
3468
3469 flush_workqueue(cgroup_pidlist_destroy_wq);
3470 BUG_ON(!list_empty(&cgrp->pidlists));
3471}
3472
3473static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3474{
3475 struct delayed_work *dwork = to_delayed_work(work);
3476 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3477 destroy_dwork);
3478 struct cgroup_pidlist *tofree = NULL;
3479
3480 mutex_lock(&l->owner->pidlist_mutex);
3481
3482 /*
3483 * Destroy iff we didn't get queued again. The state won't change
3484 * as destroy_dwork can only be queued while locked.
3485 */
3486 if (!delayed_work_pending(dwork)) {
3487 list_del(&l->links);
3488 pidlist_free(l->list);
3489 put_pid_ns(l->key.ns);
3490 tofree = l;
3491 }
3492
3493 mutex_unlock(&l->owner->pidlist_mutex);
3494 kfree(tofree);
3495}
3496
3497/*
3518 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3498 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3519 * Returns the number of unique elements. 3499 * Returns the number of unique elements.
3520 */ 3500 */
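cgroup_pidlist_destroy_work_fn() above frees a pidlist only if nobody re-armed destroy_dwork while the delay was running, which is what lets consecutive readers keep extending the one-second linger. A stripped-down sketch of that "destroy unless re-queued" idiom, assuming the delayed-work API used here (mod_delayed_work, delayed_work_pending, to_delayed_work); the structure, lock and queue names are placeholders and callers are expected to re-arm under the same lock:

#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_cache {
        struct mutex lock;
        struct delayed_work destroy_dwork;
        void *payload;
};

/* Users re-arm the timer on every access, while holding c->lock:
 *      mod_delayed_work(demo_wq, &c->destroy_dwork, HZ);
 */
static void demo_destroy_fn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct demo_cache *c = container_of(dwork, struct demo_cache,
                                            destroy_dwork);
        void *tofree = NULL;

        mutex_lock(&c->lock);
        /* Free only if no user re-armed the work since it fired; the
         * pending state cannot change while c->lock is held. */
        if (!delayed_work_pending(dwork)) {
                tofree = c->payload;
                c->payload = NULL;
        }
        mutex_unlock(&c->lock);
        kfree(tofree);
}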
@@ -3544,52 +3524,92 @@ after:
3544 return dest; 3524 return dest;
3545} 3525}
3546 3526
3527/*
3528 * The two pid files - task and cgroup.procs - guaranteed that the result
3529 * is sorted, which forced this whole pidlist fiasco. As pid order is
3530 * different per namespace, each namespace needs differently sorted list,
3531 * making it impossible to use, for example, single rbtree of member tasks
3532 * sorted by task pointer. As pidlists can be fairly large, allocating one
3533 * per open file is dangerous, so cgroup had to implement shared pool of
3534 * pidlists keyed by cgroup and namespace.
3535 *
3536 * All this extra complexity was caused by the original implementation
3537 * committing to an entirely unnecessary property. In the long term, we
3538 * want to do away with it. Explicitly scramble sort order if
3539 * sane_behavior so that no such expectation exists in the new interface.
3540 *
3541 * Scrambling is done by swapping every two consecutive bits, which is
3542 * non-identity one-to-one mapping which disturbs sort order sufficiently.
3543 */
3544static pid_t pid_fry(pid_t pid)
3545{
3546 unsigned a = pid & 0x55555555;
3547 unsigned b = pid & 0xAAAAAAAA;
3548
3549 return (a << 1) | (b >> 1);
3550}
3551
3552static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3553{
3554 if (cgroup_sane_behavior(cgrp))
3555 return pid_fry(pid);
3556 else
3557 return pid;
3558}
3559
3547static int cmppid(const void *a, const void *b) 3560static int cmppid(const void *a, const void *b)
3548{ 3561{
3549 return *(pid_t *)a - *(pid_t *)b; 3562 return *(pid_t *)a - *(pid_t *)b;
3550} 3563}
3551 3564
3565static int fried_cmppid(const void *a, const void *b)
3566{
3567 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3568}
3569
3570static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3571 enum cgroup_filetype type)
3572{
3573 struct cgroup_pidlist *l;
3574 /* don't need task_nsproxy() if we're looking at ourself */
3575 struct pid_namespace *ns = task_active_pid_ns(current);
3576
3577 lockdep_assert_held(&cgrp->pidlist_mutex);
3578
3579 list_for_each_entry(l, &cgrp->pidlists, links)
3580 if (l->key.type == type && l->key.ns == ns)
3581 return l;
3582 return NULL;
3583}
3584
3552/* 3585/*
3553 * find the appropriate pidlist for our purpose (given procs vs tasks) 3586 * find the appropriate pidlist for our purpose (given procs vs tasks)
3554 * returns with the lock on that pidlist already held, and takes care 3587 * returns with the lock on that pidlist already held, and takes care
3555 * of the use count, or returns NULL with no locks held if we're out of 3588 * of the use count, or returns NULL with no locks held if we're out of
3556 * memory. 3589 * memory.
3557 */ 3590 */
3558static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3591static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3559 enum cgroup_filetype type) 3592 enum cgroup_filetype type)
3560{ 3593{
3561 struct cgroup_pidlist *l; 3594 struct cgroup_pidlist *l;
3562 /* don't need task_nsproxy() if we're looking at ourself */
3563 struct pid_namespace *ns = task_active_pid_ns(current);
3564 3595
3565 /* 3596 lockdep_assert_held(&cgrp->pidlist_mutex);
3566 * We can't drop the pidlist_mutex before taking the l->rwsem in case 3597
3567 * the last ref-holder is trying to remove l from the list at the same 3598 l = cgroup_pidlist_find(cgrp, type);
3568 * time. Holding the pidlist_mutex precludes somebody taking whichever 3599 if (l)
3569 * list we find out from under us - compare release_pid_array(). 3600 return l;
3570 */ 3601
3571 mutex_lock(&cgrp->pidlist_mutex);
3572 list_for_each_entry(l, &cgrp->pidlists, links) {
3573 if (l->key.type == type && l->key.ns == ns) {
3574 /* make sure l doesn't vanish out from under us */
3575 down_write(&l->rwsem);
3576 mutex_unlock(&cgrp->pidlist_mutex);
3577 return l;
3578 }
3579 }
3580 /* entry not found; create a new one */ 3602 /* entry not found; create a new one */
3581 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3603 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3582 if (!l) { 3604 if (!l)
3583 mutex_unlock(&cgrp->pidlist_mutex);
3584 return l; 3605 return l;
3585 } 3606
3586 init_rwsem(&l->rwsem); 3607 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3587 down_write(&l->rwsem);
3588 l->key.type = type; 3608 l->key.type = type;
3589 l->key.ns = get_pid_ns(ns); 3609 /* don't need task_nsproxy() if we're looking at ourself */
3610 l->key.ns = get_pid_ns(task_active_pid_ns(current));
3590 l->owner = cgrp; 3611 l->owner = cgrp;
3591 list_add(&l->links, &cgrp->pidlists); 3612 list_add(&l->links, &cgrp->pidlists);
3592 mutex_unlock(&cgrp->pidlist_mutex);
3593 return l; 3613 return l;
3594} 3614}
3595 3615
@@ -3606,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3606 struct task_struct *tsk; 3626 struct task_struct *tsk;
3607 struct cgroup_pidlist *l; 3627 struct cgroup_pidlist *l;
3608 3628
3629 lockdep_assert_held(&cgrp->pidlist_mutex);
3630
3609 /* 3631 /*
3610 * If cgroup gets more users after we read count, we won't have 3632 * If cgroup gets more users after we read count, we won't have
3611 * enough space - tough. This race is indistinguishable to the 3633 * enough space - tough. This race is indistinguishable to the
@@ -3632,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3632 css_task_iter_end(&it); 3654 css_task_iter_end(&it);
3633 length = n; 3655 length = n;
3634 /* now sort & (if procs) strip out duplicates */ 3656 /* now sort & (if procs) strip out duplicates */
3635 sort(array, length, sizeof(pid_t), cmppid, NULL); 3657 if (cgroup_sane_behavior(cgrp))
3658 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3659 else
3660 sort(array, length, sizeof(pid_t), cmppid, NULL);
3636 if (type == CGROUP_FILE_PROCS) 3661 if (type == CGROUP_FILE_PROCS)
3637 length = pidlist_uniq(array, length); 3662 length = pidlist_uniq(array, length);
3638 l = cgroup_pidlist_find(cgrp, type); 3663
3664 l = cgroup_pidlist_find_create(cgrp, type);
3639 if (!l) { 3665 if (!l) {
3666 mutex_unlock(&cgrp->pidlist_mutex);
3640 pidlist_free(array); 3667 pidlist_free(array);
3641 return -ENOMEM; 3668 return -ENOMEM;
3642 } 3669 }
3643 /* store array, freeing old if necessary - lock already held */ 3670
3671 /* store array, freeing old if necessary */
3644 pidlist_free(l->list); 3672 pidlist_free(l->list);
3645 l->list = array; 3673 l->list = array;
3646 l->length = length; 3674 l->length = length;
3647 l->use_count++;
3648 up_write(&l->rwsem);
3649 *lp = l; 3675 *lp = l;
3650 return 0; 3676 return 0;
3651} 3677}
@@ -3719,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3719 * after a seek to the start). Use a binary-search to find the 3745 * after a seek to the start). Use a binary-search to find the
3720 * next pid to display, if any 3746 * next pid to display, if any
3721 */ 3747 */
3722 struct cgroup_pidlist *l = s->private; 3748 struct cgroup_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private;
3723 int index = 0, pid = *pos; 3752 int index = 0, pid = *pos;
3724 int *iter; 3753 int *iter, ret;
3754
3755 mutex_lock(&cgrp->pidlist_mutex);
3756
3757 /*
3758 * !NULL @of->priv indicates that this isn't the first start()
3759 * after open. If the matching pidlist is around, we can use that.
3760 * Look for it. Note that @of->priv can't be used directly. It
3761 * could already have been destroyed.
3762 */
3763 if (of->priv)
3764 of->priv = cgroup_pidlist_find(cgrp, type);
3765
3766 /*
3767 * Either this is the first start() after open or the matching
3768 * pidlist has been destroyed in between. Create a new one.
3769 */
3770 if (!of->priv) {
3771 ret = pidlist_array_load(cgrp, type,
3772 (struct cgroup_pidlist **)&of->priv);
3773 if (ret)
3774 return ERR_PTR(ret);
3775 }
3776 l = of->priv;
3725 3777
3726 down_read(&l->rwsem);
3727 if (pid) { 3778 if (pid) {
3728 int end = l->length; 3779 int end = l->length;
3729 3780
3730 while (index < end) { 3781 while (index < end) {
3731 int mid = (index + end) / 2; 3782 int mid = (index + end) / 2;
3732 if (l->list[mid] == pid) { 3783 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3733 index = mid; 3784 index = mid;
3734 break; 3785 break;
3735 } else if (l->list[mid] <= pid) 3786 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3736 index = mid + 1; 3787 index = mid + 1;
3737 else 3788 else
3738 end = mid; 3789 end = mid;
@@ -3743,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3743 return NULL; 3794 return NULL;
3744 /* Update the abstract position to be the actual pid that we found */ 3795 /* Update the abstract position to be the actual pid that we found */
3745 iter = l->list + index; 3796 iter = l->list + index;
3746 *pos = *iter; 3797 *pos = cgroup_pid_fry(cgrp, *iter);
3747 return iter; 3798 return iter;
3748} 3799}
3749 3800
3750static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3801static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3751{ 3802{
3752 struct cgroup_pidlist *l = s->private; 3803 struct cgroup_open_file *of = s->private;
3753 up_read(&l->rwsem); 3804 struct cgroup_pidlist *l = of->priv;
3805
3806 if (l)
3807 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3808 CGROUP_PIDLIST_DESTROY_DELAY);
3809 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3754} 3810}
3755 3811
3756static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3757{ 3813{
3758 struct cgroup_pidlist *l = s->private; 3814 struct cgroup_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv;
3759 pid_t *p = v; 3816 pid_t *p = v;
3760 pid_t *end = l->list + l->length; 3817 pid_t *end = l->list + l->length;
3761 /* 3818 /*
@@ -3766,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3766 if (p >= end) { 3823 if (p >= end) {
3767 return NULL; 3824 return NULL;
3768 } else { 3825 } else {
3769 *pos = *p; 3826 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3770 return p; 3827 return p;
3771 } 3828 }
3772} 3829}
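cgroup_pidlist_start() above resumes from the pid saved in *pos rather than from a remembered array index, because the pidlist may have been rebuilt (or destroyed and recreated) between reads; the binary search then relocates the cursor in whatever array is current. A userspace model of that resume-by-key idea (the array contents are made up):

#include <stdio.h>

/* Return the slot for `pos`: the element itself if still present, else the
 * next larger one, mirroring the binary search in cgroup_pidlist_start(). */
static int resume_index(const int *list, int length, int pos)
{
        int index = 0, end = length;

        while (index < end) {
                int mid = (index + end) / 2;

                if (list[mid] == pos)
                        return mid;
                else if (list[mid] < pos)
                        index = mid + 1;
                else
                        end = mid;
        }
        return index;
}

int main(void)
{
        int pids[] = { 3, 7, 12, 40, 41, 95 };  /* freshly rebuilt list */
        int n = sizeof(pids) / sizeof(pids[0]);
        int i;

        /* The reader last returned pid 12; it may be gone from the new
         * list, so restart at it or at the next larger pid. */
        for (i = resume_index(pids, n, 12); i < n; i++)
                printf("%d\n", pids[i]);
        return 0;
}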
@@ -3787,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
3787 .show = cgroup_pidlist_show, 3844 .show = cgroup_pidlist_show,
3788}; 3845};
3789 3846
3790static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3791{
3792 /*
3793 * the case where we're the last user of this particular pidlist will
3794 * have us remove it from the cgroup's list, which entails taking the
3795 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3796 * pidlist_mutex, we have to take pidlist_mutex first.
3797 */
3798 mutex_lock(&l->owner->pidlist_mutex);
3799 down_write(&l->rwsem);
3800 BUG_ON(!l->use_count);
3801 if (!--l->use_count) {
3802 /* we're the last user if refcount is 0; remove and free */
3803 list_del(&l->links);
3804 mutex_unlock(&l->owner->pidlist_mutex);
3805 pidlist_free(l->list);
3806 put_pid_ns(l->key.ns);
3807 up_write(&l->rwsem);
3808 kfree(l);
3809 return;
3810 }
3811 mutex_unlock(&l->owner->pidlist_mutex);
3812 up_write(&l->rwsem);
3813}
3814
3815static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3816{
3817 struct cgroup_pidlist *l;
3818 if (!(file->f_mode & FMODE_READ))
3819 return 0;
3820 /*
3821 * the seq_file will only be initialized if the file was opened for
3822 * reading; hence we check if it's not null only in that case.
3823 */
3824 l = ((struct seq_file *)file->private_data)->private;
3825 cgroup_release_pid_array(l);
3826 return seq_release(inode, file);
3827}
3828
3829static const struct file_operations cgroup_pidlist_operations = {
3830 .read = seq_read,
3831 .llseek = seq_lseek,
3832 .write = cgroup_file_write,
3833 .release = cgroup_pidlist_release,
3834};
3835
3836/*
3837 * The following functions handle opens on a file that displays a pidlist
3838 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3839 * in the cgroup.
3840 */
3841/* helper function for the two below it */
3842static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3843{
3844 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3845 struct cgroup_pidlist *l;
3846 int retval;
3847
3848 /* Nothing to do for write-only files */
3849 if (!(file->f_mode & FMODE_READ))
3850 return 0;
3851
3852 /* have the array populated */
3853 retval = pidlist_array_load(cgrp, type, &l);
3854 if (retval)
3855 return retval;
3856 /* configure file information */
3857 file->f_op = &cgroup_pidlist_operations;
3858
3859 retval = seq_open(file, &cgroup_pidlist_seq_operations);
3860 if (retval) {
3861 cgroup_release_pid_array(l);
3862 return retval;
3863 }
3864 ((struct seq_file *)file->private_data)->private = l;
3865 return 0;
3866}
3867static int cgroup_tasks_open(struct inode *unused, struct file *file)
3868{
3869 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3870}
3871static int cgroup_procs_open(struct inode *unused, struct file *file)
3872{
3873 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3874}
3875
3876static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3847static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3877 struct cftype *cft) 3848 struct cftype *cft)
3878{ 3849{
@@ -3907,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp)
3907 deactivate_super(sb); 3878 deactivate_super(sb);
3908} 3879}
3909 3880
3910/*
3911 * Unregister event and free resources.
3912 *
3913 * Gets called from workqueue.
3914 */
3915static void cgroup_event_remove(struct work_struct *work)
3916{
3917 struct cgroup_event *event = container_of(work, struct cgroup_event,
3918 remove);
3919 struct cgroup_subsys_state *css = event->css;
3920
3921 remove_wait_queue(event->wqh, &event->wait);
3922
3923 event->cft->unregister_event(css, event->cft, event->eventfd);
3924
3925 /* Notify userspace the event is going away. */
3926 eventfd_signal(event->eventfd, 1);
3927
3928 eventfd_ctx_put(event->eventfd);
3929 kfree(event);
3930 css_put(css);
3931}
3932
3933/*
3934 * Gets called on POLLHUP on eventfd when user closes it.
3935 *
3936 * Called with wqh->lock held and interrupts disabled.
3937 */
3938static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3939 int sync, void *key)
3940{
3941 struct cgroup_event *event = container_of(wait,
3942 struct cgroup_event, wait);
3943 struct cgroup *cgrp = event->css->cgroup;
3944 unsigned long flags = (unsigned long)key;
3945
3946 if (flags & POLLHUP) {
3947 /*
3948 * If the event has been detached at cgroup removal, we
3949 * can simply return knowing the other side will cleanup
3950 * for us.
3951 *
3952 * We can't race against event freeing since the other
3953 * side will require wqh->lock via remove_wait_queue(),
3954 * which we hold.
3955 */
3956 spin_lock(&cgrp->event_list_lock);
3957 if (!list_empty(&event->list)) {
3958 list_del_init(&event->list);
3959 /*
3960 * We are in atomic context, but cgroup_event_remove()
3961 * may sleep, so we have to call it in workqueue.
3962 */
3963 schedule_work(&event->remove);
3964 }
3965 spin_unlock(&cgrp->event_list_lock);
3966 }
3967
3968 return 0;
3969}
3970
3971static void cgroup_event_ptable_queue_proc(struct file *file,
3972 wait_queue_head_t *wqh, poll_table *pt)
3973{
3974 struct cgroup_event *event = container_of(pt,
3975 struct cgroup_event, pt);
3976
3977 event->wqh = wqh;
3978 add_wait_queue(wqh, &event->wait);
3979}
3980
3981/*
3982 * Parse input and register new cgroup event handler.
3983 *
3984 * Input must be in format '<event_fd> <control_fd> <args>'.
3985 * Interpretation of args is defined by control file implementation.
3986 */
3987static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3988 struct cftype *cft, const char *buffer)
3989{
3990 struct cgroup *cgrp = dummy_css->cgroup;
3991 struct cgroup_event *event;
3992 struct cgroup_subsys_state *cfile_css;
3993 unsigned int efd, cfd;
3994 struct fd efile;
3995 struct fd cfile;
3996 char *endp;
3997 int ret;
3998
3999 efd = simple_strtoul(buffer, &endp, 10);
4000 if (*endp != ' ')
4001 return -EINVAL;
4002 buffer = endp + 1;
4003
4004 cfd = simple_strtoul(buffer, &endp, 10);
4005 if ((*endp != ' ') && (*endp != '\0'))
4006 return -EINVAL;
4007 buffer = endp + 1;
4008
4009 event = kzalloc(sizeof(*event), GFP_KERNEL);
4010 if (!event)
4011 return -ENOMEM;
4012
4013 INIT_LIST_HEAD(&event->list);
4014 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
4015 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
4016 INIT_WORK(&event->remove, cgroup_event_remove);
4017
4018 efile = fdget(efd);
4019 if (!efile.file) {
4020 ret = -EBADF;
4021 goto out_kfree;
4022 }
4023
4024 event->eventfd = eventfd_ctx_fileget(efile.file);
4025 if (IS_ERR(event->eventfd)) {
4026 ret = PTR_ERR(event->eventfd);
4027 goto out_put_efile;
4028 }
4029
4030 cfile = fdget(cfd);
4031 if (!cfile.file) {
4032 ret = -EBADF;
4033 goto out_put_eventfd;
4034 }
4035
 4036 /* the process needs read permission on the control file */
4037 /* AV: shouldn't we check that it's been opened for read instead? */
4038 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4039 if (ret < 0)
4040 goto out_put_cfile;
4041
4042 event->cft = __file_cft(cfile.file);
4043 if (IS_ERR(event->cft)) {
4044 ret = PTR_ERR(event->cft);
4045 goto out_put_cfile;
4046 }
4047
4048 if (!event->cft->ss) {
4049 ret = -EBADF;
4050 goto out_put_cfile;
4051 }
4052
4053 /*
4054 * Determine the css of @cfile, verify it belongs to the same
4055 * cgroup as cgroup.event_control, and associate @event with it.
4056 * Remaining events are automatically removed on cgroup destruction
4057 * but the removal is asynchronous, so take an extra ref.
4058 */
4059 rcu_read_lock();
4060
4061 ret = -EINVAL;
4062 event->css = cgroup_css(cgrp, event->cft->ss);
4063 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4064 if (event->css && event->css == cfile_css && css_tryget(event->css))
4065 ret = 0;
4066
4067 rcu_read_unlock();
4068 if (ret)
4069 goto out_put_cfile;
4070
4071 if (!event->cft->register_event || !event->cft->unregister_event) {
4072 ret = -EINVAL;
4073 goto out_put_css;
4074 }
4075
4076 ret = event->cft->register_event(event->css, event->cft,
4077 event->eventfd, buffer);
4078 if (ret)
4079 goto out_put_css;
4080
4081 efile.file->f_op->poll(efile.file, &event->pt);
4082
4083 spin_lock(&cgrp->event_list_lock);
4084 list_add(&event->list, &cgrp->event_list);
4085 spin_unlock(&cgrp->event_list_lock);
4086
4087 fdput(cfile);
4088 fdput(efile);
4089
4090 return 0;
4091
4092out_put_css:
4093 css_put(event->css);
4094out_put_cfile:
4095 fdput(cfile);
4096out_put_eventfd:
4097 eventfd_ctx_put(event->eventfd);
4098out_put_efile:
4099 fdput(efile);
4100out_kfree:
4101 kfree(event);
4102
4103 return ret;
4104}
4105
4106static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4107 struct cftype *cft) 3882 struct cftype *cft)
4108{ 3883{
@@ -4122,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4122static struct cftype cgroup_base_files[] = { 3897static struct cftype cgroup_base_files[] = {
4123 { 3898 {
4124 .name = "cgroup.procs", 3899 .name = "cgroup.procs",
4125 .open = cgroup_procs_open, 3900 .seq_start = cgroup_pidlist_start,
3901 .seq_next = cgroup_pidlist_next,
3902 .seq_stop = cgroup_pidlist_stop,
3903 .seq_show = cgroup_pidlist_show,
3904 .private = CGROUP_FILE_PROCS,
4126 .write_u64 = cgroup_procs_write, 3905 .write_u64 = cgroup_procs_write,
4127 .release = cgroup_pidlist_release,
4128 .mode = S_IRUGO | S_IWUSR, 3906 .mode = S_IRUGO | S_IWUSR,
4129 }, 3907 },
4130 { 3908 {
4131 .name = "cgroup.event_control",
4132 .write_string = cgroup_write_event_control,
4133 .mode = S_IWUGO,
4134 },
4135 {
4136 .name = "cgroup.clone_children", 3909 .name = "cgroup.clone_children",
4137 .flags = CFTYPE_INSANE, 3910 .flags = CFTYPE_INSANE,
4138 .read_u64 = cgroup_clone_children_read, 3911 .read_u64 = cgroup_clone_children_read,
@@ -4141,7 +3914,7 @@ static struct cftype cgroup_base_files[] = {
4141 { 3914 {
4142 .name = "cgroup.sane_behavior", 3915 .name = "cgroup.sane_behavior",
4143 .flags = CFTYPE_ONLY_ON_ROOT, 3916 .flags = CFTYPE_ONLY_ON_ROOT,
4144 .read_seq_string = cgroup_sane_behavior_show, 3917 .seq_show = cgroup_sane_behavior_show,
4145 }, 3918 },
4146 3919
4147 /* 3920 /*
@@ -4152,9 +3925,12 @@ static struct cftype cgroup_base_files[] = {
4152 { 3925 {
4153 .name = "tasks", 3926 .name = "tasks",
4154 .flags = CFTYPE_INSANE, /* use "procs" instead */ 3927 .flags = CFTYPE_INSANE, /* use "procs" instead */
4155 .open = cgroup_tasks_open, 3928 .seq_start = cgroup_pidlist_start,
3929 .seq_next = cgroup_pidlist_next,
3930 .seq_stop = cgroup_pidlist_stop,
3931 .seq_show = cgroup_pidlist_show,
3932 .private = CGROUP_FILE_TASKS,
4156 .write_u64 = cgroup_tasks_write, 3933 .write_u64 = cgroup_tasks_write,
4157 .release = cgroup_pidlist_release,
4158 .mode = S_IRUGO | S_IWUSR, 3934 .mode = S_IRUGO | S_IWUSR,
4159 }, 3935 },
4160 { 3936 {
@@ -4166,7 +3942,7 @@ static struct cftype cgroup_base_files[] = {
4166 { 3942 {
4167 .name = "release_agent", 3943 .name = "release_agent",
4168 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
4169 .read_seq_string = cgroup_release_agent_show, 3945 .seq_show = cgroup_release_agent_show,
4170 .write_string = cgroup_release_agent_write, 3946 .write_string = cgroup_release_agent_write,
4171 .max_write_len = PATH_MAX, 3947 .max_write_len = PATH_MAX,
4172 }, 3948 },
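
The hunks above replace the old .open/.read_seq_string callbacks in cgroup_base_files[] with the seq_file-based .seq_show interface (plus the seq_start/next/stop trio for the pidlist files). For orientation, a minimal cftype written against the new interface could look like the sketch below; example_show() and "example.stat" are made-up names, and seq_css() is the helper this series uses to get from a seq_file back to its css.

/* Sketch only -- hypothetical controller file using .seq_show. */
static int example_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);

	seq_printf(sf, "cgroup id %d\n", css->cgroup->id);
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "example.stat",
		.seq_show = example_show,
	},
	{ }	/* terminate */
};
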
@@ -4249,7 +4025,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4249 * css_put(). dput() requires process context which we don't have. 4025 * css_put(). dput() requires process context which we don't have.
4250 */ 4026 */
4251 INIT_WORK(&css->destroy_work, css_free_work_fn); 4027 INIT_WORK(&css->destroy_work, css_free_work_fn);
4252 schedule_work(&css->destroy_work); 4028 queue_work(cgroup_destroy_wq, &css->destroy_work);
4253} 4029}
4254 4030
4255static void css_release(struct percpu_ref *ref) 4031static void css_release(struct percpu_ref *ref)
@@ -4257,6 +4033,7 @@ static void css_release(struct percpu_ref *ref)
4257 struct cgroup_subsys_state *css = 4033 struct cgroup_subsys_state *css =
4258 container_of(ref, struct cgroup_subsys_state, refcnt); 4034 container_of(ref, struct cgroup_subsys_state, refcnt);
4259 4035
4036 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
4260 call_rcu(&css->rcu_head, css_free_rcu_fn); 4037 call_rcu(&css->rcu_head, css_free_rcu_fn);
4261} 4038}
4262 4039
@@ -4311,6 +4088,62 @@ static void offline_css(struct cgroup_subsys_state *css)
4311 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4312} 4089}
4313 4090
4091/**
4092 * create_css - create a cgroup_subsys_state
4093 * @cgrp: the cgroup new css will be associated with
4094 * @ss: the subsys of new css
4095 *
4096 * Create a new css associated with @cgrp - @ss pair. On success, the new
4097 * css is online and installed in @cgrp with all interface files created.
4098 * Returns 0 on success, -errno on failure.
4099 */
4100static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4101{
4102 struct cgroup *parent = cgrp->parent;
4103 struct cgroup_subsys_state *css;
4104 int err;
4105
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex);
4108
4109 css = ss->css_alloc(cgroup_css(parent, ss));
4110 if (IS_ERR(css))
4111 return PTR_ERR(css);
4112
4113 err = percpu_ref_init(&css->refcnt, css_release);
4114 if (err)
4115 goto err_free;
4116
4117 init_css(css, ss, cgrp);
4118
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
4120 if (err)
4121 goto err_free;
4122
4123 err = online_css(css);
4124 if (err)
4125 goto err_free;
4126
4127 dget(cgrp->dentry);
4128 css_get(css->parent);
4129
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4133 current->comm, current->pid, ss->name);
4134 if (!strcmp(ss->name, "memory"))
4135 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4136 ss->warned_broken_hierarchy = true;
4137 }
4138
4139 return 0;
4140
4141err_free:
4142 percpu_ref_cancel_init(&css->refcnt);
4143 ss->css_free(css);
4144 return err;
4145}
4146
4314/* 4147/*
4315 * cgroup_create - create a cgroup 4148 * cgroup_create - create a cgroup
4316 * @parent: cgroup that will be parent of the new cgroup 4149 * @parent: cgroup that will be parent of the new cgroup
@@ -4322,11 +4155,10 @@ static void offline_css(struct cgroup_subsys_state *css)
4322static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4155static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4323 umode_t mode) 4156 umode_t mode)
4324{ 4157{
4325 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4326 struct cgroup *cgrp; 4158 struct cgroup *cgrp;
4327 struct cgroup_name *name; 4159 struct cgroup_name *name;
4328 struct cgroupfs_root *root = parent->root; 4160 struct cgroupfs_root *root = parent->root;
4329 int err = 0; 4161 int ssid, err = 0;
4330 struct cgroup_subsys *ss; 4162 struct cgroup_subsys *ss;
4331 struct super_block *sb = root->sb; 4163 struct super_block *sb = root->sb;
4332 4164
@@ -4382,23 +4214,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4382 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4214 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4383 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4215 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4384 4216
4385 for_each_root_subsys(root, ss) {
4386 struct cgroup_subsys_state *css;
4387
4388 css = ss->css_alloc(cgroup_css(parent, ss));
4389 if (IS_ERR(css)) {
4390 err = PTR_ERR(css);
4391 goto err_free_all;
4392 }
4393 css_ar[ss->subsys_id] = css;
4394
4395 err = percpu_ref_init(&css->refcnt, css_release);
4396 if (err)
4397 goto err_free_all;
4398
4399 init_css(css, ss, cgrp);
4400 }
4401
4402 /* 4217 /*
4403 * Create directory. cgroup_create_file() returns with the new 4218 * Create directory. cgroup_create_file() returns with the new
4404 * directory locked on success so that it can be populated without 4219 * directory locked on success so that it can be populated without
@@ -4406,7 +4221,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4406 */ 4221 */
4407 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 4222 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4408 if (err < 0) 4223 if (err < 0)
4409 goto err_free_all; 4224 goto err_unlock;
4410 lockdep_assert_held(&dentry->d_inode->i_mutex); 4225 lockdep_assert_held(&dentry->d_inode->i_mutex);
4411 4226
4412 cgrp->serial_nr = cgroup_serial_nr_next++; 4227 cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4415,59 +4230,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4415 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4230 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4416 root->number_of_cgroups++; 4231 root->number_of_cgroups++;
4417 4232
4418 /* each css holds a ref to the cgroup's dentry and the parent css */
4419 for_each_root_subsys(root, ss) {
4420 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4421
4422 dget(dentry);
4423 css_get(css->parent);
4424 }
4425
4426 /* hold a ref to the parent's dentry */ 4233 /* hold a ref to the parent's dentry */
4427 dget(parent->dentry); 4234 dget(parent->dentry);
4428 4235
4429 /* creation succeeded, notify subsystems */ 4236 /*
4430 for_each_root_subsys(root, ss) { 4237 * @cgrp is now fully operational. If something fails after this
4431 struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; 4238 * point, it'll be released via the normal destruction path.
4432 4239 */
4433 err = online_css(css);
4434 if (err)
4435 goto err_destroy;
4436
4437 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4438 parent->parent) {
4439 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4440 current->comm, current->pid, ss->name);
4441 if (!strcmp(ss->name, "memory"))
4442 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4443 ss->warned_broken_hierarchy = true;
4444 }
4445 }
4446
4447 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4240 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4448 4241
4449 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4242 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4450 if (err) 4243 if (err)
4451 goto err_destroy; 4244 goto err_destroy;
4452 4245
4453 err = cgroup_populate_dir(cgrp, root->subsys_mask); 4246 /* let's create and online css's */
4454 if (err) 4247 for_each_subsys(ss, ssid) {
4455 goto err_destroy; 4248 if (root->subsys_mask & (1 << ssid)) {
4249 err = create_css(cgrp, ss);
4250 if (err)
4251 goto err_destroy;
4252 }
4253 }
4456 4254
4457 mutex_unlock(&cgroup_mutex); 4255 mutex_unlock(&cgroup_mutex);
4458 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4256 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4459 4257
4460 return 0; 4258 return 0;
4461 4259
4462err_free_all: 4260err_unlock:
4463 for_each_root_subsys(root, ss) {
4464 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4465
4466 if (css) {
4467 percpu_ref_cancel_init(&css->refcnt);
4468 ss->css_free(css);
4469 }
4470 }
4471 mutex_unlock(&cgroup_mutex); 4261 mutex_unlock(&cgroup_mutex);
4472 /* Release the reference count that we took on the superblock */ 4262 /* Release the reference count that we took on the superblock */
4473 deactivate_super(sb); 4263 deactivate_super(sb);
@@ -4539,7 +4329,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4539 container_of(ref, struct cgroup_subsys_state, refcnt); 4329 container_of(ref, struct cgroup_subsys_state, refcnt);
4540 4330
4541 INIT_WORK(&css->destroy_work, css_killed_work_fn); 4331 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4542 schedule_work(&css->destroy_work); 4332 queue_work(cgroup_destroy_wq, &css->destroy_work);
4543} 4333}
4544 4334
4545/** 4335/**
@@ -4602,10 +4392,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4602 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4392 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4603{ 4393{
4604 struct dentry *d = cgrp->dentry; 4394 struct dentry *d = cgrp->dentry;
4605 struct cgroup_event *event, *tmp; 4395 struct cgroup_subsys_state *css;
4606 struct cgroup_subsys *ss;
4607 struct cgroup *child; 4396 struct cgroup *child;
4608 bool empty; 4397 bool empty;
4398 int ssid;
4609 4399
4610 lockdep_assert_held(&d->d_inode->i_mutex); 4400 lockdep_assert_held(&d->d_inode->i_mutex);
4611 lockdep_assert_held(&cgroup_mutex); 4401 lockdep_assert_held(&cgroup_mutex);
@@ -4641,8 +4431,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4641 * will be invoked to perform the rest of destruction once the 4431 * will be invoked to perform the rest of destruction once the
4642 * percpu refs of all css's are confirmed to be killed. 4432 * percpu refs of all css's are confirmed to be killed.
4643 */ 4433 */
4644 for_each_root_subsys(cgrp->root, ss) 4434 for_each_css(css, ssid, cgrp)
4645 kill_css(cgroup_css(cgrp, ss)); 4435 kill_css(css);
4646 4436
4647 /* 4437 /*
4648 * Mark @cgrp dead. This prevents further task migration and child 4438 * Mark @cgrp dead. This prevents further task migration and child
@@ -4677,18 +4467,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4677 dget(d); 4467 dget(d);
4678 cgroup_d_remove_dir(d); 4468 cgroup_d_remove_dir(d);
4679 4469
4680 /*
4681 * Unregister events and notify userspace.
4682 * Notify userspace about cgroup removing only after rmdir of cgroup
4683 * directory to avoid race between userspace and kernelspace.
4684 */
4685 spin_lock(&cgrp->event_list_lock);
4686 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4687 list_del_init(&event->list);
4688 schedule_work(&event->remove);
4689 }
4690 spin_unlock(&cgrp->event_list_lock);
4691
4692 return 0; 4470 return 0;
4693}; 4471};
4694 4472
@@ -4711,14 +4489,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4711 /* delete this cgroup from parent->children */ 4489 /* delete this cgroup from parent->children */
4712 list_del_rcu(&cgrp->sibling); 4490 list_del_rcu(&cgrp->sibling);
4713 4491
4714 /*
4715 * We should remove the cgroup object from idr before its grace
4716 * period starts, so we won't be looking up a cgroup while the
4717 * cgroup is being freed.
4718 */
4719 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4720 cgrp->id = -1;
4721
4722 dput(d); 4492 dput(d);
4723 4493
4724 set_bit(CGRP_RELEASABLE, &parent->flags); 4494 set_bit(CGRP_RELEASABLE, &parent->flags);
@@ -4767,7 +4537,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4767 cgroup_init_cftsets(ss); 4537 cgroup_init_cftsets(ss);
4768 4538
4769 /* Create the top cgroup state for this subsystem */ 4539 /* Create the top cgroup state for this subsystem */
4770 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4771 ss->root = &cgroup_dummy_root; 4540 ss->root = &cgroup_dummy_root;
4772 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4541 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4773 /* We don't handle early failures gracefully */ 4542 /* We don't handle early failures gracefully */
@@ -4841,6 +4610,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4841 cgroup_init_cftsets(ss); 4610 cgroup_init_cftsets(ss);
4842 4611
4843 mutex_lock(&cgroup_mutex); 4612 mutex_lock(&cgroup_mutex);
4613 mutex_lock(&cgroup_root_mutex);
4844 cgroup_subsys[ss->subsys_id] = ss; 4614 cgroup_subsys[ss->subsys_id] = ss;
4845 4615
4846 /* 4616 /*
@@ -4852,11 +4622,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4852 if (IS_ERR(css)) { 4622 if (IS_ERR(css)) {
4853 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4623 /* failure case - need to deassign the cgroup_subsys[] slot. */
4854 cgroup_subsys[ss->subsys_id] = NULL; 4624 cgroup_subsys[ss->subsys_id] = NULL;
4625 mutex_unlock(&cgroup_root_mutex);
4855 mutex_unlock(&cgroup_mutex); 4626 mutex_unlock(&cgroup_mutex);
4856 return PTR_ERR(css); 4627 return PTR_ERR(css);
4857 } 4628 }
4858 4629
4859 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4860 ss->root = &cgroup_dummy_root; 4630 ss->root = &cgroup_dummy_root;
4861 4631
4862 /* our new subsystem will be attached to the dummy hierarchy. */ 4632 /* our new subsystem will be attached to the dummy hierarchy. */
@@ -4886,14 +4656,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4886 write_unlock(&css_set_lock); 4656 write_unlock(&css_set_lock);
4887 4657
4888 ret = online_css(css); 4658 ret = online_css(css);
4889 if (ret) 4659 if (ret) {
4660 ss->css_free(css);
4890 goto err_unload; 4661 goto err_unload;
4662 }
4891 4663
4892 /* success! */ 4664 /* success! */
4665 mutex_unlock(&cgroup_root_mutex);
4893 mutex_unlock(&cgroup_mutex); 4666 mutex_unlock(&cgroup_mutex);
4894 return 0; 4667 return 0;
4895 4668
4896err_unload: 4669err_unload:
4670 mutex_unlock(&cgroup_root_mutex);
4897 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4898 /* @ss can't be mounted here as try_module_get() would fail */ 4672 /* @ss can't be mounted here as try_module_get() would fail */
4899 cgroup_unload_subsys(ss); 4673 cgroup_unload_subsys(ss);
@@ -4912,6 +4686,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4912void cgroup_unload_subsys(struct cgroup_subsys *ss) 4686void cgroup_unload_subsys(struct cgroup_subsys *ss)
4913{ 4687{
4914 struct cgrp_cset_link *link; 4688 struct cgrp_cset_link *link;
4689 struct cgroup_subsys_state *css;
4915 4690
4916 BUG_ON(ss->module == NULL); 4691 BUG_ON(ss->module == NULL);
4917 4692
@@ -4923,15 +4698,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4923 BUG_ON(ss->root != &cgroup_dummy_root); 4698 BUG_ON(ss->root != &cgroup_dummy_root);
4924 4699
4925 mutex_lock(&cgroup_mutex); 4700 mutex_lock(&cgroup_mutex);
4701 mutex_lock(&cgroup_root_mutex);
4926 4702
4927 offline_css(cgroup_css(cgroup_dummy_top, ss)); 4703 css = cgroup_css(cgroup_dummy_top, ss);
4704 if (css)
4705 offline_css(css);
4928 4706
4929 /* deassign the subsys_id */ 4707 /* deassign the subsys_id */
4930 cgroup_subsys[ss->subsys_id] = NULL; 4708 cgroup_subsys[ss->subsys_id] = NULL;
4931 4709
4932 /* remove subsystem from the dummy root's list of subsystems */
4933 list_del_init(&ss->sibling);
4934
4935 /* 4710 /*
4936 * disentangle the css from all css_sets attached to the dummy 4711 * disentangle the css from all css_sets attached to the dummy
4937 * top. as in loading, we need to pay our respects to the hashtable 4712 * top. as in loading, we need to pay our respects to the hashtable
@@ -4954,9 +4729,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4954 * need to free before marking as null because ss->css_free needs 4729 * need to free before marking as null because ss->css_free needs
4955 * the cgrp->subsys pointer to find their state. 4730 * the cgrp->subsys pointer to find their state.
4956 */ 4731 */
4957 ss->css_free(cgroup_css(cgroup_dummy_top, ss)); 4732 if (css)
4733 ss->css_free(css);
4958 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); 4734 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4959 4735
4736 mutex_unlock(&cgroup_root_mutex);
4960 mutex_unlock(&cgroup_mutex); 4737 mutex_unlock(&cgroup_mutex);
4961} 4738}
4962EXPORT_SYMBOL_GPL(cgroup_unload_subsys); 4739EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
@@ -5063,6 +4840,31 @@ out:
5063 return err; 4840 return err;
5064} 4841}
5065 4842
4843static int __init cgroup_wq_init(void)
4844{
4845 /*
4846 * There isn't much point in executing destruction path in
4847 * parallel. Good chunk is serialized with cgroup_mutex anyway.
4848 * Use 1 for @max_active.
4849 *
4850 * We would prefer to do this in cgroup_init() above, but that
4851 * is called before init_workqueues(): so leave this until after.
4852 */
4853 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4854 BUG_ON(!cgroup_destroy_wq);
4855
4856 /*
4857 * Used to destroy pidlists and separate to serve as flush domain.
4858 * Cap @max_active to 1 too.
4859 */
4860 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4861 0, 1);
4862 BUG_ON(!cgroup_pidlist_destroy_wq);
4863
4864 return 0;
4865}
4866core_initcall(cgroup_wq_init);
4867
5066/* 4868/*
5067 * proc_cgroup_show() 4869 * proc_cgroup_show()
5068 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4870 * - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -5102,11 +4904,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5102 for_each_active_root(root) { 4904 for_each_active_root(root) {
5103 struct cgroup_subsys *ss; 4905 struct cgroup_subsys *ss;
5104 struct cgroup *cgrp; 4906 struct cgroup *cgrp;
5105 int count = 0; 4907 int ssid, count = 0;
5106 4908
5107 seq_printf(m, "%d:", root->hierarchy_id); 4909 seq_printf(m, "%d:", root->hierarchy_id);
5108 for_each_root_subsys(root, ss) 4910 for_each_subsys(ss, ssid)
5109 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4911 if (root->subsys_mask & (1 << ssid))
4912 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
5110 if (strlen(root->name)) 4913 if (strlen(root->name))
5111 seq_printf(m, "%sname=%s", count ? "," : "", 4914 seq_printf(m, "%sname=%s", count ? "," : "",
5112 root->name); 4915 root->name);
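
proc_cgroup_show() now iterates every registered subsystem by id and filters on root->subsys_mask instead of walking the root's subsys_list. The for_each_subsys() iterator itself is not part of this hunk; given the cgroup_subsys[] array seen elsewhere in this patch, its shape is presumably something like the sketch below (details may differ from the real macro).

/* Presumed shape of the ssid-based iterator; NULL slots (unloaded
 * modular subsystems) are left for the caller to filter out, e.g.
 * via root->subsys_mask as above. */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[(ssid)]) || true); (ssid)++)
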
@@ -5447,16 +5250,16 @@ __setup("cgroup_disable=", cgroup_disable);
5447 * @dentry: directory dentry of interest 5250 * @dentry: directory dentry of interest
5448 * @ss: subsystem of interest 5251 * @ss: subsystem of interest
5449 * 5252 *
5450 * Must be called under RCU read lock. The caller is responsible for 5253 * Must be called under cgroup_mutex or RCU read lock. The caller is
5451 * pinning the returned css if it needs to be accessed outside the RCU 5254 * responsible for pinning the returned css if it needs to be accessed
5452 * critical section. 5255 * outside the critical section.
5453 */ 5256 */
5454struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 5257struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5455 struct cgroup_subsys *ss) 5258 struct cgroup_subsys *ss)
5456{ 5259{
5457 struct cgroup *cgrp; 5260 struct cgroup *cgrp;
5458 5261
5459 WARN_ON_ONCE(!rcu_read_lock_held()); 5262 cgroup_assert_mutex_or_rcu_locked();
5460 5263
5461 /* is @dentry a cgroup dir? */ 5264 /* is @dentry a cgroup dir? */
5462 if (!dentry->d_inode || 5265 if (!dentry->d_inode ||
@@ -5479,9 +5282,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5479{ 5282{
5480 struct cgroup *cgrp; 5283 struct cgroup *cgrp;
5481 5284
5482 rcu_lockdep_assert(rcu_read_lock_held() || 5285 cgroup_assert_mutex_or_rcu_locked();
5483 lockdep_is_held(&cgroup_mutex),
5484 "css_from_id() needs proper protection");
5485 5286
5486 cgrp = idr_find(&ss->root->cgroup_idr, id); 5287 cgrp = idr_find(&ss->root->cgroup_idr, id);
5487 if (cgrp) 5288 if (cgrp)
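
Both css_from_dir() and css_from_id() now rely on cgroup_assert_mutex_or_rcu_locked(). The helper's definition is not shown in this diff, but judging by the rcu_lockdep_assert() call it replaces, it presumably amounts to:

/* Presumed definition, reconstructed from the assertion it replaces
 * above; the message text is a guess. */
#define cgroup_assert_mutex_or_rcu_locked()				\
	rcu_lockdep_assert(rcu_read_lock_held() ||			\
			   lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");
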
@@ -5529,9 +5330,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5529 return count; 5330 return count;
5530} 5331}
5531 5332
5532static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, 5333static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5533 struct cftype *cft,
5534 struct seq_file *seq)
5535{ 5334{
5536 struct cgrp_cset_link *link; 5335 struct cgrp_cset_link *link;
5537 struct css_set *cset; 5336 struct css_set *cset;
@@ -5556,9 +5355,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5556} 5355}
5557 5356
5558#define MAX_TASKS_SHOWN_PER_CSS 25 5357#define MAX_TASKS_SHOWN_PER_CSS 25
5559static int cgroup_css_links_read(struct cgroup_subsys_state *css, 5358static int cgroup_css_links_read(struct seq_file *seq, void *v)
5560 struct cftype *cft, struct seq_file *seq)
5561{ 5359{
5360 struct cgroup_subsys_state *css = seq_css(seq);
5562 struct cgrp_cset_link *link; 5361 struct cgrp_cset_link *link;
5563 5362
5564 read_lock(&css_set_lock); 5363 read_lock(&css_set_lock);
@@ -5604,12 +5403,12 @@ static struct cftype debug_files[] = {
5604 5403
5605 { 5404 {
5606 .name = "current_css_set_cg_links", 5405 .name = "current_css_set_cg_links",
5607 .read_seq_string = current_css_set_cg_links_read, 5406 .seq_show = current_css_set_cg_links_read,
5608 }, 5407 },
5609 5408
5610 { 5409 {
5611 .name = "cgroup_css_links", 5410 .name = "cgroup_css_links",
5612 .read_seq_string = cgroup_css_links_read, 5411 .seq_show = cgroup_css_links_read,
5613 }, 5412 },
5614 5413
5615 { 5414 {
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f0ff64d0ebaa..6c3154e477f6 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -301,10 +301,9 @@ out_unlock:
301 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
302} 302}
303 303
304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, 304static int freezer_read(struct seq_file *m, void *v)
305 struct seq_file *m)
306{ 305{
307 struct cgroup_subsys_state *pos; 306 struct cgroup_subsys_state *css = seq_css(m), *pos;
308 307
309 rcu_read_lock(); 308 rcu_read_lock();
310 309
@@ -458,7 +457,7 @@ static struct cftype files[] = {
458 { 457 {
459 .name = "state", 458 .name = "state",
460 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
461 .read_seq_string = freezer_read, 460 .seq_show = freezer_read,
462 .write_string = freezer_write, 461 .write_string = freezer_write,
463 }, 462 },
464 { 463 {
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e5f3917aa05b..6cb20d2e7ee0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -53,10 +53,10 @@ void context_tracking_user_enter(void)
53 /* 53 /*
54 * Repeat the user_enter() check here because some archs may be calling 54 * Repeat the user_enter() check here because some archs may be calling
55 * this from asm and if no CPU needs context tracking, they shouldn't 55 * this from asm and if no CPU needs context tracking, they shouldn't
56 * go further. Repeat the check here until they support the static key 56 * go further. Repeat the check here until they support the inline static
57 * check. 57 * key check.
58 */ 58 */
59 if (!static_key_false(&context_tracking_enabled)) 59 if (!context_tracking_is_enabled())
60 return; 60 return;
61 61
62 /* 62 /*
@@ -160,7 +160,7 @@ void context_tracking_user_exit(void)
160{ 160{
161 unsigned long flags; 161 unsigned long flags;
162 162
163 if (!static_key_false(&context_tracking_enabled)) 163 if (!context_tracking_is_enabled())
164 return; 164 return;
165 165
166 if (in_interrupt()) 166 if (in_interrupt())
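
The open-coded static_key_false() tests are replaced by context_tracking_is_enabled(). The wrapper is defined outside this diff; presumably it is a thin static inline over the same static key, along the lines of:

/* Assumed definition (not part of this patch): keeps the static-key
 * fast path while giving the check a readable name. */
static inline bool context_tracking_is_enabled(void)
{
	return static_key_false(&context_tracking_enabled);
}
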
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 988573a9a387..277f494c2a9a 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -105,14 +105,17 @@ static void cpu_idle_loop(void)
105 __current_set_polling(); 105 __current_set_polling();
106 } 106 }
107 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
 110 * bit here because we might not have sent the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
115 } 108 }
109
110 /*
111 * Since we fell out of the loop above, we know
112 * TIF_NEED_RESCHED must be set, propagate it into
113 * PREEMPT_NEED_RESCHED.
114 *
115 * This is required because for polling idle loops we will
116 * not have had an IPI to fold the state for us.
117 */
118 preempt_set_need_resched();
116 tick_nohz_idle_exit(); 119 tick_nohz_idle_exit();
117 schedule_preempt_disabled(); 120 schedule_preempt_disabled();
118 } 121 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6bf981e13c43..4410ac6a55f1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1033 need_loop = task_has_mempolicy(tsk) || 1033 need_loop = task_has_mempolicy(tsk) ||
1034 !nodes_intersects(*newmems, tsk->mems_allowed); 1034 !nodes_intersects(*newmems, tsk->mems_allowed);
1035 1035
1036 if (need_loop) 1036 if (need_loop) {
1037 local_irq_disable();
1037 write_seqcount_begin(&tsk->mems_allowed_seq); 1038 write_seqcount_begin(&tsk->mems_allowed_seq);
1039 }
1038 1040
1039 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 1041 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1040 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 1042 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1042 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 1044 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1043 tsk->mems_allowed = *newmems; 1045 tsk->mems_allowed = *newmems;
1044 1046
1045 if (need_loop) 1047 if (need_loop) {
1046 write_seqcount_end(&tsk->mems_allowed_seq); 1048 write_seqcount_end(&tsk->mems_allowed_seq);
1049 local_irq_enable();
1050 }
1047 1051
1048 task_unlock(tsk); 1052 task_unlock(tsk);
1049} 1053}
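
The added local_irq_disable()/local_irq_enable() around the write_seqcount_begin()/end() pair presumably protects against a reader running from interrupt context on the same CPU, which would otherwise spin forever on the write section it interrupted. For reference, the read side of a seqcount such as mems_allowed_seq follows the usual retry loop; a generic sketch (the real readers live in the allocator/mempolicy paths):

/* Generic seqcount read-side sketch, not taken from this patch. */
static nodemask_t read_mems_allowed(struct task_struct *tsk)
{
	nodemask_t nodes;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&tsk->mems_allowed_seq);
		nodes = tsk->mems_allowed;
	} while (read_seqcount_retry(&tsk->mems_allowed_seq, seq));

	return nodes;
}
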
@@ -1727,66 +1731,41 @@ out_unlock:
1727 * used, list of ranges of sequential numbers, is variable length, 1731 * used, list of ranges of sequential numbers, is variable length,
1728 * and since these maps can change value dynamically, one could read 1732 * and since these maps can change value dynamically, one could read
1729 * gibberish by doing partial reads while a list was changing. 1733 * gibberish by doing partial reads while a list was changing.
1730 * A single large read to a buffer that crosses a page boundary is
1731 * ok, because the result being copied to user land is not recomputed
1732 * across a page fault.
1733 */ 1734 */
1734 1735static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1735static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1736{ 1736{
1737 size_t count; 1737 struct cpuset *cs = css_cs(seq_css(sf));
1738 1738 cpuset_filetype_t type = seq_cft(sf)->private;
1739 mutex_lock(&callback_mutex); 1739 ssize_t count;
1740 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1740 char *buf, *s;
1741 mutex_unlock(&callback_mutex); 1741 int ret = 0;
1742 1742
1743 return count; 1743 count = seq_get_buf(sf, &buf);
1744} 1744 s = buf;
1745
1746static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1747{
1748 size_t count;
1749 1745
1750 mutex_lock(&callback_mutex); 1746 mutex_lock(&callback_mutex);
1751 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1752 mutex_unlock(&callback_mutex);
1753
1754 return count;
1755}
1756
1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1758 struct cftype *cft, struct file *file,
1759 char __user *buf, size_t nbytes,
1760 loff_t *ppos)
1761{
1762 struct cpuset *cs = css_cs(css);
1763 cpuset_filetype_t type = cft->private;
1764 char *page;
1765 ssize_t retval = 0;
1766 char *s;
1767
1768 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1769 return -ENOMEM;
1770
1771 s = page;
1772 1747
1773 switch (type) { 1748 switch (type) {
1774 case FILE_CPULIST: 1749 case FILE_CPULIST:
1775 s += cpuset_sprintf_cpulist(s, cs); 1750 s += cpulist_scnprintf(s, count, cs->cpus_allowed);
1776 break; 1751 break;
1777 case FILE_MEMLIST: 1752 case FILE_MEMLIST:
1778 s += cpuset_sprintf_memlist(s, cs); 1753 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1779 break; 1754 break;
1780 default: 1755 default:
1781 retval = -EINVAL; 1756 ret = -EINVAL;
1782 goto out; 1757 goto out_unlock;
1783 } 1758 }
1784 *s++ = '\n';
1785 1759
1786 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); 1760 if (s < buf + count - 1) {
1787out: 1761 *s++ = '\n';
1788 free_page((unsigned long)page); 1762 seq_commit(sf, s - buf);
1789 return retval; 1763 } else {
1764 seq_commit(sf, -1);
1765 }
1766out_unlock:
1767 mutex_unlock(&callback_mutex);
1768 return ret;
1790} 1769}
1791 1770
1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 1771static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
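
The rewritten cpuset_common_seq_show() formats straight into the seq_file buffer via seq_get_buf() and signals overflow with seq_commit(sf, -1), which makes the seq_file core retry with a larger buffer. Stripped to its skeleton (example_list_show() and its output are hypothetical):

/* Skeleton of the seq_get_buf()/seq_commit() pattern used above;
 * assumes seq_get_buf() returned a non-empty buffer. */
static int example_list_show(struct seq_file *sf, void *v)
{
	char *buf;
	size_t count = seq_get_buf(sf, &buf);
	int len;

	len = scnprintf(buf, count, "%s\n", "0-3,5,7");
	if (len < count - 1)
		seq_commit(sf, len);	/* output fit, commit len bytes */
	else
		seq_commit(sf, -1);	/* overflow, retry with bigger buf */
	return 0;
}
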
@@ -1843,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1843static struct cftype files[] = { 1822static struct cftype files[] = {
1844 { 1823 {
1845 .name = "cpus", 1824 .name = "cpus",
1846 .read = cpuset_common_file_read, 1825 .seq_show = cpuset_common_seq_show,
1847 .write_string = cpuset_write_resmask, 1826 .write_string = cpuset_write_resmask,
1848 .max_write_len = (100U + 6 * NR_CPUS), 1827 .max_write_len = (100U + 6 * NR_CPUS),
1849 .private = FILE_CPULIST, 1828 .private = FILE_CPULIST,
@@ -1851,7 +1830,7 @@ static struct cftype files[] = {
1851 1830
1852 { 1831 {
1853 .name = "mems", 1832 .name = "mems",
1854 .read = cpuset_common_file_read, 1833 .seq_show = cpuset_common_seq_show,
1855 .write_string = cpuset_write_resmask, 1834 .write_string = cpuset_write_resmask,
1856 .max_write_len = (100U + 6 * MAX_NUMNODES), 1835 .max_write_len = (100U + 6 * MAX_NUMNODES),
1857 .private = FILE_MEMLIST, 1836 .private = FILE_MEMLIST,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d724e7757cd1..56003c6edfd3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -119,7 +119,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
119 119
120#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 120#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
121 PERF_FLAG_FD_OUTPUT |\ 121 PERF_FLAG_FD_OUTPUT |\
122 PERF_FLAG_PID_CGROUP) 122 PERF_FLAG_PID_CGROUP |\
123 PERF_FLAG_FD_CLOEXEC)
123 124
124/* 125/*
125 * branch priv levels that need permission checks 126 * branch priv levels that need permission checks
@@ -1396,6 +1397,8 @@ event_sched_out(struct perf_event *event,
1396 if (event->state != PERF_EVENT_STATE_ACTIVE) 1397 if (event->state != PERF_EVENT_STATE_ACTIVE)
1397 return; 1398 return;
1398 1399
1400 perf_pmu_disable(event->pmu);
1401
1399 event->state = PERF_EVENT_STATE_INACTIVE; 1402 event->state = PERF_EVENT_STATE_INACTIVE;
1400 if (event->pending_disable) { 1403 if (event->pending_disable) {
1401 event->pending_disable = 0; 1404 event->pending_disable = 0;
@@ -1412,6 +1415,8 @@ event_sched_out(struct perf_event *event,
1412 ctx->nr_freq--; 1415 ctx->nr_freq--;
1413 if (event->attr.exclusive || !cpuctx->active_oncpu) 1416 if (event->attr.exclusive || !cpuctx->active_oncpu)
1414 cpuctx->exclusive = 0; 1417 cpuctx->exclusive = 0;
1418
1419 perf_pmu_enable(event->pmu);
1415} 1420}
1416 1421
1417static void 1422static void
@@ -1652,6 +1657,7 @@ event_sched_in(struct perf_event *event,
1652 struct perf_event_context *ctx) 1657 struct perf_event_context *ctx)
1653{ 1658{
1654 u64 tstamp = perf_event_time(event); 1659 u64 tstamp = perf_event_time(event);
1660 int ret = 0;
1655 1661
1656 if (event->state <= PERF_EVENT_STATE_OFF) 1662 if (event->state <= PERF_EVENT_STATE_OFF)
1657 return 0; 1663 return 0;
@@ -1674,10 +1680,13 @@ event_sched_in(struct perf_event *event,
1674 */ 1680 */
1675 smp_wmb(); 1681 smp_wmb();
1676 1682
1683 perf_pmu_disable(event->pmu);
1684
1677 if (event->pmu->add(event, PERF_EF_START)) { 1685 if (event->pmu->add(event, PERF_EF_START)) {
1678 event->state = PERF_EVENT_STATE_INACTIVE; 1686 event->state = PERF_EVENT_STATE_INACTIVE;
1679 event->oncpu = -1; 1687 event->oncpu = -1;
1680 return -EAGAIN; 1688 ret = -EAGAIN;
1689 goto out;
1681 } 1690 }
1682 1691
1683 event->tstamp_running += tstamp - event->tstamp_stopped; 1692 event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1693,7 +1702,10 @@ event_sched_in(struct perf_event *event,
1693 if (event->attr.exclusive) 1702 if (event->attr.exclusive)
1694 cpuctx->exclusive = 1; 1703 cpuctx->exclusive = 1;
1695 1704
1696 return 0; 1705out:
1706 perf_pmu_enable(event->pmu);
1707
1708 return ret;
1697} 1709}
1698 1710
1699static int 1711static int
@@ -2743,6 +2755,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2743 if (!event_filter_match(event)) 2755 if (!event_filter_match(event))
2744 continue; 2756 continue;
2745 2757
2758 perf_pmu_disable(event->pmu);
2759
2746 hwc = &event->hw; 2760 hwc = &event->hw;
2747 2761
2748 if (hwc->interrupts == MAX_INTERRUPTS) { 2762 if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -2752,7 +2766,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2752 } 2766 }
2753 2767
2754 if (!event->attr.freq || !event->attr.sample_freq) 2768 if (!event->attr.freq || !event->attr.sample_freq)
2755 continue; 2769 goto next;
2756 2770
2757 /* 2771 /*
2758 * stop the event and update event->count 2772 * stop the event and update event->count
@@ -2774,6 +2788,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2774 perf_adjust_period(event, period, delta, false); 2788 perf_adjust_period(event, period, delta, false);
2775 2789
2776 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); 2790 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2791 next:
2792 perf_pmu_enable(event->pmu);
2777 } 2793 }
2778 2794
2779 perf_pmu_enable(ctx->pmu); 2795 perf_pmu_enable(ctx->pmu);
@@ -3527,7 +3543,7 @@ static void perf_event_for_each(struct perf_event *event,
3527static int perf_event_period(struct perf_event *event, u64 __user *arg) 3543static int perf_event_period(struct perf_event *event, u64 __user *arg)
3528{ 3544{
3529 struct perf_event_context *ctx = event->ctx; 3545 struct perf_event_context *ctx = event->ctx;
3530 int ret = 0; 3546 int ret = 0, active;
3531 u64 value; 3547 u64 value;
3532 3548
3533 if (!is_sampling_event(event)) 3549 if (!is_sampling_event(event))
@@ -3551,6 +3567,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
3551 event->attr.sample_period = value; 3567 event->attr.sample_period = value;
3552 event->hw.sample_period = value; 3568 event->hw.sample_period = value;
3553 } 3569 }
3570
3571 active = (event->state == PERF_EVENT_STATE_ACTIVE);
3572 if (active) {
3573 perf_pmu_disable(ctx->pmu);
3574 event->pmu->stop(event, PERF_EF_UPDATE);
3575 }
3576
3577 local64_set(&event->hw.period_left, 0);
3578
3579 if (active) {
3580 event->pmu->start(event, PERF_EF_RELOAD);
3581 perf_pmu_enable(ctx->pmu);
3582 }
3583
3554unlock: 3584unlock:
3555 raw_spin_unlock_irq(&ctx->lock); 3585 raw_spin_unlock_irq(&ctx->lock);
3556 3586
@@ -5680,11 +5710,6 @@ static void swevent_hlist_put(struct perf_event *event)
5680{ 5710{
5681 int cpu; 5711 int cpu;
5682 5712
5683 if (event->cpu != -1) {
5684 swevent_hlist_put_cpu(event, event->cpu);
5685 return;
5686 }
5687
5688 for_each_possible_cpu(cpu) 5713 for_each_possible_cpu(cpu)
5689 swevent_hlist_put_cpu(event, cpu); 5714 swevent_hlist_put_cpu(event, cpu);
5690} 5715}
@@ -5718,9 +5743,6 @@ static int swevent_hlist_get(struct perf_event *event)
5718 int err; 5743 int err;
5719 int cpu, failed_cpu; 5744 int cpu, failed_cpu;
5720 5745
5721 if (event->cpu != -1)
5722 return swevent_hlist_get_cpu(event, event->cpu);
5723
5724 get_online_cpus(); 5746 get_online_cpus();
5725 for_each_possible_cpu(cpu) { 5747 for_each_possible_cpu(cpu) {
5726 err = swevent_hlist_get_cpu(event, cpu); 5748 err = swevent_hlist_get_cpu(event, cpu);
@@ -6663,6 +6685,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6663 INIT_LIST_HEAD(&event->event_entry); 6685 INIT_LIST_HEAD(&event->event_entry);
6664 INIT_LIST_HEAD(&event->sibling_list); 6686 INIT_LIST_HEAD(&event->sibling_list);
6665 INIT_LIST_HEAD(&event->rb_entry); 6687 INIT_LIST_HEAD(&event->rb_entry);
6688 INIT_LIST_HEAD(&event->active_entry);
6689 INIT_HLIST_NODE(&event->hlist_entry);
6690
6666 6691
6667 init_waitqueue_head(&event->waitq); 6692 init_waitqueue_head(&event->waitq);
6668 init_irq_work(&event->pending, perf_pending_event); 6693 init_irq_work(&event->pending, perf_pending_event);
@@ -6973,6 +6998,7 @@ SYSCALL_DEFINE5(perf_event_open,
6973 int event_fd; 6998 int event_fd;
6974 int move_group = 0; 6999 int move_group = 0;
6975 int err; 7000 int err;
7001 int f_flags = O_RDWR;
6976 7002
6977 /* for future expandability... */ 7003 /* for future expandability... */
6978 if (flags & ~PERF_FLAG_ALL) 7004 if (flags & ~PERF_FLAG_ALL)
@@ -7001,7 +7027,10 @@ SYSCALL_DEFINE5(perf_event_open,
7001 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 7027 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7002 return -EINVAL; 7028 return -EINVAL;
7003 7029
7004 event_fd = get_unused_fd(); 7030 if (flags & PERF_FLAG_FD_CLOEXEC)
7031 f_flags |= O_CLOEXEC;
7032
7033 event_fd = get_unused_fd_flags(f_flags);
7005 if (event_fd < 0) 7034 if (event_fd < 0)
7006 return event_fd; 7035 return event_fd;
7007 7036
@@ -7123,7 +7152,8 @@ SYSCALL_DEFINE5(perf_event_open,
7123 goto err_context; 7152 goto err_context;
7124 } 7153 }
7125 7154
7126 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 7155 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
7156 f_flags);
7127 if (IS_ERR(event_file)) { 7157 if (IS_ERR(event_file)) {
7128 err = PTR_ERR(event_file); 7158 err = PTR_ERR(event_file);
7129 goto err_context; 7159 goto err_context;
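
The new PERF_FLAG_FD_CLOEXEC flag, threaded through get_unused_fd_flags() and anon_inode_getfile() above, lets userspace obtain an O_CLOEXEC event fd atomically instead of racing a separate fcntl(FD_CLOEXEC) against fork()+exec(). A hypothetical caller:

/* Hypothetical userspace usage; the fallback #define is only for
 * illustration in case the UAPI header predates the flag. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC (1UL << 3)
#endif

static int open_cycles_counter(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* The fd is installed with O_CLOEXEC already set. */
	return syscall(__NR_perf_event_open, &attr, pid, -1, -1,
		       PERF_FLAG_FD_CLOEXEC);
}
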
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index e8b168af135b..146a5792b1d2 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -61,19 +61,20 @@ again:
61 * 61 *
62 * kernel user 62 * kernel user
63 * 63 *
64 * READ ->data_tail READ ->data_head 64 * if (LOAD ->data_tail) { LOAD ->data_head
65 * smp_mb() (A) smp_rmb() (C) 65 * (A) smp_rmb() (C)
66 * WRITE $data READ $data 66 * STORE $data LOAD $data
67 * smp_wmb() (B) smp_mb() (D) 67 * smp_wmb() (B) smp_mb() (D)
68 * STORE ->data_head WRITE ->data_tail 68 * STORE ->data_head STORE ->data_tail
69 * }
69 * 70 *
70 * Where A pairs with D, and B pairs with C. 71 * Where A pairs with D, and B pairs with C.
71 * 72 *
72 * I don't think A needs to be a full barrier because we won't in fact 73 * In our case (A) is a control dependency that separates the load of
73 * write data until we see the store from userspace. So we simply don't 74 * the ->data_tail and the stores of $data. In case ->data_tail
74 * issue the data WRITE until we observe it. Be conservative for now. 75 * indicates there is no room in the buffer to store $data we do not.
75 * 76 *
76 * OTOH, D needs to be a full barrier since it separates the data READ 77 * D needs to be a full barrier since it separates the data READ
77 * from the tail WRITE. 78 * from the tail WRITE.
78 * 79 *
79 * For B a WMB is sufficient since it separates two WRITEs, and for C 80 * For B a WMB is sufficient since it separates two WRITEs, and for C
@@ -81,7 +82,7 @@ again:
81 * 82 *
82 * See perf_output_begin(). 83 * See perf_output_begin().
83 */ 84 */
84 smp_wmb(); 85 smp_wmb(); /* B, matches C */
85 rb->user_page->data_head = head; 86 rb->user_page->data_head = head;
86 87
87 /* 88 /*
@@ -144,17 +145,26 @@ int perf_output_begin(struct perf_output_handle *handle,
144 if (!rb->overwrite && 145 if (!rb->overwrite &&
145 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) 146 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
146 goto fail; 147 goto fail;
148
149 /*
150 * The above forms a control dependency barrier separating the
151 * @tail load above from the data stores below. Since the @tail
152 * load is required to compute the branch to fail below.
153 *
154 * A, matches D; the full memory barrier userspace SHOULD issue
155 * after reading the data and before storing the new tail
156 * position.
157 *
158 * See perf_output_put_handle().
159 */
160
147 head += size; 161 head += size;
148 } while (local_cmpxchg(&rb->head, offset, head) != offset); 162 } while (local_cmpxchg(&rb->head, offset, head) != offset);
149 163
150 /* 164 /*
151 * Separate the userpage->tail read from the data stores below. 165 * We rely on the implied barrier() by local_cmpxchg() to ensure
152 * Matches the MB userspace SHOULD issue after reading the data 166 * none of the data stores below can be lifted up by the compiler.
153 * and before storing the new tail position.
154 *
155 * See perf_output_put_handle().
156 */ 167 */
157 smp_mb();
158 168
159 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) 169 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
160 local_add(rb->watermark, &rb->wakeup); 170 local_add(rb->watermark, &rb->wakeup);
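
The reworked comments spell out the kernel/user ordering contract for the mmap'ed ring buffer: the kernel relies on a control dependency (A) on ->data_tail plus smp_wmb() (B) before publishing ->data_head, while userspace must issue a read barrier (C) after loading ->data_head and a full barrier (D) before storing ->data_tail. A userspace consumer honoring that contract might look like the sketch below (illustrative only; it ignores records that wrap past the end of the data area).

/* Illustrative consumer sketch; data_size must be a power of two and
 * wrapping records are not handled. */
#include <stdatomic.h>
#include <linux/perf_event.h>

static void drain_ring(struct perf_event_mmap_page *pg, char *data,
		       __u64 data_size)
{
	__u64 tail = pg->data_tail;
	/* (C): order the ->data_head load before the data loads. */
	__u64 head = __atomic_load_n(&pg->data_head, __ATOMIC_ACQUIRE);

	while (tail != head) {
		struct perf_event_header *ev =
			(void *)(data + (tail & (data_size - 1)));
		/* ... consume *ev ... */
		tail += ev->size;
	}

	/* (D): full barrier between the data loads and the tail store,
	 * pairing with the kernel's control dependency (A). */
	atomic_thread_fence(memory_order_seq_cst);
	__atomic_store_n(&pg->data_tail, tail, __ATOMIC_RELAXED);
}
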
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 24b7d6ca871b..307d87c0991a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -73,6 +73,17 @@ struct uprobe {
73 struct inode *inode; /* Also hold a ref to inode */ 73 struct inode *inode; /* Also hold a ref to inode */
74 loff_t offset; 74 loff_t offset;
75 unsigned long flags; 75 unsigned long flags;
76
77 /*
78 * The generic code assumes that it has two members of unknown type
79 * owned by the arch-specific code:
80 *
81 * insn - copy_insn() saves the original instruction here for
82 * arch_uprobe_analyze_insn().
83 *
84 * ixol - potentially modified instruction to execute out of
85 * line, copied to xol_area by xol_get_insn_slot().
86 */
76 struct arch_uprobe arch; 87 struct arch_uprobe arch;
77}; 88};
78 89
@@ -86,6 +97,29 @@ struct return_instance {
86}; 97};
87 98
88/* 99/*
100 * Execute out of line area: anonymous executable mapping installed
101 * by the probed task to execute the copy of the original instruction
102 * mangled by set_swbp().
103 *
104 * On a breakpoint hit, thread contests for a slot. It frees the
105 * slot after singlestep. Currently a fixed number of slots are
106 * allocated.
107 */
108struct xol_area {
109 wait_queue_head_t wq; /* if all slots are busy */
110 atomic_t slot_count; /* number of in-use slots */
111 unsigned long *bitmap; /* 0 = free slot */
112 struct page *page;
113
114 /*
115 * We keep the vma's vm_start rather than a pointer to the vma
116 * itself. The probed process or a naughty kernel module could make
117 * the vma go away, and we must handle that reasonably gracefully.
118 */
119 unsigned long vaddr; /* Page(s) of instruction slots */
120};
121
122/*
89 * valid_vma: Verify if the specified vma is an executable vma 123 * valid_vma: Verify if the specified vma is an executable vma
90 * Relax restrictions while unregistering: vm_flags might have 124 * Relax restrictions while unregistering: vm_flags might have
91 * changed after breakpoint was inserted. 125 * changed after breakpoint was inserted.
@@ -330,7 +364,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
330int __weak 364int __weak
331set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 365set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
332{ 366{
333 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); 367 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
334} 368}
335 369
336static int match_uprobe(struct uprobe *l, struct uprobe *r) 370static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -529,8 +563,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
529{ 563{
530 struct address_space *mapping = uprobe->inode->i_mapping; 564 struct address_space *mapping = uprobe->inode->i_mapping;
531 loff_t offs = uprobe->offset; 565 loff_t offs = uprobe->offset;
532 void *insn = uprobe->arch.insn; 566 void *insn = &uprobe->arch.insn;
533 int size = MAX_UINSN_BYTES; 567 int size = sizeof(uprobe->arch.insn);
534 int len, err = -EIO; 568 int len, err = -EIO;
535 569
536 /* Copy only available bytes, -EIO if nothing was read */ 570 /* Copy only available bytes, -EIO if nothing was read */
@@ -569,7 +603,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
569 goto out; 603 goto out;
570 604
571 ret = -ENOTSUPP; 605 ret = -ENOTSUPP;
572 if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) 606 if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
573 goto out; 607 goto out;
574 608
575 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); 609 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -1264,7 +1298,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1264 1298
1265 /* Initialize the slot */ 1299 /* Initialize the slot */
1266 copy_to_page(area->page, xol_vaddr, 1300 copy_to_page(area->page, xol_vaddr,
1267 uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); 1301 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1268 /* 1302 /*
1269 * We probably need flush_icache_user_range() but it needs vma. 1303 * We probably need flush_icache_user_range() but it needs vma.
1270 * This should work on supported architectures too. 1304 * This should work on supported architectures too.
@@ -1403,12 +1437,10 @@ static void uprobe_warn(struct task_struct *t, const char *msg)
1403 1437
1404static void dup_xol_work(struct callback_head *work) 1438static void dup_xol_work(struct callback_head *work)
1405{ 1439{
1406 kfree(work);
1407
1408 if (current->flags & PF_EXITING) 1440 if (current->flags & PF_EXITING)
1409 return; 1441 return;
1410 1442
1411 if (!__create_xol_area(current->utask->vaddr)) 1443 if (!__create_xol_area(current->utask->dup_xol_addr))
1412 uprobe_warn(current, "dup xol area"); 1444 uprobe_warn(current, "dup xol area");
1413} 1445}
1414 1446
@@ -1419,7 +1451,6 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1419{ 1451{
1420 struct uprobe_task *utask = current->utask; 1452 struct uprobe_task *utask = current->utask;
1421 struct mm_struct *mm = current->mm; 1453 struct mm_struct *mm = current->mm;
1422 struct callback_head *work;
1423 struct xol_area *area; 1454 struct xol_area *area;
1424 1455
1425 t->utask = NULL; 1456 t->utask = NULL;
@@ -1441,14 +1472,9 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1441 if (mm == t->mm) 1472 if (mm == t->mm)
1442 return; 1473 return;
1443 1474
1444 /* TODO: move it into the union in uprobe_task */ 1475 t->utask->dup_xol_addr = area->vaddr;
1445 work = kmalloc(sizeof(*work), GFP_KERNEL); 1476 init_task_work(&t->utask->dup_xol_work, dup_xol_work);
1446 if (!work) 1477 task_work_add(t, &t->utask->dup_xol_work, true);
1447 return uprobe_warn(t, "dup xol area");
1448
1449 t->utask->vaddr = area->vaddr;
1450 init_task_work(work, dup_xol_work);
1451 task_work_add(t, work, true);
1452} 1478}
1453 1479
1454/* 1480/*
@@ -1828,6 +1854,10 @@ static void handle_swbp(struct pt_regs *regs)
1828 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) 1854 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1829 goto out; 1855 goto out;
1830 1856
1857 /* Tracing handlers use ->utask to communicate with fetch methods */
1858 if (!get_utask())
1859 goto out;
1860
1831 handler_chain(uprobe, regs); 1861 handler_chain(uprobe, regs);
1832 if (can_skip_sstep(uprobe, regs)) 1862 if (can_skip_sstep(uprobe, regs))
1833 goto out; 1863 goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index a949819055d5..1e77fc645317 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 } 75 }
76 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
77 list_del_rcu(&p->thread_node);
77} 78}
78 79
79/* 80/*
diff --git a/kernel/extable.c b/kernel/extable.c
index 832cb28105bb..763faf037ec1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
61static inline int init_kernel_text(unsigned long addr) 61static inline int init_kernel_text(unsigned long addr)
62{ 62{
63 if (addr >= (unsigned long)_sinittext && 63 if (addr >= (unsigned long)_sinittext &&
64 addr <= (unsigned long)_einittext) 64 addr < (unsigned long)_einittext)
65 return 1; 65 return 1;
66 return 0; 66 return 0;
67} 67}
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
69int core_kernel_text(unsigned long addr) 69int core_kernel_text(unsigned long addr)
70{ 70{
71 if (addr >= (unsigned long)_stext && 71 if (addr >= (unsigned long)_stext &&
72 addr <= (unsigned long)_etext) 72 addr < (unsigned long)_etext)
73 return 1; 73 return 1;
74 74
75 if (system_state == SYSTEM_BOOTING && 75 if (system_state == SYSTEM_BOOTING &&
diff --git a/kernel/fork.c b/kernel/fork.c
index 728d5be9548c..2f11bbe376b0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
537 spin_lock_init(&mm->page_table_lock); 537 spin_lock_init(&mm->page_table_lock);
538 mm_init_aio(mm); 538 mm_init_aio(mm);
539 mm_init_owner(mm, p); 539 mm_init_owner(mm, p);
540 clear_tlb_flush_pending(mm);
540 541
541 if (likely(!mm_alloc_pgd(mm))) { 542 if (likely(!mm_alloc_pgd(mm))) {
542 mm->def_flags = 0; 543 mm->def_flags = 0;
@@ -1034,6 +1035,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1034 sig->nr_threads = 1; 1035 sig->nr_threads = 1;
1035 atomic_set(&sig->live, 1); 1036 atomic_set(&sig->live, 1);
1036 atomic_set(&sig->sigcnt, 1); 1037 atomic_set(&sig->sigcnt, 1);
1038
1039 /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
1040 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1041 tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1042
1037 init_waitqueue_head(&sig->wait_chldexit); 1043 init_waitqueue_head(&sig->wait_chldexit);
1038 sig->curr_target = tsk; 1044 sig->curr_target = tsk;
1039 init_sigpending(&sig->shared_pending); 1045 init_sigpending(&sig->shared_pending);
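
The cast-assignment pair above links the new signal_struct's thread_head and the first thread's thread_node into a two-node circular list without a separate INIT_LIST_HEAD(), exactly as the comment says; spelled out conventionally it would be:

/* Equivalent, more verbose form of the two assignments above. */
INIT_LIST_HEAD(&sig->thread_head);
list_add(&tsk->thread_node, &sig->thread_head);
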
@@ -1086,8 +1092,10 @@ static void rt_mutex_init_task(struct task_struct *p)
1086{ 1092{
1087 raw_spin_lock_init(&p->pi_lock); 1093 raw_spin_lock_init(&p->pi_lock);
1088#ifdef CONFIG_RT_MUTEXES 1094#ifdef CONFIG_RT_MUTEXES
1089 plist_head_init(&p->pi_waiters); 1095 p->pi_waiters = RB_ROOT;
1096 p->pi_waiters_leftmost = NULL;
1090 p->pi_blocked_on = NULL; 1097 p->pi_blocked_on = NULL;
1098 p->pi_top_task = NULL;
1091#endif 1099#endif
1092} 1100}
1093 1101
@@ -1171,7 +1179,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1171 * do not allow it to share a thread group or signal handlers or 1179 * do not allow it to share a thread group or signal handlers or
1172 * parent with the forking task. 1180 * parent with the forking task.
1173 */ 1181 */
1174 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { 1182 if (clone_flags & CLONE_SIGHAND) {
1175 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || 1183 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1176 (task_active_pid_ns(current) != 1184 (task_active_pid_ns(current) !=
1177 current->nsproxy->pid_ns_for_children)) 1185 current->nsproxy->pid_ns_for_children))
@@ -1310,7 +1318,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1310#endif 1318#endif
1311 1319
1312 /* Perform scheduler related setup. Assign this task to a CPU. */ 1320 /* Perform scheduler related setup. Assign this task to a CPU. */
1313 sched_fork(clone_flags, p); 1321 retval = sched_fork(clone_flags, p);
1322 if (retval)
1323 goto bad_fork_cleanup_policy;
1314 1324
1315 retval = perf_event_init_task(p); 1325 retval = perf_event_init_task(p);
1316 if (retval) 1326 if (retval)
@@ -1402,13 +1412,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1402 p->tgid = p->pid; 1412 p->tgid = p->pid;
1403 } 1413 }
1404 1414
1405 p->pdeath_signal = 0;
1406 p->exit_state = 0;
1407
1408 p->nr_dirtied = 0; 1415 p->nr_dirtied = 0;
1409 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1416 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1410 p->dirty_paused_when = 0; 1417 p->dirty_paused_when = 0;
1411 1418
1419 p->pdeath_signal = 0;
1412 INIT_LIST_HEAD(&p->thread_group); 1420 INIT_LIST_HEAD(&p->thread_group);
1413 p->task_works = NULL; 1421 p->task_works = NULL;
1414 1422
@@ -1471,6 +1479,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1471 atomic_inc(&current->signal->sigcnt); 1479 atomic_inc(&current->signal->sigcnt);
1472 list_add_tail_rcu(&p->thread_group, 1480 list_add_tail_rcu(&p->thread_group,
1473 &p->group_leader->thread_group); 1481 &p->group_leader->thread_group);
1482 list_add_tail_rcu(&p->thread_node,
1483 &p->signal->thread_head);
1474 } 1484 }
1475 attach_pid(p, PIDTYPE_PID); 1485 attach_pid(p, PIDTYPE_PID);
1476 nr_threads++; 1486 nr_threads++;
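Together with the list_del_rcu() added in kernel/exit.c above, the new signal->thread_head / task->thread_node pair gives every thread group an RCU-protected list of its threads. A hedged sketch of how a consumer could walk it (the loop body and handle_thread() are illustrative assumptions, not part of this patch):

	struct task_struct *t;

	rcu_read_lock();
	list_for_each_entry_rcu(t, &p->signal->thread_head, thread_node)
		handle_thread(t);	/* hypothetical per-thread action */
	rcu_read_unlock();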
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b462fa197517..aa6a8aadb911 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt);
19bool pm_freezing; 19bool pm_freezing;
20bool pm_nosig_freezing; 20bool pm_nosig_freezing;
21 21
22/*
23 * Temporary export for the deadlock workaround in ata_scsi_hotplug().
24 * Remove once the hack becomes unnecessary.
25 */
26EXPORT_SYMBOL_GPL(pm_freezing);
27
22/* protects freezing and frozen transitions */ 28/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock); 29static DEFINE_SPINLOCK(freezer_lock);
24 30
diff --git a/kernel/futex.c b/kernel/futex.c
index 80ba086f021d..44a1261cb9ff 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,14 +63,101 @@
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/hugetlb.h> 64#include <linux/hugetlb.h>
65#include <linux/freezer.h> 65#include <linux/freezer.h>
66#include <linux/bootmem.h>
66 67
67#include <asm/futex.h> 68#include <asm/futex.h>
68 69
69#include "locking/rtmutex_common.h" 70#include "locking/rtmutex_common.h"
70 71
71int __read_mostly futex_cmpxchg_enabled; 72/*
73 * Basic futex operation and ordering guarantees:
74 *
75 * The waiter reads the futex value in user space and calls
76 * futex_wait(). This function computes the hash bucket and acquires
77 * the hash bucket lock. After that it reads the futex user space value
78 * again and verifies that the data has not changed. If it has not changed
79 * it enqueues itself into the hash bucket, releases the hash bucket lock
80 * and schedules.
81 *
82 * The waker side modifies the user space value of the futex and calls
83 * futex_wake(). This function computes the hash bucket and acquires the
84 * hash bucket lock. Then it looks for waiters on that futex in the hash
85 * bucket and wakes them.
86 *
87 * In futex wake up scenarios where no tasks are blocked on a futex, taking
88 * the hb spinlock can be avoided and simply return. In order for this
89 * optimization to work, ordering guarantees must exist so that the waiter
90 * being added to the list is acknowledged when the list is concurrently being
91 * checked by the waker, avoiding scenarios like the following:
92 *
93 * CPU 0 CPU 1
94 * val = *futex;
95 * sys_futex(WAIT, futex, val);
96 * futex_wait(futex, val);
97 * uval = *futex;
98 * *futex = newval;
99 * sys_futex(WAKE, futex);
100 * futex_wake(futex);
101 * if (queue_empty())
102 * return;
103 * if (uval == val)
104 * lock(hash_bucket(futex));
105 * queue();
106 * unlock(hash_bucket(futex));
107 * schedule();
108 *
109 * This would cause the waiter on CPU 0 to wait forever because it
110 * missed the transition of the user space value from val to newval
111 * and the waker did not find the waiter in the hash bucket queue.
112 *
113 * The correct serialization ensures that a waiter either observes
114 * the changed user space value before blocking or is woken by a
115 * concurrent waker:
116 *
117 * CPU 0 CPU 1
118 * val = *futex;
119 * sys_futex(WAIT, futex, val);
120 * futex_wait(futex, val);
121 *
122 * waiters++;
123 * mb(); (A) <-- paired with -.
124 * |
125 * lock(hash_bucket(futex)); |
126 * |
127 * uval = *futex; |
128 * | *futex = newval;
129 * | sys_futex(WAKE, futex);
130 * | futex_wake(futex);
131 * |
132 * `-------> mb(); (B)
133 * if (uval == val)
134 * queue();
135 * unlock(hash_bucket(futex));
136 * schedule(); if (waiters)
137 * lock(hash_bucket(futex));
138 * wake_waiters(futex);
139 * unlock(hash_bucket(futex));
140 *
141 * Where (A) orders the waiters increment and the futex value read -- this
142 * is guaranteed by the head counter in the hb spinlock; and where (B)
143 * orders the write to futex and the waiters read -- this is done by the
144 * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
145 * depending on the futex type.
146 *
147 * This yields the following case (where X:=waiters, Y:=futex):
148 *
149 * X = Y = 0
150 *
151 * w[X]=1 w[Y]=1
152 * MB MB
153 * r[Y]=y r[X]=x
154 *
155 * Which guarantees that x==0 && y==0 is impossible; which translates back into
156 * the guarantee that we cannot both miss the futex variable change and the
157 * enqueue.
158 */
72 159
73#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 160int __read_mostly futex_cmpxchg_enabled;
74 161
75/* 162/*
76 * Futex flags used to encode options to functions and preserve them across 163 * Futex flags used to encode options to functions and preserve them across
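The ordering comment above is the classic store-buffering argument: each side stores to its own variable, issues a full barrier, then loads the other side's variable, so at least one side must observe the other's store. Below is a user-space C11 model of that argument, with waiters standing in for "a task is (being) queued in the hash bucket" and futex_val for the user space futex word; it is a sketch of the reasoning, not kernel code:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int futex_val;	/* the futex word, Y in the comment  */
static atomic_int waiters;	/* queued tasks,   X in the comment  */

/* Waiter path: corresponds to futex_wait(); the fence is MB (A),
 * implied in the kernel by taking hb->lock. Returns true if the
 * task would go to sleep. */
static bool waiter_would_sleep(int expected)
{
	atomic_fetch_add_explicit(&waiters, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);		/* MB (A) */
	return atomic_load_explicit(&futex_val,
				    memory_order_relaxed) == expected;
}

/* Waker path: corresponds to futex_wake(); the fence is MB (B),
 * implied in the kernel by get_futex_key_refs(). Returns true if
 * the waker would take the bucket lock and wake someone. */
static bool waker_sees_waiter(int newval)
{
	atomic_store_explicit(&futex_val, newval, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);		/* MB (B) */
	return atomic_load_explicit(&waiters,
				    memory_order_relaxed) != 0;
}

/* Starting from futex_val == expected (and newval != expected), the
 * combination "waiter_would_sleep() == true" and
 * "waker_sees_waiter() == false" cannot happen when the two run
 * concurrently: that is exactly the x == 0 && y == 0 outcome the
 * comment rules out. */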
@@ -149,9 +236,41 @@ static const struct futex_q futex_q_init = {
149struct futex_hash_bucket { 236struct futex_hash_bucket {
150 spinlock_t lock; 237 spinlock_t lock;
151 struct plist_head chain; 238 struct plist_head chain;
152}; 239} ____cacheline_aligned_in_smp;
240
241static unsigned long __read_mostly futex_hashsize;
242
243static struct futex_hash_bucket *futex_queues;
244
245static inline void futex_get_mm(union futex_key *key)
246{
247 atomic_inc(&key->private.mm->mm_count);
248 /*
249 * Ensure futex_get_mm() implies a full barrier such that
250 * get_futex_key() implies a full barrier. This is relied upon
251 * as full barrier (B), see the ordering comment above.
252 */
253 smp_mb__after_atomic_inc();
254}
255
256static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
257{
258#ifdef CONFIG_SMP
259 /*
260 * Tasks trying to enter the critical region are most likely
261 * potential waiters that will be added to the plist. Ensure
262 * that wakers won't miss to-be-slept tasks in the window between
263 * the wait call and the actual plist_add.
264 */
265 if (spin_is_locked(&hb->lock))
266 return true;
267 smp_rmb(); /* Make sure we check the lock state first */
153 268
154static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 269 return !plist_head_empty(&hb->chain);
270#else
271 return true;
272#endif
273}
155 274
156/* 275/*
157 * We hash on the keys returned from get_futex_key (see below). 276 * We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +280,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
161 u32 hash = jhash2((u32*)&key->both.word, 280 u32 hash = jhash2((u32*)&key->both.word,
162 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 281 (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
163 key->both.offset); 282 key->both.offset);
164 return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; 283 return &futex_queues[hash & (futex_hashsize - 1)];
165} 284}
166 285
167/* 286/*
@@ -187,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key)
187 306
188 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 307 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
189 case FUT_OFF_INODE: 308 case FUT_OFF_INODE:
190 ihold(key->shared.inode); 309 ihold(key->shared.inode); /* implies MB (B) */
191 break; 310 break;
192 case FUT_OFF_MMSHARED: 311 case FUT_OFF_MMSHARED:
193 atomic_inc(&key->private.mm->mm_count); 312 futex_get_mm(key); /* implies MB (B) */
194 break; 313 break;
195 } 314 }
196} 315}
@@ -251,6 +370,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
251 return -EINVAL; 370 return -EINVAL;
252 address -= key->both.offset; 371 address -= key->both.offset;
253 372
373 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
374 return -EFAULT;
375
254 /* 376 /*
255 * PROCESS_PRIVATE futexes are fast. 377 * PROCESS_PRIVATE futexes are fast.
256 * As the mm cannot disappear under us and the 'key' only needs 378 * As the mm cannot disappear under us and the 'key' only needs
@@ -259,11 +381,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
259 * but access_ok() should be faster than find_vma() 381 * but access_ok() should be faster than find_vma()
260 */ 382 */
261 if (!fshared) { 383 if (!fshared) {
262 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
263 return -EFAULT;
264 key->private.mm = mm; 384 key->private.mm = mm;
265 key->private.address = address; 385 key->private.address = address;
266 get_futex_key_refs(key); 386 get_futex_key_refs(key); /* implies MB (B) */
267 return 0; 387 return 0;
268 } 388 }
269 389
@@ -288,7 +408,7 @@ again:
288 put_page(page); 408 put_page(page);
289 /* serialize against __split_huge_page_splitting() */ 409 /* serialize against __split_huge_page_splitting() */
290 local_irq_disable(); 410 local_irq_disable();
291 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { 411 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
292 page_head = compound_head(page); 412 page_head = compound_head(page);
293 /* 413 /*
294 * page_head is valid pointer but we must pin 414 * page_head is valid pointer but we must pin
@@ -370,7 +490,7 @@ again:
370 key->shared.pgoff = basepage_index(page); 490 key->shared.pgoff = basepage_index(page);
371 } 491 }
372 492
373 get_futex_key_refs(key); 493 get_futex_key_refs(key); /* implies MB (B) */
374 494
375out: 495out:
376 unlock_page(page_head); 496 unlock_page(page_head);
@@ -597,13 +717,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
597{ 717{
598 struct futex_pi_state *pi_state = NULL; 718 struct futex_pi_state *pi_state = NULL;
599 struct futex_q *this, *next; 719 struct futex_q *this, *next;
600 struct plist_head *head;
601 struct task_struct *p; 720 struct task_struct *p;
602 pid_t pid = uval & FUTEX_TID_MASK; 721 pid_t pid = uval & FUTEX_TID_MASK;
603 722
604 head = &hb->chain; 723 plist_for_each_entry_safe(this, next, &hb->chain, list) {
605
606 plist_for_each_entry_safe(this, next, head, list) {
607 if (match_futex(&this->key, key)) { 724 if (match_futex(&this->key, key)) {
608 /* 725 /*
609 * Another waiter already exists - bump up 726 * Another waiter already exists - bump up
@@ -985,7 +1102,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
985{ 1102{
986 struct futex_hash_bucket *hb; 1103 struct futex_hash_bucket *hb;
987 struct futex_q *this, *next; 1104 struct futex_q *this, *next;
988 struct plist_head *head;
989 union futex_key key = FUTEX_KEY_INIT; 1105 union futex_key key = FUTEX_KEY_INIT;
990 int ret; 1106 int ret;
991 1107
@@ -997,10 +1113,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
997 goto out; 1113 goto out;
998 1114
999 hb = hash_futex(&key); 1115 hb = hash_futex(&key);
1116
1117 /* Make sure we really have tasks to wakeup */
1118 if (!hb_waiters_pending(hb))
1119 goto out_put_key;
1120
1000 spin_lock(&hb->lock); 1121 spin_lock(&hb->lock);
1001 head = &hb->chain;
1002 1122
1003 plist_for_each_entry_safe(this, next, head, list) { 1123 plist_for_each_entry_safe(this, next, &hb->chain, list) {
1004 if (match_futex (&this->key, &key)) { 1124 if (match_futex (&this->key, &key)) {
1005 if (this->pi_state || this->rt_waiter) { 1125 if (this->pi_state || this->rt_waiter) {
1006 ret = -EINVAL; 1126 ret = -EINVAL;
@@ -1018,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1018 } 1138 }
1019 1139
1020 spin_unlock(&hb->lock); 1140 spin_unlock(&hb->lock);
1141out_put_key:
1021 put_futex_key(&key); 1142 put_futex_key(&key);
1022out: 1143out:
1023 return ret; 1144 return ret;
@@ -1033,7 +1154,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1033{ 1154{
1034 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1155 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1035 struct futex_hash_bucket *hb1, *hb2; 1156 struct futex_hash_bucket *hb1, *hb2;
1036 struct plist_head *head;
1037 struct futex_q *this, *next; 1157 struct futex_q *this, *next;
1038 int ret, op_ret; 1158 int ret, op_ret;
1039 1159
@@ -1081,9 +1201,7 @@ retry_private:
1081 goto retry; 1201 goto retry;
1082 } 1202 }
1083 1203
1084 head = &hb1->chain; 1204 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1085
1086 plist_for_each_entry_safe(this, next, head, list) {
1087 if (match_futex (&this->key, &key1)) { 1205 if (match_futex (&this->key, &key1)) {
1088 if (this->pi_state || this->rt_waiter) { 1206 if (this->pi_state || this->rt_waiter) {
1089 ret = -EINVAL; 1207 ret = -EINVAL;
@@ -1096,10 +1214,8 @@ retry_private:
1096 } 1214 }
1097 1215
1098 if (op_ret > 0) { 1216 if (op_ret > 0) {
1099 head = &hb2->chain;
1100
1101 op_ret = 0; 1217 op_ret = 0;
1102 plist_for_each_entry_safe(this, next, head, list) { 1218 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1103 if (match_futex (&this->key, &key2)) { 1219 if (match_futex (&this->key, &key2)) {
1104 if (this->pi_state || this->rt_waiter) { 1220 if (this->pi_state || this->rt_waiter) {
1105 ret = -EINVAL; 1221 ret = -EINVAL;
@@ -1269,7 +1385,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1269 int drop_count = 0, task_count = 0, ret; 1385 int drop_count = 0, task_count = 0, ret;
1270 struct futex_pi_state *pi_state = NULL; 1386 struct futex_pi_state *pi_state = NULL;
1271 struct futex_hash_bucket *hb1, *hb2; 1387 struct futex_hash_bucket *hb1, *hb2;
1272 struct plist_head *head1;
1273 struct futex_q *this, *next; 1388 struct futex_q *this, *next;
1274 u32 curval2; 1389 u32 curval2;
1275 1390
@@ -1392,8 +1507,7 @@ retry_private:
1392 } 1507 }
1393 } 1508 }
1394 1509
1395 head1 = &hb1->chain; 1510 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1396 plist_for_each_entry_safe(this, next, head1, list) {
1397 if (task_count - nr_wake >= nr_requeue) 1511 if (task_count - nr_wake >= nr_requeue)
1398 break; 1512 break;
1399 1513
@@ -1488,12 +1602,12 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1488 hb = hash_futex(&q->key); 1602 hb = hash_futex(&q->key);
1489 q->lock_ptr = &hb->lock; 1603 q->lock_ptr = &hb->lock;
1490 1604
1491 spin_lock(&hb->lock); 1605 spin_lock(&hb->lock); /* implies MB (A) */
1492 return hb; 1606 return hb;
1493} 1607}
1494 1608
1495static inline void 1609static inline void
1496queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1610queue_unlock(struct futex_hash_bucket *hb)
1497 __releases(&hb->lock) 1611 __releases(&hb->lock)
1498{ 1612{
1499 spin_unlock(&hb->lock); 1613 spin_unlock(&hb->lock);
@@ -1866,7 +1980,7 @@ retry_private:
1866 ret = get_futex_value_locked(&uval, uaddr); 1980 ret = get_futex_value_locked(&uval, uaddr);
1867 1981
1868 if (ret) { 1982 if (ret) {
1869 queue_unlock(q, *hb); 1983 queue_unlock(*hb);
1870 1984
1871 ret = get_user(uval, uaddr); 1985 ret = get_user(uval, uaddr);
1872 if (ret) 1986 if (ret)
@@ -1880,7 +1994,7 @@ retry_private:
1880 } 1994 }
1881 1995
1882 if (uval != val) { 1996 if (uval != val) {
1883 queue_unlock(q, *hb); 1997 queue_unlock(*hb);
1884 ret = -EWOULDBLOCK; 1998 ret = -EWOULDBLOCK;
1885 } 1999 }
1886 2000
@@ -2028,7 +2142,7 @@ retry_private:
2028 * Task is exiting and we just wait for the 2142 * Task is exiting and we just wait for the
2029 * exit to complete. 2143 * exit to complete.
2030 */ 2144 */
2031 queue_unlock(&q, hb); 2145 queue_unlock(hb);
2032 put_futex_key(&q.key); 2146 put_futex_key(&q.key);
2033 cond_resched(); 2147 cond_resched();
2034 goto retry; 2148 goto retry;
@@ -2080,7 +2194,7 @@ retry_private:
2080 goto out_put_key; 2194 goto out_put_key;
2081 2195
2082out_unlock_put_key: 2196out_unlock_put_key:
2083 queue_unlock(&q, hb); 2197 queue_unlock(hb);
2084 2198
2085out_put_key: 2199out_put_key:
2086 put_futex_key(&q.key); 2200 put_futex_key(&q.key);
@@ -2090,7 +2204,7 @@ out:
2090 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2204 return ret != -EINTR ? ret : -ERESTARTNOINTR;
2091 2205
2092uaddr_faulted: 2206uaddr_faulted:
2093 queue_unlock(&q, hb); 2207 queue_unlock(hb);
2094 2208
2095 ret = fault_in_user_writeable(uaddr); 2209 ret = fault_in_user_writeable(uaddr);
2096 if (ret) 2210 if (ret)
@@ -2112,7 +2226,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2112{ 2226{
2113 struct futex_hash_bucket *hb; 2227 struct futex_hash_bucket *hb;
2114 struct futex_q *this, *next; 2228 struct futex_q *this, *next;
2115 struct plist_head *head;
2116 union futex_key key = FUTEX_KEY_INIT; 2229 union futex_key key = FUTEX_KEY_INIT;
2117 u32 uval, vpid = task_pid_vnr(current); 2230 u32 uval, vpid = task_pid_vnr(current);
2118 int ret; 2231 int ret;
@@ -2152,9 +2265,7 @@ retry:
2152 * Ok, other tasks may need to be woken up - check waiters 2265 * Ok, other tasks may need to be woken up - check waiters
2153 * and do the wakeup if necessary: 2266 * and do the wakeup if necessary:
2154 */ 2267 */
2155 head = &hb->chain; 2268 plist_for_each_entry_safe(this, next, &hb->chain, list) {
2156
2157 plist_for_each_entry_safe(this, next, head, list) {
2158 if (!match_futex (&this->key, &key)) 2269 if (!match_futex (&this->key, &key))
2159 continue; 2270 continue;
2160 ret = wake_futex_pi(uaddr, uval, this); 2271 ret = wake_futex_pi(uaddr, uval, this);
@@ -2315,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2315 * code while we sleep on uaddr. 2426 * code while we sleep on uaddr.
2316 */ 2427 */
2317 debug_rt_mutex_init_waiter(&rt_waiter); 2428 debug_rt_mutex_init_waiter(&rt_waiter);
2429 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2430 RB_CLEAR_NODE(&rt_waiter.tree_entry);
2318 rt_waiter.task = NULL; 2431 rt_waiter.task = NULL;
2319 2432
2320 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 2433 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2733,8 +2846,21 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2733static int __init futex_init(void) 2846static int __init futex_init(void)
2734{ 2847{
2735 u32 curval; 2848 u32 curval;
2736 int i; 2849 unsigned int futex_shift;
2850 unsigned long i;
2851
2852#if CONFIG_BASE_SMALL
2853 futex_hashsize = 16;
2854#else
2855 futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
2856#endif
2737 2857
2858 futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
2859 futex_hashsize, 0,
2860 futex_hashsize < 256 ? HASH_SMALL : 0,
2861 &futex_shift, NULL,
2862 futex_hashsize, futex_hashsize);
2863 futex_hashsize = 1UL << futex_shift;
2738 /* 2864 /*
2739 * This will fail and we want it. Some arch implementations do 2865 * This will fail and we want it. Some arch implementations do
2740 * runtime detection of the futex_atomic_cmpxchg_inatomic() 2866 * runtime detection of the futex_atomic_cmpxchg_inatomic()
@@ -2748,7 +2874,7 @@ static int __init futex_init(void)
2748 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) 2874 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2749 futex_cmpxchg_enabled = 1; 2875 futex_cmpxchg_enabled = 1;
2750 2876
2751 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2877 for (i = 0; i < futex_hashsize; i++) {
2752 plist_head_init(&futex_queues[i].chain); 2878 plist_head_init(&futex_queues[i].chain);
2753 spin_lock_init(&futex_queues[i].lock); 2879 spin_lock_init(&futex_queues[i].lock);
2754 } 2880 }
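In the non-CONFIG_BASE_SMALL case above, futex_init() sizes the table at 256 buckets per possible CPU, rounded up to a power of two, and then re-derives the size from the shift alloc_large_system_hash() actually used, so the "hash & (futex_hashsize - 1)" mask in hash_futex() stays valid even if the allocator adjusted the size. A small stand-alone sketch of just the sizing arithmetic (plain C, illustrative only):

#include <stdio.h>

static unsigned long roundup_pow_of_two_ul(unsigned long v)
{
	unsigned long p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	const unsigned long cpus[] = { 1, 4, 16, 64 };

	for (unsigned int i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		unsigned long size = roundup_pow_of_two_ul(256 * cpus[i]);

		/* e.g. 4 CPUs -> 1024 buckets, 64 CPUs -> 16384 buckets */
		printf("%3lu CPUs -> %6lu buckets, mask 0x%lx\n",
		       cpus[i], size, size - 1);
	}
	return 0;
}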
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 383319bae3f7..09094361dce5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -46,6 +46,7 @@
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/sched/sysctl.h> 47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h> 48#include <linux/sched/rt.h>
49#include <linux/sched/deadline.h>
49#include <linux/timer.h> 50#include <linux/timer.h>
50#include <linux/freezer.h> 51#include <linux/freezer.h>
51 52
@@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1610 unsigned long slack; 1611 unsigned long slack;
1611 1612
1612 slack = current->timer_slack_ns; 1613 slack = current->timer_slack_ns;
1613 if (rt_task(current)) 1614 if (dl_task(current) || rt_task(current))
1614 slack = 0; 1615 slack = 0;
1615 1616
1616 hrtimer_init_on_stack(&t.timer, clockid, mode); 1617 hrtimer_init_on_stack(&t.timer, clockid, mode);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf21760..abcd6ca86cb7 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
50 bool is_early = desc->action && 50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME; 51 desc->action->flags & IRQF_EARLY_RESUME;
52 52
53 if (is_early != want_early) 53 if (!is_early && want_early)
54 continue; 54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 490afc03627e..9c970167e402 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
47size_t vmcoreinfo_size; 47size_t vmcoreinfo_size;
48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
49 49
50/* Flag to indicate we are going to kexec a new kernel */
51bool kexec_in_progress = false;
52
50/* Location of the reserved area for the crash kernel */ 53/* Location of the reserved area for the crash kernel */
51struct resource crashk_res = { 54struct resource crashk_res = {
52 .name = "Crash kernel", 55 .name = "Crash kernel",
@@ -1675,7 +1678,9 @@ int kernel_kexec(void)
1675 } else 1678 } else
1676#endif 1679#endif
1677 { 1680 {
1681 kexec_in_progress = true;
1678 kernel_restart_prepare(NULL); 1682 kernel_restart_prepare(NULL);
1683 migrate_to_reboot_cpu();
1679 printk(KERN_EMERG "Starting new kernel\n"); 1684 printk(KERN_EMERG "Starting new kernel\n");
1680 machine_shutdown(); 1685 machine_shutdown();
1681 } 1686 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 576ba756a32d..eb8a54783fa0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -590,6 +590,7 @@ static int very_verbose(struct lock_class *class)
590/* 590/*
591 * Is this the address of a static object: 591 * Is this the address of a static object:
592 */ 592 */
593#ifdef __KERNEL__
593static int static_obj(void *obj) 594static int static_obj(void *obj)
594{ 595{
595 unsigned long start = (unsigned long) &_stext, 596 unsigned long start = (unsigned long) &_stext,
@@ -616,6 +617,7 @@ static int static_obj(void *obj)
616 */ 617 */
617 return is_module_address(addr) || is_module_percpu_address(addr); 618 return is_module_address(addr) || is_module_percpu_address(addr);
618} 619}
620#endif
619 621
620/* 622/*
621 * To make lock name printouts unique, we calculate a unique 623 * To make lock name printouts unique, we calculate a unique
@@ -4115,6 +4117,7 @@ void debug_check_no_locks_held(void)
4115} 4117}
4116EXPORT_SYMBOL_GPL(debug_check_no_locks_held); 4118EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4117 4119
4120#ifdef __KERNEL__
4118void debug_show_all_locks(void) 4121void debug_show_all_locks(void)
4119{ 4122{
4120 struct task_struct *g, *p; 4123 struct task_struct *g, *p;
@@ -4172,6 +4175,7 @@ retry:
4172 read_unlock(&tasklist_lock); 4175 read_unlock(&tasklist_lock);
4173} 4176}
4174EXPORT_SYMBOL_GPL(debug_show_all_locks); 4177EXPORT_SYMBOL_GPL(debug_show_all_locks);
4178#endif
4175 4179
4176/* 4180/*
4177 * Careful: only use this function if you are sure that 4181 * Careful: only use this function if you are sure that
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 7e3443fe1f48..faf6f5b53e77 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -75,7 +75,12 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current); 78
79 if (!lock->owner)
80 DEBUG_LOCKS_WARN_ON(!lock->owner);
81 else
82 DEBUG_LOCKS_WARN_ON(lock->owner != current);
83
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 85 mutex_clear_owner(lock);
81} 86}
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 13b243a323fa..49b2ed3dced8 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -24,7 +24,7 @@
24#include <linux/kallsyms.h> 24#include <linux/kallsyms.h>
25#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/plist.h> 27#include <linux/rbtree.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/debug_locks.h> 29#include <linux/debug_locks.h>
30 30
@@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
57 57
58void rt_mutex_debug_task_free(struct task_struct *task) 58void rt_mutex_debug_task_free(struct task_struct *task)
59{ 59{
60 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); 60 DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); 61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
62} 62}
63 63
@@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
154void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 154void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
155{ 155{
156 memset(waiter, 0x11, sizeof(*waiter)); 156 memset(waiter, 0x11, sizeof(*waiter));
157 plist_node_init(&waiter->list_entry, MAX_PRIO);
158 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
159 waiter->deadlock_task_pid = NULL; 157 waiter->deadlock_task_pid = NULL;
160} 158}
161 159
162void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 160void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
163{ 161{
164 put_pid(waiter->deadlock_task_pid); 162 put_pid(waiter->deadlock_task_pid);
165 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
166 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
167 memset(waiter, 0x22, sizeof(*waiter)); 163 memset(waiter, 0x22, sizeof(*waiter));
168} 164}
169 165
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 0dd6aec1cb6a..2e960a2bab81 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -14,6 +14,7 @@
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/rt.h> 16#include <linux/sched/rt.h>
17#include <linux/sched/deadline.h>
17#include <linux/timer.h> 18#include <linux/timer.h>
18 19
19#include "rtmutex_common.h" 20#include "rtmutex_common.h"
@@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
91} 92}
92#endif 93#endif
93 94
95static inline int
96rt_mutex_waiter_less(struct rt_mutex_waiter *left,
97 struct rt_mutex_waiter *right)
98{
99 if (left->prio < right->prio)
100 return 1;
101
102 /*
103 * If both waiters have dl_prio(), we check the deadlines of the
104 * associated tasks.
105 * If left waiter has a dl_prio(), and we didn't return 1 above,
106 * then right waiter has a dl_prio() too.
107 */
108 if (dl_prio(left->prio))
109 return (left->task->dl.deadline < right->task->dl.deadline);
110
111 return 0;
112}
113
114static void
115rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
116{
117 struct rb_node **link = &lock->waiters.rb_node;
118 struct rb_node *parent = NULL;
119 struct rt_mutex_waiter *entry;
120 int leftmost = 1;
121
122 while (*link) {
123 parent = *link;
124 entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
125 if (rt_mutex_waiter_less(waiter, entry)) {
126 link = &parent->rb_left;
127 } else {
128 link = &parent->rb_right;
129 leftmost = 0;
130 }
131 }
132
133 if (leftmost)
134 lock->waiters_leftmost = &waiter->tree_entry;
135
136 rb_link_node(&waiter->tree_entry, parent, link);
137 rb_insert_color(&waiter->tree_entry, &lock->waiters);
138}
139
140static void
141rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
142{
143 if (RB_EMPTY_NODE(&waiter->tree_entry))
144 return;
145
146 if (lock->waiters_leftmost == &waiter->tree_entry)
147 lock->waiters_leftmost = rb_next(&waiter->tree_entry);
148
149 rb_erase(&waiter->tree_entry, &lock->waiters);
150 RB_CLEAR_NODE(&waiter->tree_entry);
151}
152
153static void
154rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
155{
156 struct rb_node **link = &task->pi_waiters.rb_node;
157 struct rb_node *parent = NULL;
158 struct rt_mutex_waiter *entry;
159 int leftmost = 1;
160
161 while (*link) {
162 parent = *link;
163 entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
164 if (rt_mutex_waiter_less(waiter, entry)) {
165 link = &parent->rb_left;
166 } else {
167 link = &parent->rb_right;
168 leftmost = 0;
169 }
170 }
171
172 if (leftmost)
173 task->pi_waiters_leftmost = &waiter->pi_tree_entry;
174
175 rb_link_node(&waiter->pi_tree_entry, parent, link);
176 rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
177}
178
179static void
180rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
181{
182 if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
183 return;
184
185 if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
186 task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
187
188 rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
189 RB_CLEAR_NODE(&waiter->pi_tree_entry);
190}
191
94/* 192/*
95 * Calculate task priority from the waiter list priority 193 * Calculate task priority from the waiter tree priority
96 * 194 *
97 * Return task->normal_prio when the waiter list is empty or when 195 * Return task->normal_prio when the waiter tree is empty or when
98 * the waiter is not allowed to do priority boosting 196 * the waiter is not allowed to do priority boosting
99 */ 197 */
100int rt_mutex_getprio(struct task_struct *task) 198int rt_mutex_getprio(struct task_struct *task)
@@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task)
102 if (likely(!task_has_pi_waiters(task))) 200 if (likely(!task_has_pi_waiters(task)))
103 return task->normal_prio; 201 return task->normal_prio;
104 202
105 return min(task_top_pi_waiter(task)->pi_list_entry.prio, 203 return min(task_top_pi_waiter(task)->prio,
106 task->normal_prio); 204 task->normal_prio);
107} 205}
108 206
207struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
208{
209 if (likely(!task_has_pi_waiters(task)))
210 return NULL;
211
212 return task_top_pi_waiter(task)->task;
213}
214
109/* 215/*
110 * Adjust the priority of a task, after its pi_waiters got modified. 216 * Adjust the priority of a task, after its pi_waiters got modified.
111 * 217 *
@@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
115{ 221{
116 int prio = rt_mutex_getprio(task); 222 int prio = rt_mutex_getprio(task);
117 223
118 if (task->prio != prio) 224 if (task->prio != prio || dl_prio(prio))
119 rt_mutex_setprio(task, prio); 225 rt_mutex_setprio(task, prio);
120} 226}
121 227
@@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
233 * When deadlock detection is off then we check, if further 339 * When deadlock detection is off then we check, if further
234 * priority adjustment is necessary. 340 * priority adjustment is necessary.
235 */ 341 */
236 if (!detect_deadlock && waiter->list_entry.prio == task->prio) 342 if (!detect_deadlock && waiter->prio == task->prio)
237 goto out_unlock_pi; 343 goto out_unlock_pi;
238 344
239 lock = waiter->lock; 345 lock = waiter->lock;
@@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 top_waiter = rt_mutex_top_waiter(lock); 360 top_waiter = rt_mutex_top_waiter(lock);
255 361
256 /* Requeue the waiter */ 362 /* Requeue the waiter */
257 plist_del(&waiter->list_entry, &lock->wait_list); 363 rt_mutex_dequeue(lock, waiter);
258 waiter->list_entry.prio = task->prio; 364 waiter->prio = task->prio;
259 plist_add(&waiter->list_entry, &lock->wait_list); 365 rt_mutex_enqueue(lock, waiter);
260 366
261 /* Release the task */ 367 /* Release the task */
262 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 368 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
280 386
281 if (waiter == rt_mutex_top_waiter(lock)) { 387 if (waiter == rt_mutex_top_waiter(lock)) {
282 /* Boost the owner */ 388 /* Boost the owner */
283 plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); 389 rt_mutex_dequeue_pi(task, top_waiter);
284 waiter->pi_list_entry.prio = waiter->list_entry.prio; 390 rt_mutex_enqueue_pi(task, waiter);
285 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
286 __rt_mutex_adjust_prio(task); 391 __rt_mutex_adjust_prio(task);
287 392
288 } else if (top_waiter == waiter) { 393 } else if (top_waiter == waiter) {
289 /* Deboost the owner */ 394 /* Deboost the owner */
290 plist_del(&waiter->pi_list_entry, &task->pi_waiters); 395 rt_mutex_dequeue_pi(task, waiter);
291 waiter = rt_mutex_top_waiter(lock); 396 waiter = rt_mutex_top_waiter(lock);
292 waiter->pi_list_entry.prio = waiter->list_entry.prio; 397 rt_mutex_enqueue_pi(task, waiter);
293 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
294 __rt_mutex_adjust_prio(task); 398 __rt_mutex_adjust_prio(task);
295 } 399 }
296 400
@@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
355 * 3) it is top waiter 459 * 3) it is top waiter
356 */ 460 */
357 if (rt_mutex_has_waiters(lock)) { 461 if (rt_mutex_has_waiters(lock)) {
358 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { 462 if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
359 if (!waiter || waiter != rt_mutex_top_waiter(lock)) 463 if (!waiter || waiter != rt_mutex_top_waiter(lock))
360 return 0; 464 return 0;
361 } 465 }
@@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
369 473
370 /* remove the queued waiter. */ 474 /* remove the queued waiter. */
371 if (waiter) { 475 if (waiter) {
372 plist_del(&waiter->list_entry, &lock->wait_list); 476 rt_mutex_dequeue(lock, waiter);
373 task->pi_blocked_on = NULL; 477 task->pi_blocked_on = NULL;
374 } 478 }
375 479
@@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
379 */ 483 */
380 if (rt_mutex_has_waiters(lock)) { 484 if (rt_mutex_has_waiters(lock)) {
381 top = rt_mutex_top_waiter(lock); 485 top = rt_mutex_top_waiter(lock);
382 top->pi_list_entry.prio = top->list_entry.prio; 486 rt_mutex_enqueue_pi(task, top);
383 plist_add(&top->pi_list_entry, &task->pi_waiters);
384 } 487 }
385 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 488 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
386 } 489 }
@@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
416 __rt_mutex_adjust_prio(task); 519 __rt_mutex_adjust_prio(task);
417 waiter->task = task; 520 waiter->task = task;
418 waiter->lock = lock; 521 waiter->lock = lock;
419 plist_node_init(&waiter->list_entry, task->prio); 522 waiter->prio = task->prio;
420 plist_node_init(&waiter->pi_list_entry, task->prio);
421 523
422 /* Get the top priority waiter on the lock */ 524 /* Get the top priority waiter on the lock */
423 if (rt_mutex_has_waiters(lock)) 525 if (rt_mutex_has_waiters(lock))
424 top_waiter = rt_mutex_top_waiter(lock); 526 top_waiter = rt_mutex_top_waiter(lock);
425 plist_add(&waiter->list_entry, &lock->wait_list); 527 rt_mutex_enqueue(lock, waiter);
426 528
427 task->pi_blocked_on = waiter; 529 task->pi_blocked_on = waiter;
428 530
@@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
433 535
434 if (waiter == rt_mutex_top_waiter(lock)) { 536 if (waiter == rt_mutex_top_waiter(lock)) {
435 raw_spin_lock_irqsave(&owner->pi_lock, flags); 537 raw_spin_lock_irqsave(&owner->pi_lock, flags);
436 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 538 rt_mutex_dequeue_pi(owner, top_waiter);
437 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 539 rt_mutex_enqueue_pi(owner, waiter);
438 540
439 __rt_mutex_adjust_prio(owner); 541 __rt_mutex_adjust_prio(owner);
440 if (owner->pi_blocked_on) 542 if (owner->pi_blocked_on)
@@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
486 * boosted mode and go back to normal after releasing 588 * boosted mode and go back to normal after releasing
487 * lock->wait_lock. 589 * lock->wait_lock.
488 */ 590 */
489 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 591 rt_mutex_dequeue_pi(current, waiter);
490 592
491 rt_mutex_set_owner(lock, NULL); 593 rt_mutex_set_owner(lock, NULL);
492 594
@@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock,
510 int chain_walk = 0; 612 int chain_walk = 0;
511 613
512 raw_spin_lock_irqsave(&current->pi_lock, flags); 614 raw_spin_lock_irqsave(&current->pi_lock, flags);
513 plist_del(&waiter->list_entry, &lock->wait_list); 615 rt_mutex_dequeue(lock, waiter);
514 current->pi_blocked_on = NULL; 616 current->pi_blocked_on = NULL;
515 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 617 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
516 618
@@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock,
521 623
522 raw_spin_lock_irqsave(&owner->pi_lock, flags); 624 raw_spin_lock_irqsave(&owner->pi_lock, flags);
523 625
524 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 626 rt_mutex_dequeue_pi(owner, waiter);
525 627
526 if (rt_mutex_has_waiters(lock)) { 628 if (rt_mutex_has_waiters(lock)) {
527 struct rt_mutex_waiter *next; 629 struct rt_mutex_waiter *next;
528 630
529 next = rt_mutex_top_waiter(lock); 631 next = rt_mutex_top_waiter(lock);
530 plist_add(&next->pi_list_entry, &owner->pi_waiters); 632 rt_mutex_enqueue_pi(owner, next);
531 } 633 }
532 __rt_mutex_adjust_prio(owner); 634 __rt_mutex_adjust_prio(owner);
533 635
@@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock,
537 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 639 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
538 } 640 }
539 641
540 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
541
542 if (!chain_walk) 642 if (!chain_walk)
543 return; 643 return;
544 644
@@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
565 raw_spin_lock_irqsave(&task->pi_lock, flags); 665 raw_spin_lock_irqsave(&task->pi_lock, flags);
566 666
567 waiter = task->pi_blocked_on; 667 waiter = task->pi_blocked_on;
568 if (!waiter || waiter->list_entry.prio == task->prio) { 668 if (!waiter || (waiter->prio == task->prio &&
669 !dl_prio(task->prio))) {
569 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 670 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
570 return; 671 return;
571 } 672 }
@@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
638 int ret = 0; 739 int ret = 0;
639 740
640 debug_rt_mutex_init_waiter(&waiter); 741 debug_rt_mutex_init_waiter(&waiter);
742 RB_CLEAR_NODE(&waiter.pi_tree_entry);
743 RB_CLEAR_NODE(&waiter.tree_entry);
641 744
642 raw_spin_lock(&lock->wait_lock); 745 raw_spin_lock(&lock->wait_lock);
643 746
@@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
904{ 1007{
905 lock->owner = NULL; 1008 lock->owner = NULL;
906 raw_spin_lock_init(&lock->wait_lock); 1009 raw_spin_lock_init(&lock->wait_lock);
907 plist_head_init(&lock->wait_list); 1010 lock->waiters = RB_ROOT;
1011 lock->waiters_leftmost = NULL;
908 1012
909 debug_rt_mutex_init(lock, name); 1013 debug_rt_mutex_init(lock, name);
910} 1014}
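A quick illustration of how the rt_mutex_waiter_less() comparator added above orders the new waiter rb-tree: in the kernel's internal scale a lower prio value is more important, and (by convention, assumed here rather than shown in this patch) SCHED_DEADLINE tasks carry a negative prio, so any deadline waiter sorts before any RT waiter, and ties between deadline waiters fall back to the earlier absolute deadline. A stand-alone mirror of the comparator with a few sample waiters (sketch, simplified types):

#include <stdio.h>

struct waiter { int prio; unsigned long long deadline; };

/* Mirror of rt_mutex_waiter_less(); "prio < 0" plays the role of dl_prio(). */
static int waiter_less(const struct waiter *l, const struct waiter *r)
{
	if (l->prio < r->prio)
		return 1;
	if (l->prio < 0)		/* left is a deadline waiter, so right is too */
		return l->deadline < r->deadline;
	return 0;
}

int main(void)
{
	struct waiter dl_early = { -1, 1000 };	/* SCHED_DEADLINE, earlier deadline */
	struct waiter dl_late  = { -1, 2000 };	/* SCHED_DEADLINE, later deadline   */
	struct waiter fifo10   = { 10, 0 };	/* SCHED_FIFO, kernel prio 10       */

	printf("%d %d %d\n",
	       waiter_less(&dl_early, &fifo10),		/* 1: deadline beats RT     */
	       waiter_less(&dl_early, &dl_late),	/* 1: earlier deadline wins */
	       waiter_less(&dl_late,  &dl_early));	/* 0                        */
	return 0;
}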
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 53a66c85261b..7431a9c86f35 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock);
40 * This is the control structure for tasks blocked on a rt_mutex, 40 * This is the control structure for tasks blocked on a rt_mutex,
41 * which is allocated on the kernel stack on of the blocked task. 41 * which is allocated on the kernel stack on of the blocked task.
42 * 42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list 43 * @tree_entry: pi node to enqueue into the mutex waiters tree
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list 44 * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
45 * @task: task reference to the blocked task 45 * @task: task reference to the blocked task
46 */ 46 */
47struct rt_mutex_waiter { 47struct rt_mutex_waiter {
48 struct plist_node list_entry; 48 struct rb_node tree_entry;
49 struct plist_node pi_list_entry; 49 struct rb_node pi_tree_entry;
50 struct task_struct *task; 50 struct task_struct *task;
51 struct rt_mutex *lock; 51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES 52#ifdef CONFIG_DEBUG_RT_MUTEXES
@@ -54,14 +54,15 @@ struct rt_mutex_waiter {
54 struct pid *deadlock_task_pid; 54 struct pid *deadlock_task_pid;
55 struct rt_mutex *deadlock_lock; 55 struct rt_mutex *deadlock_lock;
56#endif 56#endif
57 int prio;
57}; 58};
58 59
59/* 60/*
60 * Various helpers to access the waiters-plist: 61 * Various helpers to access the waiters-tree:
61 */ 62 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock) 63static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{ 64{
64 return !plist_head_empty(&lock->wait_list); 65 return !RB_EMPTY_ROOT(&lock->waiters);
65} 66}
66 67
67static inline struct rt_mutex_waiter * 68static inline struct rt_mutex_waiter *
@@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
69{ 70{
70 struct rt_mutex_waiter *w; 71 struct rt_mutex_waiter *w;
71 72
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, 73 w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
73 list_entry); 74 tree_entry);
74 BUG_ON(w->lock != lock); 75 BUG_ON(w->lock != lock);
75 76
76 return w; 77 return w;
@@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
78 79
79static inline int task_has_pi_waiters(struct task_struct *p) 80static inline int task_has_pi_waiters(struct task_struct *p)
80{ 81{
81 return !plist_head_empty(&p->pi_waiters); 82 return !RB_EMPTY_ROOT(&p->pi_waiters);
82} 83}
83 84
84static inline struct rt_mutex_waiter * 85static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p) 86task_top_pi_waiter(struct task_struct *p)
86{ 87{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, 88 return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
88 pi_list_entry); 89 pi_tree_entry);
89} 90}
90 91
91/* 92/*
diff --git a/kernel/module.c b/kernel/module.c
index f5a3b1e8ec51..d24fcf29cb64 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -815,10 +815,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
815 return -EFAULT; 815 return -EFAULT;
816 name[MODULE_NAME_LEN-1] = '\0'; 816 name[MODULE_NAME_LEN-1] = '\0';
817 817
818 if (!(flags & O_NONBLOCK)) { 818 if (!(flags & O_NONBLOCK))
819 printk(KERN_WARNING 819 pr_warn("waiting module removal not supported: please upgrade\n");
820 "waiting module removal not supported: please upgrade");
821 }
822 820
823 if (mutex_lock_interruptible(&module_mutex) != 0) 821 if (mutex_lock_interruptible(&module_mutex) != 0)
824 return -EINTR; 822 return -EINTR;
diff --git a/kernel/padata.c b/kernel/padata.c
index 07af2c95dcfe..161402f0b517 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
46 46
47static int padata_cpu_hash(struct parallel_data *pd) 47static int padata_cpu_hash(struct parallel_data *pd)
48{ 48{
49 unsigned int seq_nr;
49 int cpu_index; 50 int cpu_index;
50 51
51 /* 52 /*
@@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd)
53 * seq_nr mod. number of cpus in use. 54 * seq_nr mod. number of cpus in use.
54 */ 55 */
55 56
56 spin_lock(&pd->seq_lock); 57 seq_nr = atomic_inc_return(&pd->seq_nr);
57 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); 58 cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
58 pd->seq_nr++;
59 spin_unlock(&pd->seq_lock);
60 59
61 return padata_index_to_cpu(pd, cpu_index); 60 return padata_index_to_cpu(pd, cpu_index);
62} 61}
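The rewrite above drops pd->seq_lock entirely: atomic_inc_return() hands each submitter a unique sequence number, and the modulo over the number of active pcpu CPUs spreads jobs round-robin without serializing callers (seq_nr starts at -1, so the first job maps to index 0). The same pattern in plain C11, with illustrative names (a sketch, not the padata code):

#include <limits.h>
#include <stdatomic.h>

static atomic_uint seq_nr = ATOMIC_VAR_INIT(UINT_MAX);	/* i.e. -1 */

static unsigned int pick_cpu_index(unsigned int nr_active_cpus)
{
	/* fetch_add() + 1 is the C11 spelling of atomic_inc_return();
	 * unsigned wraparound is harmless, only the remainder matters. */
	unsigned int seq = atomic_fetch_add(&seq_nr, 1) + 1;

	return seq % nr_active_cpus;
}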
@@ -113,7 +112,7 @@ int padata_do_parallel(struct padata_instance *pinst,
113 112
114 rcu_read_lock_bh(); 113 rcu_read_lock_bh();
115 114
116 pd = rcu_dereference(pinst->pd); 115 pd = rcu_dereference_bh(pinst->pd);
117 116
118 err = -EINVAL; 117 err = -EINVAL;
119 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) 118 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
@@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
429 padata_init_pqueues(pd); 428 padata_init_pqueues(pd);
430 padata_init_squeues(pd); 429 padata_init_squeues(pd);
431 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 430 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
432 pd->seq_nr = 0; 431 atomic_set(&pd->seq_nr, -1);
433 atomic_set(&pd->reorder_objects, 0); 432 atomic_set(&pd->reorder_objects, 0);
434 atomic_set(&pd->refcnt, 0); 433 atomic_set(&pd->refcnt, 0);
435 pd->pinst = pinst; 434 pd->pinst = pinst;
diff --git a/kernel/panic.c b/kernel/panic.c
index c00b4ceb39e8..6d6300375090 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,7 +33,7 @@ static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 37EXPORT_SYMBOL_GPL(panic_timeout);
38 38
39ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
diff --git a/kernel/params.c b/kernel/params.c
index c00d5b502aa4..b00142e7f3ba 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -227,17 +227,10 @@ int parse_args(const char *doing,
227} 227}
228 228
229/* Lazy bastard, eh? */ 229/* Lazy bastard, eh? */
230#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 230#define STANDARD_PARAM_DEF(name, type, format, strtolfn) \
231 int param_set_##name(const char *val, const struct kernel_param *kp) \ 231 int param_set_##name(const char *val, const struct kernel_param *kp) \
232 { \ 232 { \
233 tmptype l; \ 233 return strtolfn(val, 0, (type *)kp->arg); \
234 int ret; \
235 \
236 ret = strtolfn(val, 0, &l); \
237 if (ret < 0 || ((type)l != l)) \
238 return ret < 0 ? ret : -EINVAL; \
239 *((type *)kp->arg) = l; \
240 return 0; \
241 } \ 234 } \
242 int param_get_##name(char *buffer, const struct kernel_param *kp) \ 235 int param_get_##name(char *buffer, const struct kernel_param *kp) \
243 { \ 236 { \
@@ -253,13 +246,13 @@ int parse_args(const char *doing,
253 EXPORT_SYMBOL(param_ops_##name) 246 EXPORT_SYMBOL(param_ops_##name)
254 247
255 248
256STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); 249STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
257STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); 250STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
258STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); 251STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
259STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); 252STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
260STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); 253STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
261STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); 254STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
262STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); 255STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
263 256
264int param_set_charp(const char *val, const struct kernel_param *kp) 257int param_set_charp(const char *val, const struct kernel_param *kp)
265{ 258{
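With the tmptype/strtolfn dance gone, STANDARD_PARAM_DEF leans on the width-specific kstrto*() helpers, which already reject malformed and out-of-range input, so the old "(type)l != l" overflow check disappears from the macro. Hand-expanding the "int" entry from the table above gives roughly the following (set side only):

int param_set_int(const char *val, const struct kernel_param *kp)
{
	/* kstrtoint() returns 0 on success or -EINVAL/-ERANGE, so range
	 * checking no longer needs to be open-coded here. */
	return kstrtoint(val, 0, (int *)kp->arg);
}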
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c7f31aa272f7..3b8946416a5f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -233,7 +233,8 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
233 233
234/* 234/*
235 * Sample a process (thread group) clock for the given group_leader task. 235 * Sample a process (thread group) clock for the given group_leader task.
236 * Must be called with tasklist_lock held for reading. 236 * Must be called with task sighand lock held for safe while_each_thread()
237 * traversal.
237 */ 238 */
238static int cpu_clock_sample_group(const clockid_t which_clock, 239static int cpu_clock_sample_group(const clockid_t which_clock,
239 struct task_struct *p, 240 struct task_struct *p,
@@ -260,30 +261,53 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
260 return 0; 261 return 0;
261} 262}
262 263
264static int posix_cpu_clock_get_task(struct task_struct *tsk,
265 const clockid_t which_clock,
266 struct timespec *tp)
267{
268 int err = -EINVAL;
269 unsigned long long rtn;
270
271 if (CPUCLOCK_PERTHREAD(which_clock)) {
272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 }
292
293 if (!err)
294 sample_to_timespec(which_clock, rtn, tp);
295
296 return err;
297}
298
263 299
264static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 300static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
265{ 301{
266 const pid_t pid = CPUCLOCK_PID(which_clock); 302 const pid_t pid = CPUCLOCK_PID(which_clock);
267 int error = -EINVAL; 303 int err = -EINVAL;
268 unsigned long long rtn;
269 304
270 if (pid == 0) { 305 if (pid == 0) {
271 /* 306 /*
272 * Special case constant value for our own clocks. 307 * Special case constant value for our own clocks.
273 * We don't have to do any lookup to find ourselves. 308 * We don't have to do any lookup to find ourselves.
274 */ 309 */
275 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 err = posix_cpu_clock_get_task(current, which_clock, tp);
276 /*
277 * Sampling just ourselves we can do with no locking.
278 */
279 error = cpu_clock_sample(which_clock,
280 current, &rtn);
281 } else {
282 read_lock(&tasklist_lock);
283 error = cpu_clock_sample_group(which_clock,
284 current, &rtn);
285 read_unlock(&tasklist_lock);
286 }
287 } else { 311 } else {
288 /* 312 /*
289 * Find the given PID, and validate that the caller 313 * Find the given PID, and validate that the caller
@@ -292,29 +316,12 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
292 struct task_struct *p; 316 struct task_struct *p;
293 rcu_read_lock(); 317 rcu_read_lock();
294 p = find_task_by_vpid(pid); 318 p = find_task_by_vpid(pid);
295 if (p) { 319 if (p)
296 if (CPUCLOCK_PERTHREAD(which_clock)) { 320 err = posix_cpu_clock_get_task(p, which_clock, tp);
297 if (same_thread_group(p, current)) {
298 error = cpu_clock_sample(which_clock,
299 p, &rtn);
300 }
301 } else {
302 read_lock(&tasklist_lock);
303 if (thread_group_leader(p) && p->sighand) {
304 error =
305 cpu_clock_sample_group(which_clock,
306 p, &rtn);
307 }
308 read_unlock(&tasklist_lock);
309 }
310 }
311 rcu_read_unlock(); 321 rcu_read_unlock();
312 } 322 }
313 323
314 if (error) 324 return err;
315 return error;
316 sample_to_timespec(which_clock, rtn, tp);
317 return 0;
318} 325}
319 326
320 327
@@ -371,36 +378,40 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
371 */ 378 */
372static int posix_cpu_timer_del(struct k_itimer *timer) 379static int posix_cpu_timer_del(struct k_itimer *timer)
373{ 380{
374 struct task_struct *p = timer->it.cpu.task;
375 int ret = 0; 381 int ret = 0;
382 unsigned long flags;
383 struct sighand_struct *sighand;
384 struct task_struct *p = timer->it.cpu.task;
376 385
377 if (likely(p != NULL)) { 386 WARN_ON_ONCE(p == NULL);
378 read_lock(&tasklist_lock);
379 if (unlikely(p->sighand == NULL)) {
380 /*
381 * We raced with the reaping of the task.
382 * The deletion should have cleared us off the list.
383 */
384 BUG_ON(!list_empty(&timer->it.cpu.entry));
385 } else {
386 spin_lock(&p->sighand->siglock);
387 if (timer->it.cpu.firing)
388 ret = TIMER_RETRY;
389 else
390 list_del(&timer->it.cpu.entry);
391 spin_unlock(&p->sighand->siglock);
392 }
393 read_unlock(&tasklist_lock);
394 387
395 if (!ret) 388 /*
396 put_task_struct(p); 389 * Protect against sighand release/switch in exit/exec and process/
390 * thread timer list entry concurrent read/writes.
391 */
392 sighand = lock_task_sighand(p, &flags);
393 if (unlikely(sighand == NULL)) {
394 /*
395 * We raced with the reaping of the task.
396 * The deletion should have cleared us off the list.
397 */
398 WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
399 } else {
400 if (timer->it.cpu.firing)
401 ret = TIMER_RETRY;
402 else
403 list_del(&timer->it.cpu.entry);
404
405 unlock_task_sighand(p, &flags);
397 } 406 }
398 407
408 if (!ret)
409 put_task_struct(p);
410
399 return ret; 411 return ret;
400} 412}
401 413
402static void cleanup_timers_list(struct list_head *head, 414static void cleanup_timers_list(struct list_head *head)
403 unsigned long long curr)
404{ 415{
405 struct cpu_timer_list *timer, *next; 416 struct cpu_timer_list *timer, *next;
406 417
@@ -414,16 +425,11 @@ static void cleanup_timers_list(struct list_head *head,
414 * time for later timer_gettime calls to return. 425 * time for later timer_gettime calls to return.
415 * This must be called with the siglock held. 426 * This must be called with the siglock held.
416 */ 427 */
417static void cleanup_timers(struct list_head *head, 428static void cleanup_timers(struct list_head *head)
418 cputime_t utime, cputime_t stime,
419 unsigned long long sum_exec_runtime)
420{ 429{
421 430 cleanup_timers_list(head);
422 cputime_t ptime = utime + stime; 431 cleanup_timers_list(++head);
423 432 cleanup_timers_list(++head);
424 cleanup_timers_list(head, cputime_to_expires(ptime));
425 cleanup_timers_list(++head, cputime_to_expires(utime));
426 cleanup_timers_list(++head, sum_exec_runtime);
427} 433}
428 434
429/* 435/*
@@ -433,41 +439,14 @@ static void cleanup_timers(struct list_head *head,
433 */ 439 */
434void posix_cpu_timers_exit(struct task_struct *tsk) 440void posix_cpu_timers_exit(struct task_struct *tsk)
435{ 441{
436 cputime_t utime, stime;
437
438 add_device_randomness((const void*) &tsk->se.sum_exec_runtime, 442 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
439 sizeof(unsigned long long)); 443 sizeof(unsigned long long));
440 task_cputime(tsk, &utime, &stime); 444 cleanup_timers(tsk->cpu_timers);
441 cleanup_timers(tsk->cpu_timers,
442 utime, stime, tsk->se.sum_exec_runtime);
443 445
444} 446}
445void posix_cpu_timers_exit_group(struct task_struct *tsk) 447void posix_cpu_timers_exit_group(struct task_struct *tsk)
446{ 448{
447 struct signal_struct *const sig = tsk->signal; 449 cleanup_timers(tsk->signal->cpu_timers);
448 cputime_t utime, stime;
449
450 task_cputime(tsk, &utime, &stime);
451 cleanup_timers(tsk->signal->cpu_timers,
452 utime + sig->utime, stime + sig->stime,
453 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
454}
455
456static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
457{
458 struct cpu_timer_list *timer = &itimer->it.cpu;
459
460 /*
461 * That's all for this thread or process.
462 * We leave our residual in expires to be reported.
463 */
464 put_task_struct(timer->task);
465 timer->task = NULL;
466 if (timer->expires < now) {
467 timer->expires = 0;
468 } else {
469 timer->expires -= now;
470 }
471} 450}
472 451
473static inline int expires_gt(cputime_t expires, cputime_t new_exp) 452static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -477,8 +456,7 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
477 456
478/* 457/*
479 * Insert the timer on the appropriate list before any timers that 458 * Insert the timer on the appropriate list before any timers that
480 * expire later. This must be called with the tasklist_lock held 459 * expire later. This must be called with the sighand lock held.
481 * for reading, interrupts disabled and p->sighand->siglock taken.
482 */ 460 */
483static void arm_timer(struct k_itimer *timer) 461static void arm_timer(struct k_itimer *timer)
484{ 462{
@@ -569,7 +547,8 @@ static void cpu_timer_fire(struct k_itimer *timer)
569 547
570/* 548/*
571 * Sample a process (thread group) timer for the given group_leader task. 549 * Sample a process (thread group) timer for the given group_leader task.
572 * Must be called with tasklist_lock held for reading. 550 * Must be called with task sighand lock held for safe while_each_thread()
551 * traversal.
573 */ 552 */
574static int cpu_timer_sample_group(const clockid_t which_clock, 553static int cpu_timer_sample_group(const clockid_t which_clock,
575 struct task_struct *p, 554 struct task_struct *p,
@@ -608,7 +587,8 @@ static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
608 */ 587 */
609static void posix_cpu_timer_kick_nohz(void) 588static void posix_cpu_timer_kick_nohz(void)
610{ 589{
611 schedule_work(&nohz_kick_work); 590 if (context_tracking_is_enabled())
591 schedule_work(&nohz_kick_work);
612} 592}
613 593
614bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) 594bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
@@ -631,43 +611,39 @@ static inline void posix_cpu_timer_kick_nohz(void) { }
631 * If we return TIMER_RETRY, it's necessary to release the timer's lock 611 * If we return TIMER_RETRY, it's necessary to release the timer's lock
632 * and try again. (This happens when the timer is in the middle of firing.) 612 * and try again. (This happens when the timer is in the middle of firing.)
633 */ 613 */
634static int posix_cpu_timer_set(struct k_itimer *timer, int flags, 614static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
635 struct itimerspec *new, struct itimerspec *old) 615 struct itimerspec *new, struct itimerspec *old)
636{ 616{
617 unsigned long flags;
618 struct sighand_struct *sighand;
637 struct task_struct *p = timer->it.cpu.task; 619 struct task_struct *p = timer->it.cpu.task;
638 unsigned long long old_expires, new_expires, old_incr, val; 620 unsigned long long old_expires, new_expires, old_incr, val;
639 int ret; 621 int ret;
640 622
641 if (unlikely(p == NULL)) { 623 WARN_ON_ONCE(p == NULL);
642 /*
643 * Timer refers to a dead task's clock.
644 */
645 return -ESRCH;
646 }
647 624
648 new_expires = timespec_to_sample(timer->it_clock, &new->it_value); 625 new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
649 626
650 read_lock(&tasklist_lock);
651 /* 627 /*
652 * We need the tasklist_lock to protect against reaping that 628 * Protect against sighand release/switch in exit/exec and p->cpu_timers
653 * clears p->sighand. If p has just been reaped, we can no 629 * and p->signal->cpu_timers read/write in arm_timer()
630 */
631 sighand = lock_task_sighand(p, &flags);
632 /*
633 * If p has just been reaped, we can no
654 * longer get any information about it at all. 634 * longer get any information about it at all.
655 */ 635 */
656 if (unlikely(p->sighand == NULL)) { 636 if (unlikely(sighand == NULL)) {
657 read_unlock(&tasklist_lock);
658 put_task_struct(p);
659 timer->it.cpu.task = NULL;
660 return -ESRCH; 637 return -ESRCH;
661 } 638 }
662 639
663 /* 640 /*
664 * Disarm any old timer after extracting its expiry time. 641 * Disarm any old timer after extracting its expiry time.
665 */ 642 */
666 BUG_ON(!irqs_disabled()); 643 WARN_ON_ONCE(!irqs_disabled());
667 644
668 ret = 0; 645 ret = 0;
669 old_incr = timer->it.cpu.incr; 646 old_incr = timer->it.cpu.incr;
670 spin_lock(&p->sighand->siglock);
671 old_expires = timer->it.cpu.expires; 647 old_expires = timer->it.cpu.expires;
672 if (unlikely(timer->it.cpu.firing)) { 648 if (unlikely(timer->it.cpu.firing)) {
673 timer->it.cpu.firing = -1; 649 timer->it.cpu.firing = -1;
@@ -724,12 +700,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
724 * disable this firing since we are already reporting 700 * disable this firing since we are already reporting
725 * it as an overrun (thanks to bump_cpu_timer above). 701 * it as an overrun (thanks to bump_cpu_timer above).
726 */ 702 */
727 spin_unlock(&p->sighand->siglock); 703 unlock_task_sighand(p, &flags);
728 read_unlock(&tasklist_lock);
729 goto out; 704 goto out;
730 } 705 }
731 706
732 if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { 707 if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
733 new_expires += val; 708 new_expires += val;
734 } 709 }
735 710
@@ -743,9 +718,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
743 arm_timer(timer); 718 arm_timer(timer);
744 } 719 }
745 720
746 spin_unlock(&p->sighand->siglock); 721 unlock_task_sighand(p, &flags);
747 read_unlock(&tasklist_lock);
748
749 /* 722 /*
750 * Install the new reload setting, and 723 * Install the new reload setting, and
751 * set up the signal and overrun bookkeeping. 724 * set up the signal and overrun bookkeeping.
@@ -787,7 +760,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
787{ 760{
788 unsigned long long now; 761 unsigned long long now;
789 struct task_struct *p = timer->it.cpu.task; 762 struct task_struct *p = timer->it.cpu.task;
790 int clear_dead; 763
764 WARN_ON_ONCE(p == NULL);
791 765
792 /* 766 /*
793 * Easy part: convert the reload time. 767 * Easy part: convert the reload time.
@@ -800,52 +774,34 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
800 return; 774 return;
801 } 775 }
802 776
803 if (unlikely(p == NULL)) {
804 /*
805 * This task already died and the timer will never fire.
806 * In this case, expires is actually the dead value.
807 */
808 dead:
809 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
810 &itp->it_value);
811 return;
812 }
813
814 /* 777 /*
815 * Sample the clock to take the difference with the expiry time. 778 * Sample the clock to take the difference with the expiry time.
816 */ 779 */
817 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 780 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
818 cpu_clock_sample(timer->it_clock, p, &now); 781 cpu_clock_sample(timer->it_clock, p, &now);
819 clear_dead = p->exit_state;
820 } else { 782 } else {
821 read_lock(&tasklist_lock); 783 struct sighand_struct *sighand;
822 if (unlikely(p->sighand == NULL)) { 784 unsigned long flags;
785
786 /*
787 * Protect against sighand release/switch in exit/exec and
788 * also make timer sampling safe if it ends up calling
789 * thread_group_cputime().
790 */
791 sighand = lock_task_sighand(p, &flags);
792 if (unlikely(sighand == NULL)) {
823 /* 793 /*
824 * The process has been reaped. 794 * The process has been reaped.
825 * We can't even collect a sample any more. 795 * We can't even collect a sample any more.
826 * Call the timer disarmed, nothing else to do. 796 * Call the timer disarmed, nothing else to do.
827 */ 797 */
828 put_task_struct(p);
829 timer->it.cpu.task = NULL;
830 timer->it.cpu.expires = 0; 798 timer->it.cpu.expires = 0;
831 read_unlock(&tasklist_lock); 799 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
832 goto dead; 800 &itp->it_value);
833 } else { 801 } else {
834 cpu_timer_sample_group(timer->it_clock, p, &now); 802 cpu_timer_sample_group(timer->it_clock, p, &now);
835 clear_dead = (unlikely(p->exit_state) && 803 unlock_task_sighand(p, &flags);
836 thread_group_empty(p));
837 } 804 }
838 read_unlock(&tasklist_lock);
839 }
840
841 if (unlikely(clear_dead)) {
842 /*
843 * We've noticed that the thread is dead, but
844 * not yet reaped. Take this opportunity to
845 * drop our task ref.
846 */
847 clear_dead_task(timer, now);
848 goto dead;
849 } 805 }
850 806
851 if (now < timer->it.cpu.expires) { 807 if (now < timer->it.cpu.expires) {
@@ -1059,14 +1015,12 @@ static void check_process_timers(struct task_struct *tsk,
1059 */ 1015 */
1060void posix_cpu_timer_schedule(struct k_itimer *timer) 1016void posix_cpu_timer_schedule(struct k_itimer *timer)
1061{ 1017{
1018 struct sighand_struct *sighand;
1019 unsigned long flags;
1062 struct task_struct *p = timer->it.cpu.task; 1020 struct task_struct *p = timer->it.cpu.task;
1063 unsigned long long now; 1021 unsigned long long now;
1064 1022
1065 if (unlikely(p == NULL)) 1023 WARN_ON_ONCE(p == NULL);
1066 /*
1067 * The task was cleaned up already, no future firings.
1068 */
1069 goto out;
1070 1024
1071 /* 1025 /*
1072 * Fetch the current sample and update the timer's expiry time. 1026 * Fetch the current sample and update the timer's expiry time.
@@ -1074,49 +1028,45 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1074 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 1028 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
1075 cpu_clock_sample(timer->it_clock, p, &now); 1029 cpu_clock_sample(timer->it_clock, p, &now);
1076 bump_cpu_timer(timer, now); 1030 bump_cpu_timer(timer, now);
1077 if (unlikely(p->exit_state)) { 1031 if (unlikely(p->exit_state))
1078 clear_dead_task(timer, now); 1032 goto out;
1033
1034 /* Protect timer list r/w in arm_timer() */
1035 sighand = lock_task_sighand(p, &flags);
1036 if (!sighand)
1079 goto out; 1037 goto out;
1080 }
1081 read_lock(&tasklist_lock); /* arm_timer needs it. */
1082 spin_lock(&p->sighand->siglock);
1083 } else { 1038 } else {
1084 read_lock(&tasklist_lock); 1039 /*
1085 if (unlikely(p->sighand == NULL)) { 1040 * Protect arm_timer() and timer sampling in case of call to
1041 * thread_group_cputime().
1042 */
1043 sighand = lock_task_sighand(p, &flags);
1044 if (unlikely(sighand == NULL)) {
1086 /* 1045 /*
1087 * The process has been reaped. 1046 * The process has been reaped.
1088 * We can't even collect a sample any more. 1047 * We can't even collect a sample any more.
1089 */ 1048 */
1090 put_task_struct(p);
1091 timer->it.cpu.task = p = NULL;
1092 timer->it.cpu.expires = 0; 1049 timer->it.cpu.expires = 0;
1093 goto out_unlock; 1050 goto out;
1094 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1051 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1095 /* 1052 unlock_task_sighand(p, &flags);
1096 * We've noticed that the thread is dead, but 1053 /* Optimizations: if the process is dying, no need to rearm */
1097 * not yet reaped. Take this opportunity to 1054 goto out;
1098 * drop our task ref.
1099 */
1100 cpu_timer_sample_group(timer->it_clock, p, &now);
1101 clear_dead_task(timer, now);
1102 goto out_unlock;
1103 } 1055 }
1104 spin_lock(&p->sighand->siglock);
1105 cpu_timer_sample_group(timer->it_clock, p, &now); 1056 cpu_timer_sample_group(timer->it_clock, p, &now);
1106 bump_cpu_timer(timer, now); 1057 bump_cpu_timer(timer, now);
1107 /* Leave the tasklist_lock locked for the call below. */ 1058 /* Leave the sighand locked for the call below. */
1108 } 1059 }
1109 1060
1110 /* 1061 /*
1111 * Now re-arm for the new expiry time. 1062 * Now re-arm for the new expiry time.
1112 */ 1063 */
1113 BUG_ON(!irqs_disabled()); 1064 WARN_ON_ONCE(!irqs_disabled());
1114 arm_timer(timer); 1065 arm_timer(timer);
1115 spin_unlock(&p->sighand->siglock); 1066 unlock_task_sighand(p, &flags);
1116
1117out_unlock:
1118 read_unlock(&tasklist_lock);
1119 1067
1068 /* Kick full dynticks CPUs in case they need to tick on the new timer */
1069 posix_cpu_timer_kick_nohz();
1120out: 1070out:
1121 timer->it_overrun_last = timer->it_overrun; 1071 timer->it_overrun_last = timer->it_overrun;
1122 timer->it_overrun = -1; 1072 timer->it_overrun = -1;
@@ -1200,7 +1150,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1200 struct k_itimer *timer, *next; 1150 struct k_itimer *timer, *next;
1201 unsigned long flags; 1151 unsigned long flags;
1202 1152
1203 BUG_ON(!irqs_disabled()); 1153 WARN_ON_ONCE(!irqs_disabled());
1204 1154
1205 /* 1155 /*
1206 * The fast path checks that there are no expired thread or thread 1156 * The fast path checks that there are no expired thread or thread
@@ -1256,13 +1206,6 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1256 cpu_timer_fire(timer); 1206 cpu_timer_fire(timer);
1257 spin_unlock(&timer->it_lock); 1207 spin_unlock(&timer->it_lock);
1258 } 1208 }
1259
1260 /*
1261 * In case some timers were rescheduled after the queue got emptied,
1262 * wake up full dynticks CPUs.
1263 */
1264 if (tsk->signal->cputimer.running)
1265 posix_cpu_timer_kick_nohz();
1266} 1209}
1267 1210
1268/* 1211/*
@@ -1274,7 +1217,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1274{ 1217{
1275 unsigned long long now; 1218 unsigned long long now;
1276 1219
1277 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1220 WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
1278 cpu_timer_sample_group(clock_idx, tsk, &now); 1221 cpu_timer_sample_group(clock_idx, tsk, &now);
1279 1222
1280 if (oldval) { 1223 if (oldval) {
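For reference, the code patched above backs the userspace CLOCK_PROCESS_CPUTIME_ID / CLOCK_THREAD_CPUTIME_ID timers. A small standalone demo, plain POSIX and not taken from this patch (build with cc demo.c -lrt), arming a one-shot process CPU-time timer:

#define _POSIX_C_SOURCE 199309L
#include <signal.h>
#include <stdio.h>
#include <time.h>

static volatile sig_atomic_t fired;

static void on_timer(int sig)
{
	(void)sig;
	fired = 1;
}

int main(void)
{
	struct sigevent sev = { 0 };
	struct itimerspec its = { 0 };
	timer_t tid;

	signal(SIGRTMIN, on_timer);
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGRTMIN;

	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid) != 0) {
		perror("timer_create");
		return 1;
	}

	/* One-shot: fire after this process has consumed 100ms of CPU time. */
	its.it_value.tv_nsec = 100 * 1000 * 1000;
	if (timer_settime(tid, 0, &its, NULL) != 0) {
		perror("timer_settime");
		return 1;
	}

	while (!fired)
		;	/* burn CPU so the process CPU clock advances */

	puts("process CPU timer fired");
	timer_delete(tid);
	return 0;
}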
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 463aa6736751..eacb8bd8cab4 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev)
81 list_for_each_entry(tmp, &pm_vt_switch_list, head) { 81 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
82 if (tmp->dev == dev) { 82 if (tmp->dev == dev) {
83 list_del(&tmp->head); 83 list_del(&tmp->head);
84 kfree(tmp);
84 break; 85 break;
85 } 86 }
86 } 87 }
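The one-line addition above plugs a leak: entries on pm_vt_switch_list are allocated when a driver registers, so unregister must kfree() the entry it unlinks. A minimal sketch of the same remove-and-free pattern (names illustrative, locking omitted); a plain list_for_each_entry() is enough here because iteration stops right after the single deletion:

#include <linux/device.h>
#include <linux/list.h>
#include <linux/slab.h>

struct switch_entry {
	struct list_head head;
	struct device *dev;
};

static void unregister_entry(struct list_head *entries, struct device *dev)
{
	struct switch_entry *tmp;

	list_for_each_entry(tmp, entries, head) {
		if (tmp->dev == dev) {
			list_del(&tmp->head);
			kfree(tmp);	/* without this, the entry leaks */
			break;
		}
	}
}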
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b38109e204af..d9f61a145802 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
637 BUG_ON(!region); 637 BUG_ON(!region);
638 } else 638 } else
639 /* This allocation cannot fail */ 639 /* This allocation cannot fail */
640 region = alloc_bootmem(sizeof(struct nosave_region)); 640 region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
641 region->start_pfn = start_pfn; 641 region->start_pfn = start_pfn;
642 region->end_pfn = end_pfn; 642 region->end_pfn = end_pfn;
643 list_add_tail(&region->list, &nosave_regions); 643 list_add_tail(&region->list, &nosave_regions);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index be7c86bae576..f8b41bddc6dc 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -757,14 +757,10 @@ void __init setup_log_buf(int early)
757 return; 757 return;
758 758
759 if (early) { 759 if (early) {
760 unsigned long mem; 760 new_log_buf =
761 761 memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
762 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
763 if (!mem)
764 return;
765 new_log_buf = __va(mem);
766 } else { 762 } else {
767 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); 763 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
768 } 764 }
769 765
770 if (unlikely(!new_log_buf)) { 766 if (unlikely(!new_log_buf)) {
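Both boot-time allocations above (the nosave region in snapshot.c and the early log buffer here) converge on the memblock_virt_alloc*() helpers instead of the older bootmem/memblock mix. A hedged sketch of the two flavors, assuming the semantics implied by the hunks: the plain variant never returns NULL (it panics on failure), while the _nopanic variant can fail and must be checked by the caller:

#include <linux/init.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/mm.h>

/* Illustrative helper, not part of the patch. */
static void * __init early_buffer(phys_addr_t len, bool may_fail)
{
	if (!may_fail)
		return memblock_virt_alloc(len, PAGE_SIZE);	/* never NULL */

	return memblock_virt_alloc_nopanic(len, 0);		/* may be NULL */
}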
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7859a0a3951e..79c3877e9c5b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -96,19 +96,22 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
96} 96}
97#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 97#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
98 98
99extern void kfree(const void *); 99void kfree(const void *);
100 100
101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) 101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
102{ 102{
103 unsigned long offset = (unsigned long)head->func; 103 unsigned long offset = (unsigned long)head->func;
104 104
105 rcu_lock_acquire(&rcu_callback_map);
105 if (__is_kfree_rcu_offset(offset)) { 106 if (__is_kfree_rcu_offset(offset)) {
106 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 107 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
107 kfree((void *)head - offset); 108 kfree((void *)head - offset);
109 rcu_lock_release(&rcu_callback_map);
108 return 1; 110 return 1;
109 } else { 111 } else {
110 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 112 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
111 head->func(head); 113 head->func(head);
114 rcu_lock_release(&rcu_callback_map);
112 return 0; 115 return 0;
113 } 116 }
114} 117}
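For context, the offset test in __rcu_reclaim() works because kfree_rcu() stores the offset of the rcu_head within its container in place of a real callback pointer, so small "addresses" are known to be offsets rather than functions. A standalone userspace toy of the same encoding (all names hypothetical, threshold arbitrary):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_head {
	void (*func)(struct fake_head *);
};

struct widget {
	int payload;
	struct fake_head rcu;
};

/* Offsets this small can never be valid function addresses. */
#define IS_KFREE_OFFSET(off) ((off) < 4096)

static void reclaim(struct fake_head *head)
{
	unsigned long offset = (unsigned long)head->func;

	if (IS_KFREE_OFFSET(offset))
		free((char *)head - offset);	/* recover and free the container */
	else
		head->func(head);		/* ordinary callback */
}

int main(void)
{
	struct widget *w = malloc(sizeof(*w));

	/* Encode "kfree me" as the member offset instead of a function. */
	w->rcu.func = (void (*)(struct fake_head *))offsetof(struct widget, rcu);
	reclaim(&w->rcu);
	puts("container freed via encoded offset");
	return 0;
}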
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..3318d8284384 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -363,6 +363,29 @@ static void srcu_flip(struct srcu_struct *sp)
363/* 363/*
364 * Enqueue an SRCU callback on the specified srcu_struct structure, 364 * Enqueue an SRCU callback on the specified srcu_struct structure,
365 * initiating grace-period processing if it is not already running. 365 * initiating grace-period processing if it is not already running.
366 *
367 * Note that all CPUs must agree that the grace period extended beyond
368 * all pre-existing SRCU read-side critical section. On systems with
369 * more than one CPU, this means that when "func()" is invoked, each CPU
370 * is guaranteed to have executed a full memory barrier since the end of
371 * its last corresponding SRCU read-side critical section whose beginning
372 * preceded the call to call_rcu(). It also means that each CPU executing
373 * an SRCU read-side critical section that continues beyond the start of
374 * "func()" must have executed a memory barrier after the call_rcu()
375 * but before the beginning of that SRCU read-side critical section.
376 * Note that these guarantees include CPUs that are offline, idle, or
377 * executing in user mode, as well as CPUs that are executing in the kernel.
378 *
379 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
380 * resulting SRCU callback function "func()", then both CPU A and CPU
381 * B are guaranteed to execute a full memory barrier during the time
382 * interval between the call to call_rcu() and the invocation of "func()".
383 * This guarantee applies even if CPU A and CPU B are the same CPU (but
384 * again only if the system has more than one CPU).
385 *
386 * Of course, these guarantees apply only for invocations of call_srcu(),
387 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
388 * srcu_struct structure.
366 */ 389 */
367void call_srcu(struct srcu_struct *sp, struct rcu_head *head, 390void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
368 void (*func)(struct rcu_head *head)) 391 void (*func)(struct rcu_head *head))
@@ -459,7 +482,30 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
459 * Note that it is illegal to call synchronize_srcu() from the corresponding 482 * Note that it is illegal to call synchronize_srcu() from the corresponding
460 * SRCU read-side critical section; doing so will result in deadlock. 483 * SRCU read-side critical section; doing so will result in deadlock.
461 * However, it is perfectly legal to call synchronize_srcu() on one 484 * However, it is perfectly legal to call synchronize_srcu() on one
462 * srcu_struct from some other srcu_struct's read-side critical section. 485 * srcu_struct from some other srcu_struct's read-side critical section,
486 * as long as the resulting graph of srcu_structs is acyclic.
487 *
488 * There are memory-ordering constraints implied by synchronize_srcu().
489 * On systems with more than one CPU, when synchronize_srcu() returns,
490 * each CPU is guaranteed to have executed a full memory barrier since
491 * the end of its last corresponding SRCU-sched read-side critical section
492 * whose beginning preceded the call to synchronize_srcu(). In addition,
493 * each CPU having an SRCU read-side critical section that extends beyond
494 * the return from synchronize_srcu() is guaranteed to have executed a
495 * full memory barrier after the beginning of synchronize_srcu() and before
496 * the beginning of that SRCU read-side critical section. Note that these
497 * guarantees include CPUs that are offline, idle, or executing in user mode,
498 * as well as CPUs that are executing in the kernel.
499 *
500 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
501 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
502 * to have executed a full memory barrier during the execution of
503 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
504 * are the same CPU, but again only if the system has more than one CPU.
505 *
506 * Of course, these memory-ordering guarantees apply only when
507 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
508 * passed the same srcu_struct structure.
463 */ 509 */
464void synchronize_srcu(struct srcu_struct *sp) 510void synchronize_srcu(struct srcu_struct *sp)
465{ 511{
@@ -476,12 +522,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
476 * Wait for an SRCU grace period to elapse, but be more aggressive about 522 * Wait for an SRCU grace period to elapse, but be more aggressive about
477 * spinning rather than blocking when waiting. 523 * spinning rather than blocking when waiting.
478 * 524 *
479 * Note that it is also illegal to call synchronize_srcu_expedited() 525 * Note that synchronize_srcu_expedited() has the same deadlock and
480 * from the corresponding SRCU read-side critical section; 526 * memory-ordering properties as does synchronize_srcu().
481 * doing so will result in deadlock. However, it is perfectly legal
482 * to call synchronize_srcu_expedited() on one srcu_struct from some
483 * other srcu_struct's read-side critical section, as long as
484 * the resulting graph of srcu_structs is acyclic.
485 */ 527 */
486void synchronize_srcu_expedited(struct srcu_struct *sp) 528void synchronize_srcu_expedited(struct srcu_struct *sp)
487{ 529{
@@ -491,6 +533,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
491 533
492/** 534/**
493 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. 535 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
536 * @sp: srcu_struct on which to wait for in-flight callbacks.
494 */ 537 */
495void srcu_barrier(struct srcu_struct *sp) 538void srcu_barrier(struct srcu_struct *sp)
496{ 539{
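A hedged sketch of the usage pattern the new call_srcu()/synchronize_srcu() comments describe: readers bracket access with srcu_read_lock()/srcu_read_unlock() on one srcu_struct, and the updater publishes a new pointer and waits for pre-existing readers on that same srcu_struct before freeing the old one. Names and structure below are illustrative, not taken from this file:

#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
	int val;
};

static struct cfg __rcu *cur_cfg;
DEFINE_SRCU(cfg_srcu);

static int read_cfg(void)
{
	int idx, val;

	idx = srcu_read_lock(&cfg_srcu);
	val = srcu_dereference(cur_cfg, &cfg_srcu)->val;
	srcu_read_unlock(&cfg_srcu, idx);
	return val;
}

static void update_cfg(struct cfg *newc)
{
	struct cfg *old = rcu_dereference_protected(cur_cfg, 1);

	rcu_assign_pointer(cur_cfg, newc);
	synchronize_srcu(&cfg_srcu);	/* pre-existing readers are done */
	kfree(old);
}

The synchronize_srcu() call could equally be a call_srcu() whose callback does the kfree(), relying on the ordering guarantees documented in the hunks above.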
diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c
index 3929cd451511..732f8ae3086a 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/torture.c
@@ -139,8 +139,6 @@ MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
139#define VERBOSE_PRINTK_ERRSTRING(s) \ 139#define VERBOSE_PRINTK_ERRSTRING(s) \
140 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 140 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
141 141
142static char printk_buf[4096];
143
144static int nrealreaders; 142static int nrealreaders;
145static struct task_struct *writer_task; 143static struct task_struct *writer_task;
146static struct task_struct **fakewriter_tasks; 144static struct task_struct **fakewriter_tasks;
@@ -376,7 +374,7 @@ struct rcu_torture_ops {
376 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 374 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
377 void (*cb_barrier)(void); 375 void (*cb_barrier)(void);
378 void (*fqs)(void); 376 void (*fqs)(void);
379 int (*stats)(char *page); 377 void (*stats)(char *page);
380 int irq_capable; 378 int irq_capable;
381 int can_boost; 379 int can_boost;
382 const char *name; 380 const char *name;
@@ -578,21 +576,19 @@ static void srcu_torture_barrier(void)
578 srcu_barrier(&srcu_ctl); 576 srcu_barrier(&srcu_ctl);
579} 577}
580 578
581static int srcu_torture_stats(char *page) 579static void srcu_torture_stats(char *page)
582{ 580{
583 int cnt = 0;
584 int cpu; 581 int cpu;
585 int idx = srcu_ctl.completed & 0x1; 582 int idx = srcu_ctl.completed & 0x1;
586 583
587 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 584 page += sprintf(page, "%s%s per-CPU(idx=%d):",
588 torture_type, TORTURE_FLAG, idx); 585 torture_type, TORTURE_FLAG, idx);
589 for_each_possible_cpu(cpu) { 586 for_each_possible_cpu(cpu) {
590 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, 587 page += sprintf(page, " %d(%lu,%lu)", cpu,
591 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 588 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
592 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 589 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
593 } 590 }
594 cnt += sprintf(&page[cnt], "\n"); 591 sprintf(page, "\n");
595 return cnt;
596} 592}
597 593
598static void srcu_torture_synchronize_expedited(void) 594static void srcu_torture_synchronize_expedited(void)
@@ -1052,10 +1048,9 @@ rcu_torture_reader(void *arg)
1052/* 1048/*
1053 * Create an RCU-torture statistics message in the specified buffer. 1049 * Create an RCU-torture statistics message in the specified buffer.
1054 */ 1050 */
1055static int 1051static void
1056rcu_torture_printk(char *page) 1052rcu_torture_printk(char *page)
1057{ 1053{
1058 int cnt = 0;
1059 int cpu; 1054 int cpu;
1060 int i; 1055 int i;
1061 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1056 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
@@ -1071,8 +1066,8 @@ rcu_torture_printk(char *page)
1071 if (pipesummary[i] != 0) 1066 if (pipesummary[i] != 0)
1072 break; 1067 break;
1073 } 1068 }
1074 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1069 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
1075 cnt += sprintf(&page[cnt], 1070 page += sprintf(page,
1076 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1071 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1077 rcu_torture_current, 1072 rcu_torture_current,
1078 rcu_torture_current_version, 1073 rcu_torture_current_version,
@@ -1080,53 +1075,52 @@ rcu_torture_printk(char *page)
1080 atomic_read(&n_rcu_torture_alloc), 1075 atomic_read(&n_rcu_torture_alloc),
1081 atomic_read(&n_rcu_torture_alloc_fail), 1076 atomic_read(&n_rcu_torture_alloc_fail),
1082 atomic_read(&n_rcu_torture_free)); 1077 atomic_read(&n_rcu_torture_free));
1083 cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", 1078 page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ",
1084 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1085 n_rcu_torture_boost_ktrerror, 1080 n_rcu_torture_boost_ktrerror,
1086 n_rcu_torture_boost_rterror); 1081 n_rcu_torture_boost_rterror);
1087 cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", 1082 page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ",
1088 n_rcu_torture_boost_failure, 1083 n_rcu_torture_boost_failure,
1089 n_rcu_torture_boosts, 1084 n_rcu_torture_boosts,
1090 n_rcu_torture_timers); 1085 n_rcu_torture_timers);
1091 cnt += sprintf(&page[cnt], 1086 page += sprintf(page,
1092 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", 1087 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1093 n_online_successes, n_online_attempts, 1088 n_online_successes, n_online_attempts,
1094 n_offline_successes, n_offline_attempts, 1089 n_offline_successes, n_offline_attempts,
1095 min_online, max_online, 1090 min_online, max_online,
1096 min_offline, max_offline, 1091 min_offline, max_offline,
1097 sum_online, sum_offline, HZ); 1092 sum_online, sum_offline, HZ);
1098 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", 1093 page += sprintf(page, "barrier: %ld/%ld:%ld",
1099 n_barrier_successes, 1094 n_barrier_successes,
1100 n_barrier_attempts, 1095 n_barrier_attempts,
1101 n_rcu_torture_barrier_error); 1096 n_rcu_torture_barrier_error);
1102 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1097 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
1103 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1098 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1104 n_rcu_torture_barrier_error != 0 || 1099 n_rcu_torture_barrier_error != 0 ||
1105 n_rcu_torture_boost_ktrerror != 0 || 1100 n_rcu_torture_boost_ktrerror != 0 ||
1106 n_rcu_torture_boost_rterror != 0 || 1101 n_rcu_torture_boost_rterror != 0 ||
1107 n_rcu_torture_boost_failure != 0 || 1102 n_rcu_torture_boost_failure != 0 ||
1108 i > 1) { 1103 i > 1) {
1109 cnt += sprintf(&page[cnt], "!!! "); 1104 page += sprintf(page, "!!! ");
1110 atomic_inc(&n_rcu_torture_error); 1105 atomic_inc(&n_rcu_torture_error);
1111 WARN_ON_ONCE(1); 1106 WARN_ON_ONCE(1);
1112 } 1107 }
1113 cnt += sprintf(&page[cnt], "Reader Pipe: "); 1108 page += sprintf(page, "Reader Pipe: ");
1114 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1109 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1115 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 1110 page += sprintf(page, " %ld", pipesummary[i]);
1116 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1111 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
1117 cnt += sprintf(&page[cnt], "Reader Batch: "); 1112 page += sprintf(page, "Reader Batch: ");
1118 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1113 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1119 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 1114 page += sprintf(page, " %ld", batchsummary[i]);
1120 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1115 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
1121 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 1116 page += sprintf(page, "Free-Block Circulation: ");
1122 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1117 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1123 cnt += sprintf(&page[cnt], " %d", 1118 page += sprintf(page, " %d",
1124 atomic_read(&rcu_torture_wcount[i])); 1119 atomic_read(&rcu_torture_wcount[i]));
1125 } 1120 }
1126 cnt += sprintf(&page[cnt], "\n"); 1121 page += sprintf(page, "\n");
1127 if (cur_ops->stats) 1122 if (cur_ops->stats)
1128 cnt += cur_ops->stats(&page[cnt]); 1123 cur_ops->stats(page);
1129 return cnt;
1130} 1124}
1131 1125
1132/* 1126/*
@@ -1140,10 +1134,17 @@ rcu_torture_printk(char *page)
1140static void 1134static void
1141rcu_torture_stats_print(void) 1135rcu_torture_stats_print(void)
1142{ 1136{
1143 int cnt; 1137 int size = nr_cpu_ids * 200 + 8192;
1138 char *buf;
1144 1139
1145 cnt = rcu_torture_printk(printk_buf); 1140 buf = kmalloc(size, GFP_KERNEL);
1146 pr_alert("%s", printk_buf); 1141 if (!buf) {
1142 pr_err("rcu-torture: Out of memory, need: %d", size);
1143 return;
1144 }
1145 rcu_torture_printk(buf);
1146 pr_alert("%s", buf);
1147 kfree(buf);
1147} 1148}
1148 1149
1149/* 1150/*
@@ -1578,6 +1579,7 @@ static int rcu_torture_barrier_cbs(void *arg)
1578{ 1579{
1579 long myid = (long)arg; 1580 long myid = (long)arg;
1580 bool lastphase = 0; 1581 bool lastphase = 0;
1582 bool newphase;
1581 struct rcu_head rcu; 1583 struct rcu_head rcu;
1582 1584
1583 init_rcu_head_on_stack(&rcu); 1585 init_rcu_head_on_stack(&rcu);
@@ -1585,10 +1587,11 @@ static int rcu_torture_barrier_cbs(void *arg)
1585 set_user_nice(current, 19); 1587 set_user_nice(current, 19);
1586 do { 1588 do {
1587 wait_event(barrier_cbs_wq[myid], 1589 wait_event(barrier_cbs_wq[myid],
1588 barrier_phase != lastphase || 1590 (newphase =
1591 ACCESS_ONCE(barrier_phase)) != lastphase ||
1589 kthread_should_stop() || 1592 kthread_should_stop() ||
1590 fullstop != FULLSTOP_DONTSTOP); 1593 fullstop != FULLSTOP_DONTSTOP);
1591 lastphase = barrier_phase; 1594 lastphase = newphase;
1592 smp_mb(); /* ensure barrier_phase load before ->call(). */ 1595 smp_mb(); /* ensure barrier_phase load before ->call(). */
1593 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1596 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1594 break; 1597 break;
@@ -1625,7 +1628,7 @@ static int rcu_torture_barrier(void *arg)
1625 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1628 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1626 break; 1629 break;
1627 n_barrier_attempts++; 1630 n_barrier_attempts++;
1628 cur_ops->cb_barrier(); 1631 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
1629 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { 1632 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1630 n_rcu_torture_barrier_error++; 1633 n_rcu_torture_barrier_error++;
1631 WARN_ON_ONCE(1); 1634 WARN_ON_ONCE(1);
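The stats conversion above drops the running cnt index in favor of advancing the destination pointer by sprintf()'s return value (the number of characters written, excluding the terminating NUL). A standalone illustration of the same pattern:

#include <stdio.h>

static void fill(char *page)
{
	page += sprintf(page, "counts:");
	for (int i = 0; i < 3; i++)
		page += sprintf(page, " %d", i * 10);
	sprintf(page, "\n");	/* final write, nothing needs the length */
}

int main(void)
{
	char buf[64];

	fill(buf);
	fputs(buf, stdout);	/* prints "counts: 0 10 20" */
	return 0;
}

Since the callers no longer learn how much was written, the buffer has to be sized generously up front, which is what the nr_cpu_ids * 200 + 8192 kmalloc() in rcu_torture_stats_print() above provides.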
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index dd081987a8ec..b3d116cd072d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -369,6 +369,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
370 bool user) 370 bool user)
371{ 371{
372 struct rcu_state *rsp;
373 struct rcu_data *rdp;
374
372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 375 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
373 if (!user && !is_idle_task(current)) { 376 if (!user && !is_idle_task(current)) {
374 struct task_struct *idle __maybe_unused = 377 struct task_struct *idle __maybe_unused =
@@ -380,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
380 current->pid, current->comm, 383 current->pid, current->comm,
381 idle->pid, idle->comm); /* must be idle task! */ 384 idle->pid, idle->comm); /* must be idle task! */
382 } 385 }
386 for_each_rcu_flavor(rsp) {
387 rdp = this_cpu_ptr(rsp->rda);
388 do_nocb_deferred_wakeup(rdp);
389 }
383 rcu_prepare_for_idle(smp_processor_id()); 390 rcu_prepare_for_idle(smp_processor_id());
384 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 391 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
385 smp_mb__before_atomic_inc(); /* See above. */ 392 smp_mb__before_atomic_inc(); /* See above. */
@@ -411,11 +418,12 @@ static void rcu_eqs_enter(bool user)
411 rdtp = this_cpu_ptr(&rcu_dynticks); 418 rdtp = this_cpu_ptr(&rcu_dynticks);
412 oldval = rdtp->dynticks_nesting; 419 oldval = rdtp->dynticks_nesting;
413 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 420 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
414 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 421 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
415 rdtp->dynticks_nesting = 0; 422 rdtp->dynticks_nesting = 0;
416 else 423 rcu_eqs_enter_common(rdtp, oldval, user);
424 } else {
417 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 425 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
418 rcu_eqs_enter_common(rdtp, oldval, user); 426 }
419} 427}
420 428
421/** 429/**
@@ -533,11 +541,12 @@ static void rcu_eqs_exit(bool user)
533 rdtp = this_cpu_ptr(&rcu_dynticks); 541 rdtp = this_cpu_ptr(&rcu_dynticks);
534 oldval = rdtp->dynticks_nesting; 542 oldval = rdtp->dynticks_nesting;
535 WARN_ON_ONCE(oldval < 0); 543 WARN_ON_ONCE(oldval < 0);
536 if (oldval & DYNTICK_TASK_NEST_MASK) 544 if (oldval & DYNTICK_TASK_NEST_MASK) {
537 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 545 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
538 else 546 } else {
539 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 547 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
540 rcu_eqs_exit_common(rdtp, oldval, user); 548 rcu_eqs_exit_common(rdtp, oldval, user);
549 }
541} 550}
542 551
543/** 552/**
@@ -716,7 +725,7 @@ bool rcu_lockdep_current_cpu_online(void)
716 bool ret; 725 bool ret;
717 726
718 if (in_nmi()) 727 if (in_nmi())
719 return 1; 728 return true;
720 preempt_disable(); 729 preempt_disable();
721 rdp = this_cpu_ptr(&rcu_sched_data); 730 rdp = this_cpu_ptr(&rcu_sched_data);
722 rnp = rdp->mynode; 731 rnp = rdp->mynode;
@@ -755,6 +764,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
755} 764}
756 765
757/* 766/*
767 * This function really isn't for public consumption, but RCU is special in
768 * that context switches can allow the state machine to make progress.
769 */
770extern void resched_cpu(int cpu);
771
772/*
758 * Return true if the specified CPU has passed through a quiescent 773 * Return true if the specified CPU has passed through a quiescent
759 * state by virtue of being in or having passed through an dynticks 774 * state by virtue of being in or having passed through an dynticks
760 * idle state since the last call to dyntick_save_progress_counter() 775 * idle state since the last call to dyntick_save_progress_counter()
@@ -812,16 +827,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
812 */ 827 */
813 rcu_kick_nohz_cpu(rdp->cpu); 828 rcu_kick_nohz_cpu(rdp->cpu);
814 829
830 /*
831 * Alternatively, the CPU might be running in the kernel
832 * for an extended period of time without a quiescent state.
833 * Attempt to force the CPU through the scheduler to gain the
834 * needed quiescent state, but only if the grace period has gone
835 * on for an uncommonly long time. If there are many stuck CPUs,
836 * we will beat on the first one until it gets unstuck, then move
837 * to the next. Only do this for the primary flavor of RCU.
838 */
839 if (rdp->rsp == rcu_state &&
840 ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) {
841 rdp->rsp->jiffies_resched += 5;
842 resched_cpu(rdp->cpu);
843 }
844
815 return 0; 845 return 0;
816} 846}
817 847
818static void record_gp_stall_check_time(struct rcu_state *rsp) 848static void record_gp_stall_check_time(struct rcu_state *rsp)
819{ 849{
820 unsigned long j = ACCESS_ONCE(jiffies); 850 unsigned long j = ACCESS_ONCE(jiffies);
851 unsigned long j1;
821 852
822 rsp->gp_start = j; 853 rsp->gp_start = j;
823 smp_wmb(); /* Record start time before stall time. */ 854 smp_wmb(); /* Record start time before stall time. */
824 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); 855 j1 = rcu_jiffies_till_stall_check();
856 rsp->jiffies_stall = j + j1;
857 rsp->jiffies_resched = j + j1 / 2;
825} 858}
826 859
827/* 860/*
@@ -1133,8 +1166,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1133 * hold it, acquire the root rcu_node structure's lock in order to 1166 * hold it, acquire the root rcu_node structure's lock in order to
1134 * start one (if needed). 1167 * start one (if needed).
1135 */ 1168 */
1136 if (rnp != rnp_root) 1169 if (rnp != rnp_root) {
1137 raw_spin_lock(&rnp_root->lock); 1170 raw_spin_lock(&rnp_root->lock);
1171 smp_mb__after_unlock_lock();
1172 }
1138 1173
1139 /* 1174 /*
1140 * Get a new grace-period number. If there really is no grace 1175 * Get a new grace-period number. If there really is no grace
@@ -1354,6 +1389,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1354 local_irq_restore(flags); 1389 local_irq_restore(flags);
1355 return; 1390 return;
1356 } 1391 }
1392 smp_mb__after_unlock_lock();
1357 __note_gp_changes(rsp, rnp, rdp); 1393 __note_gp_changes(rsp, rnp, rdp);
1358 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1394 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1359} 1395}
@@ -1368,6 +1404,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1368 1404
1369 rcu_bind_gp_kthread(); 1405 rcu_bind_gp_kthread();
1370 raw_spin_lock_irq(&rnp->lock); 1406 raw_spin_lock_irq(&rnp->lock);
1407 smp_mb__after_unlock_lock();
1371 if (rsp->gp_flags == 0) { 1408 if (rsp->gp_flags == 0) {
1372 /* Spurious wakeup, tell caller to go back to sleep. */ 1409 /* Spurious wakeup, tell caller to go back to sleep. */
1373 raw_spin_unlock_irq(&rnp->lock); 1410 raw_spin_unlock_irq(&rnp->lock);
@@ -1409,6 +1446,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1409 */ 1446 */
1410 rcu_for_each_node_breadth_first(rsp, rnp) { 1447 rcu_for_each_node_breadth_first(rsp, rnp) {
1411 raw_spin_lock_irq(&rnp->lock); 1448 raw_spin_lock_irq(&rnp->lock);
1449 smp_mb__after_unlock_lock();
1412 rdp = this_cpu_ptr(rsp->rda); 1450 rdp = this_cpu_ptr(rsp->rda);
1413 rcu_preempt_check_blocked_tasks(rnp); 1451 rcu_preempt_check_blocked_tasks(rnp);
1414 rnp->qsmask = rnp->qsmaskinit; 1452 rnp->qsmask = rnp->qsmaskinit;
@@ -1463,6 +1501,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1463 /* Clear flag to prevent immediate re-entry. */ 1501 /* Clear flag to prevent immediate re-entry. */
1464 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1502 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1465 raw_spin_lock_irq(&rnp->lock); 1503 raw_spin_lock_irq(&rnp->lock);
1504 smp_mb__after_unlock_lock();
1466 rsp->gp_flags &= ~RCU_GP_FLAG_FQS; 1505 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
1467 raw_spin_unlock_irq(&rnp->lock); 1506 raw_spin_unlock_irq(&rnp->lock);
1468 } 1507 }
@@ -1480,6 +1519,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1480 struct rcu_node *rnp = rcu_get_root(rsp); 1519 struct rcu_node *rnp = rcu_get_root(rsp);
1481 1520
1482 raw_spin_lock_irq(&rnp->lock); 1521 raw_spin_lock_irq(&rnp->lock);
1522 smp_mb__after_unlock_lock();
1483 gp_duration = jiffies - rsp->gp_start; 1523 gp_duration = jiffies - rsp->gp_start;
1484 if (gp_duration > rsp->gp_max) 1524 if (gp_duration > rsp->gp_max)
1485 rsp->gp_max = gp_duration; 1525 rsp->gp_max = gp_duration;
@@ -1505,16 +1545,19 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1505 */ 1545 */
1506 rcu_for_each_node_breadth_first(rsp, rnp) { 1546 rcu_for_each_node_breadth_first(rsp, rnp) {
1507 raw_spin_lock_irq(&rnp->lock); 1547 raw_spin_lock_irq(&rnp->lock);
1548 smp_mb__after_unlock_lock();
1508 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1549 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1509 rdp = this_cpu_ptr(rsp->rda); 1550 rdp = this_cpu_ptr(rsp->rda);
1510 if (rnp == rdp->mynode) 1551 if (rnp == rdp->mynode)
1511 __note_gp_changes(rsp, rnp, rdp); 1552 __note_gp_changes(rsp, rnp, rdp);
1553 /* smp_mb() provided by prior unlock-lock pair. */
1512 nocb += rcu_future_gp_cleanup(rsp, rnp); 1554 nocb += rcu_future_gp_cleanup(rsp, rnp);
1513 raw_spin_unlock_irq(&rnp->lock); 1555 raw_spin_unlock_irq(&rnp->lock);
1514 cond_resched(); 1556 cond_resched();
1515 } 1557 }
1516 rnp = rcu_get_root(rsp); 1558 rnp = rcu_get_root(rsp);
1517 raw_spin_lock_irq(&rnp->lock); 1559 raw_spin_lock_irq(&rnp->lock);
1560 smp_mb__after_unlock_lock();
1518 rcu_nocb_gp_set(rnp, nocb); 1561 rcu_nocb_gp_set(rnp, nocb);
1519 1562
1520 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1563 rsp->completed = rsp->gpnum; /* Declare grace period done. */
@@ -1553,6 +1596,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1553 wait_event_interruptible(rsp->gp_wq, 1596 wait_event_interruptible(rsp->gp_wq,
1554 ACCESS_ONCE(rsp->gp_flags) & 1597 ACCESS_ONCE(rsp->gp_flags) &
1555 RCU_GP_FLAG_INIT); 1598 RCU_GP_FLAG_INIT);
1599 /* Locking provides needed memory barrier. */
1556 if (rcu_gp_init(rsp)) 1600 if (rcu_gp_init(rsp))
1557 break; 1601 break;
1558 cond_resched(); 1602 cond_resched();
@@ -1582,6 +1626,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1582 (!ACCESS_ONCE(rnp->qsmask) && 1626 (!ACCESS_ONCE(rnp->qsmask) &&
1583 !rcu_preempt_blocked_readers_cgp(rnp)), 1627 !rcu_preempt_blocked_readers_cgp(rnp)),
1584 j); 1628 j);
1629 /* Locking provides needed memory barriers. */
1585 /* If grace period done, leave loop. */ 1630 /* If grace period done, leave loop. */
1586 if (!ACCESS_ONCE(rnp->qsmask) && 1631 if (!ACCESS_ONCE(rnp->qsmask) &&
1587 !rcu_preempt_blocked_readers_cgp(rnp)) 1632 !rcu_preempt_blocked_readers_cgp(rnp))
@@ -1749,6 +1794,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1749 rnp_c = rnp; 1794 rnp_c = rnp;
1750 rnp = rnp->parent; 1795 rnp = rnp->parent;
1751 raw_spin_lock_irqsave(&rnp->lock, flags); 1796 raw_spin_lock_irqsave(&rnp->lock, flags);
1797 smp_mb__after_unlock_lock();
1752 WARN_ON_ONCE(rnp_c->qsmask); 1798 WARN_ON_ONCE(rnp_c->qsmask);
1753 } 1799 }
1754 1800
@@ -1778,6 +1824,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1778 1824
1779 rnp = rdp->mynode; 1825 rnp = rdp->mynode;
1780 raw_spin_lock_irqsave(&rnp->lock, flags); 1826 raw_spin_lock_irqsave(&rnp->lock, flags);
1827 smp_mb__after_unlock_lock();
1781 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 1828 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
1782 rnp->completed == rnp->gpnum) { 1829 rnp->completed == rnp->gpnum) {
1783 1830
@@ -1901,13 +1948,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1901 * Adopt the RCU callbacks from the specified rcu_state structure's 1948 * Adopt the RCU callbacks from the specified rcu_state structure's
1902 * orphanage. The caller must hold the ->orphan_lock. 1949 * orphanage. The caller must hold the ->orphan_lock.
1903 */ 1950 */
1904static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1951static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
1905{ 1952{
1906 int i; 1953 int i;
1907 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1954 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1908 1955
1909 /* No-CBs CPUs are handled specially. */ 1956 /* No-CBs CPUs are handled specially. */
1910 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) 1957 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
1911 return; 1958 return;
1912 1959
1913 /* Do the accounting first. */ 1960 /* Do the accounting first. */
@@ -1986,12 +2033,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1986 2033
1987 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2034 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1988 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2035 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1989 rcu_adopt_orphan_cbs(rsp); 2036 rcu_adopt_orphan_cbs(rsp, flags);
1990 2037
1991 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 2038 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1992 mask = rdp->grpmask; /* rnp->grplo is constant. */ 2039 mask = rdp->grpmask; /* rnp->grplo is constant. */
1993 do { 2040 do {
1994 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2041 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2042 smp_mb__after_unlock_lock();
1995 rnp->qsmaskinit &= ~mask; 2043 rnp->qsmaskinit &= ~mask;
1996 if (rnp->qsmaskinit != 0) { 2044 if (rnp->qsmaskinit != 0) {
1997 if (rnp != rdp->mynode) 2045 if (rnp != rdp->mynode)
@@ -2202,6 +2250,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2202 cond_resched(); 2250 cond_resched();
2203 mask = 0; 2251 mask = 0;
2204 raw_spin_lock_irqsave(&rnp->lock, flags); 2252 raw_spin_lock_irqsave(&rnp->lock, flags);
2253 smp_mb__after_unlock_lock();
2205 if (!rcu_gp_in_progress(rsp)) { 2254 if (!rcu_gp_in_progress(rsp)) {
2206 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2255 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2207 return; 2256 return;
@@ -2231,6 +2280,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2231 rnp = rcu_get_root(rsp); 2280 rnp = rcu_get_root(rsp);
2232 if (rnp->qsmask == 0) { 2281 if (rnp->qsmask == 0) {
2233 raw_spin_lock_irqsave(&rnp->lock, flags); 2282 raw_spin_lock_irqsave(&rnp->lock, flags);
2283 smp_mb__after_unlock_lock();
2234 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ 2284 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
2235 } 2285 }
2236} 2286}
@@ -2263,6 +2313,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2263 2313
2264 /* Reached the root of the rcu_node tree, acquire lock. */ 2314 /* Reached the root of the rcu_node tree, acquire lock. */
2265 raw_spin_lock_irqsave(&rnp_old->lock, flags); 2315 raw_spin_lock_irqsave(&rnp_old->lock, flags);
2316 smp_mb__after_unlock_lock();
2266 raw_spin_unlock(&rnp_old->fqslock); 2317 raw_spin_unlock(&rnp_old->fqslock);
2267 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2318 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2268 rsp->n_force_qs_lh++; 2319 rsp->n_force_qs_lh++;
@@ -2303,6 +2354,9 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2303 /* If there are callbacks ready, invoke them. */ 2354 /* If there are callbacks ready, invoke them. */
2304 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2355 if (cpu_has_callbacks_ready_to_invoke(rdp))
2305 invoke_rcu_callbacks(rsp, rdp); 2356 invoke_rcu_callbacks(rsp, rdp);
2357
2358 /* Do any needed deferred wakeups of rcuo kthreads. */
2359 do_nocb_deferred_wakeup(rdp);
2306} 2360}
2307 2361
2308/* 2362/*
@@ -2378,6 +2432,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2378 struct rcu_node *rnp_root = rcu_get_root(rsp); 2432 struct rcu_node *rnp_root = rcu_get_root(rsp);
2379 2433
2380 raw_spin_lock(&rnp_root->lock); 2434 raw_spin_lock(&rnp_root->lock);
2435 smp_mb__after_unlock_lock();
2381 rcu_start_gp(rsp); 2436 rcu_start_gp(rsp);
2382 raw_spin_unlock(&rnp_root->lock); 2437 raw_spin_unlock(&rnp_root->lock);
2383 } else { 2438 } else {
@@ -2437,7 +2492,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2437 2492
2438 if (cpu != -1) 2493 if (cpu != -1)
2439 rdp = per_cpu_ptr(rsp->rda, cpu); 2494 rdp = per_cpu_ptr(rsp->rda, cpu);
2440 offline = !__call_rcu_nocb(rdp, head, lazy); 2495 offline = !__call_rcu_nocb(rdp, head, lazy, flags);
2441 WARN_ON_ONCE(offline); 2496 WARN_ON_ONCE(offline);
2442 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2497 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2443 local_irq_restore(flags); 2498 local_irq_restore(flags);
@@ -2757,6 +2812,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2757 /* Check for CPU stalls, if enabled. */ 2812 /* Check for CPU stalls, if enabled. */
2758 check_cpu_stall(rsp, rdp); 2813 check_cpu_stall(rsp, rdp);
2759 2814
2815 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
2816 if (rcu_nohz_full_cpu(rsp))
2817 return 0;
2818
2760 /* Is the RCU core waiting for a quiescent state from this CPU? */ 2819 /* Is the RCU core waiting for a quiescent state from this CPU? */
2761 if (rcu_scheduler_fully_active && 2820 if (rcu_scheduler_fully_active &&
2762 rdp->qs_pending && !rdp->passed_quiesce) { 2821 rdp->qs_pending && !rdp->passed_quiesce) {
@@ -2790,6 +2849,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2790 return 1; 2849 return 1;
2791 } 2850 }
2792 2851
2852 /* Does this CPU need a deferred NOCB wakeup? */
2853 if (rcu_nocb_need_deferred_wakeup(rdp)) {
2854 rdp->n_rp_nocb_defer_wakeup++;
2855 return 1;
2856 }
2857
2793 /* nothing to do */ 2858 /* nothing to do */
2794 rdp->n_rp_need_nothing++; 2859 rdp->n_rp_need_nothing++;
2795 return 0; 2860 return 0;
@@ -3214,9 +3279,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
3214{ 3279{
3215 int i; 3280 int i;
3216 3281
3217 for (i = rcu_num_lvls - 1; i > 0; i--) 3282 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3283 for (i = rcu_num_lvls - 2; i >= 0; i--)
3218 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 3284 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3219 rsp->levelspread[0] = rcu_fanout_leaf;
3220} 3285}
3221#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 3286#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
3222static void __init rcu_init_levelspread(struct rcu_state *rsp) 3287static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -3346,6 +3411,8 @@ static void __init rcu_init_geometry(void)
3346 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && 3411 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3347 nr_cpu_ids == NR_CPUS) 3412 nr_cpu_ids == NR_CPUS)
3348 return; 3413 return;
3414 pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
3415 rcu_fanout_leaf, nr_cpu_ids);
3349 3416
3350 /* 3417 /*
3351 * Compute number of nodes that can be handled an rcu_node tree 3418 * Compute number of nodes that can be handled an rcu_node tree
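One detail worth spelling out from the new stuck-CPU resched logic above: jiffies comparisons such as ULONG_CMP_GE(jiffies, rsp->jiffies_resched) must stay correct when the counter wraps, so they use unsigned subtraction rather than a plain >=. A small userspace model using the kernel's definition of ULONG_CMP_GE():

#include <limits.h>
#include <stdio.h>

/* Same definition as the kernel's: true iff a is not before b, wrap-safe. */
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long now = ULONG_MAX - 5;	/* counter about to wrap */
	unsigned long deadline = now + 10;	/* wraps past zero */

	printf("plain >= : %d\n", now >= deadline);		/* 1: wrong */
	printf("wrap-safe: %d\n", ULONG_CMP_GE(now, deadline));	/* 0: right */
	return 0;
}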
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 52be957c9fe2..8c19873f1ac9 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -317,6 +317,7 @@ struct rcu_data {
317 unsigned long n_rp_cpu_needs_gp; 317 unsigned long n_rp_cpu_needs_gp;
318 unsigned long n_rp_gp_completed; 318 unsigned long n_rp_gp_completed;
319 unsigned long n_rp_gp_started; 319 unsigned long n_rp_gp_started;
320 unsigned long n_rp_nocb_defer_wakeup;
320 unsigned long n_rp_need_nothing; 321 unsigned long n_rp_need_nothing;
321 322
322 /* 6) _rcu_barrier() and OOM callbacks. */ 323 /* 6) _rcu_barrier() and OOM callbacks. */
@@ -335,6 +336,7 @@ struct rcu_data {
335 int nocb_p_count_lazy; /* (approximate). */ 336 int nocb_p_count_lazy; /* (approximate). */
336 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 337 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
337 struct task_struct *nocb_kthread; 338 struct task_struct *nocb_kthread;
339 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
338#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 340#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
339 341
340 /* 8) RCU CPU stall data. */ 342 /* 8) RCU CPU stall data. */
@@ -453,6 +455,8 @@ struct rcu_state {
453 /* but in jiffies. */ 455 /* but in jiffies. */
454 unsigned long jiffies_stall; /* Time at which to check */ 456 unsigned long jiffies_stall; /* Time at which to check */
455 /* for CPU stalls. */ 457 /* for CPU stalls. */
458 unsigned long jiffies_resched; /* Time at which to resched */
459 /* a reluctant CPU. */
456 unsigned long gp_max; /* Maximum GP duration in */ 460 unsigned long gp_max; /* Maximum GP duration in */
457 /* jiffies. */ 461 /* jiffies. */
458 const char *name; /* Name of structure. */ 462 const char *name; /* Name of structure. */
@@ -548,9 +552,12 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
548static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 552static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
549static void rcu_init_one_nocb(struct rcu_node *rnp); 553static void rcu_init_one_nocb(struct rcu_node *rnp);
550static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 554static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
551 bool lazy); 555 bool lazy, unsigned long flags);
552static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 556static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
553 struct rcu_data *rdp); 557 struct rcu_data *rdp,
558 unsigned long flags);
559static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
560static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
554static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 561static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
555static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 562static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
556static void rcu_kick_nohz_cpu(int cpu); 563static void rcu_kick_nohz_cpu(int cpu);
@@ -564,6 +571,7 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
564 unsigned long maxj); 571 unsigned long maxj);
565static void rcu_bind_gp_kthread(void); 572static void rcu_bind_gp_kthread(void);
566static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); 573static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
574static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
567 575
568#endif /* #ifndef RCU_TREE_NONCORE */ 576#endif /* #ifndef RCU_TREE_NONCORE */
569 577
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6abb03dff5c0..6e2ef4b2b920 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -204,6 +204,7 @@ static void rcu_preempt_note_context_switch(int cpu)
204 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 204 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
205 rnp = rdp->mynode; 205 rnp = rdp->mynode;
206 raw_spin_lock_irqsave(&rnp->lock, flags); 206 raw_spin_lock_irqsave(&rnp->lock, flags);
207 smp_mb__after_unlock_lock();
207 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 208 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
208 t->rcu_blocked_node = rnp; 209 t->rcu_blocked_node = rnp;
209 210
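
Throughout these tree_plugin.h hunks an smp_mb__after_unlock_lock() is added immediately after each raw_spin_lock() of an rcu_node lock: on architectures where an UNLOCK followed by a LOCK is not already a full memory barrier (powerpc is the usual example), the extra primitive upgrades the pair into one. A minimal userspace-flavoured sketch of the idea, using POSIX locks and a C11 fence instead of the kernel primitives (the helper name is only a stand-in):

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static int node_state;

/* Illustrative stand-in for smp_mb__after_unlock_lock(); on platforms where
 * unlock+lock is already fully ordered this could compile to nothing. */
static inline void mb_after_unlock_lock(void)
{
	atomic_thread_fence(memory_order_seq_cst);
}

void touch_node(void)
{
	pthread_mutex_lock(&node_lock);
	mb_after_unlock_lock();	/* order prior critical sections against what follows */
	node_state++;		/* e.g. updates to ->qsmask, ->blkd_tasks, ... */
	pthread_mutex_unlock(&node_lock);
}
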
@@ -312,6 +313,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
312 mask = rnp->grpmask; 313 mask = rnp->grpmask;
313 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 314 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
314 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ 315 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
316 smp_mb__after_unlock_lock();
315 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 317 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
316} 318}
317 319
@@ -361,10 +363,14 @@ void rcu_read_unlock_special(struct task_struct *t)
361 special = t->rcu_read_unlock_special; 363 special = t->rcu_read_unlock_special;
362 if (special & RCU_READ_UNLOCK_NEED_QS) { 364 if (special & RCU_READ_UNLOCK_NEED_QS) {
363 rcu_preempt_qs(smp_processor_id()); 365 rcu_preempt_qs(smp_processor_id());
366 if (!t->rcu_read_unlock_special) {
367 local_irq_restore(flags);
368 return;
369 }
364 } 370 }
365 371
366 /* Hardware IRQ handlers cannot block. */ 372 /* Hardware IRQ handlers cannot block, complain if they get here. */
367 if (in_irq() || in_serving_softirq()) { 373 if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
368 local_irq_restore(flags); 374 local_irq_restore(flags);
369 return; 375 return;
370 } 376 }
@@ -381,6 +387,7 @@ void rcu_read_unlock_special(struct task_struct *t)
381 for (;;) { 387 for (;;) {
382 rnp = t->rcu_blocked_node; 388 rnp = t->rcu_blocked_node;
383 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 389 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
390 smp_mb__after_unlock_lock();
384 if (rnp == t->rcu_blocked_node) 391 if (rnp == t->rcu_blocked_node)
385 break; 392 break;
386 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 393 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
@@ -605,6 +612,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
605 while (!list_empty(lp)) { 612 while (!list_empty(lp)) {
606 t = list_entry(lp->next, typeof(*t), rcu_node_entry); 613 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
607 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 614 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
615 smp_mb__after_unlock_lock();
608 list_del(&t->rcu_node_entry); 616 list_del(&t->rcu_node_entry);
609 t->rcu_blocked_node = rnp_root; 617 t->rcu_blocked_node = rnp_root;
610 list_add(&t->rcu_node_entry, lp_root); 618 list_add(&t->rcu_node_entry, lp_root);
@@ -629,6 +637,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
629 * in this case. 637 * in this case.
630 */ 638 */
631 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 639 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
640 smp_mb__after_unlock_lock();
632 if (rnp_root->boost_tasks != NULL && 641 if (rnp_root->boost_tasks != NULL &&
633 rnp_root->boost_tasks != rnp_root->gp_tasks && 642 rnp_root->boost_tasks != rnp_root->gp_tasks &&
634 rnp_root->boost_tasks != rnp_root->exp_tasks) 643 rnp_root->boost_tasks != rnp_root->exp_tasks)
@@ -772,6 +781,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
772 unsigned long mask; 781 unsigned long mask;
773 782
774 raw_spin_lock_irqsave(&rnp->lock, flags); 783 raw_spin_lock_irqsave(&rnp->lock, flags);
784 smp_mb__after_unlock_lock();
775 for (;;) { 785 for (;;) {
776 if (!sync_rcu_preempt_exp_done(rnp)) { 786 if (!sync_rcu_preempt_exp_done(rnp)) {
777 raw_spin_unlock_irqrestore(&rnp->lock, flags); 787 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -779,14 +789,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
779 } 789 }
780 if (rnp->parent == NULL) { 790 if (rnp->parent == NULL) {
781 raw_spin_unlock_irqrestore(&rnp->lock, flags); 791 raw_spin_unlock_irqrestore(&rnp->lock, flags);
782 if (wake) 792 if (wake) {
793 smp_mb(); /* EGP done before wake_up(). */
783 wake_up(&sync_rcu_preempt_exp_wq); 794 wake_up(&sync_rcu_preempt_exp_wq);
795 }
784 break; 796 break;
785 } 797 }
786 mask = rnp->grpmask; 798 mask = rnp->grpmask;
787 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 799 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
788 rnp = rnp->parent; 800 rnp = rnp->parent;
789 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 801 raw_spin_lock(&rnp->lock); /* irqs already disabled */
802 smp_mb__after_unlock_lock();
790 rnp->expmask &= ~mask; 803 rnp->expmask &= ~mask;
791 } 804 }
792} 805}
@@ -806,6 +819,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
806 int must_wait = 0; 819 int must_wait = 0;
807 820
808 raw_spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
822 smp_mb__after_unlock_lock();
809 if (list_empty(&rnp->blkd_tasks)) { 823 if (list_empty(&rnp->blkd_tasks)) {
810 raw_spin_unlock_irqrestore(&rnp->lock, flags); 824 raw_spin_unlock_irqrestore(&rnp->lock, flags);
811 } else { 825 } else {
@@ -886,6 +900,7 @@ void synchronize_rcu_expedited(void)
886 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 900 /* Initialize ->expmask for all non-leaf rcu_node structures. */
887 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 901 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
888 raw_spin_lock_irqsave(&rnp->lock, flags); 902 raw_spin_lock_irqsave(&rnp->lock, flags);
903 smp_mb__after_unlock_lock();
889 rnp->expmask = rnp->qsmaskinit; 904 rnp->expmask = rnp->qsmaskinit;
890 raw_spin_unlock_irqrestore(&rnp->lock, flags); 905 raw_spin_unlock_irqrestore(&rnp->lock, flags);
891 } 906 }
@@ -1191,6 +1206,7 @@ static int rcu_boost(struct rcu_node *rnp)
1191 return 0; /* Nothing left to boost. */ 1206 return 0; /* Nothing left to boost. */
1192 1207
1193 raw_spin_lock_irqsave(&rnp->lock, flags); 1208 raw_spin_lock_irqsave(&rnp->lock, flags);
1209 smp_mb__after_unlock_lock();
1194 1210
1195 /* 1211 /*
1196 * Recheck under the lock: all tasks in need of boosting 1212 * Recheck under the lock: all tasks in need of boosting
@@ -1377,6 +1393,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1377 if (IS_ERR(t)) 1393 if (IS_ERR(t))
1378 return PTR_ERR(t); 1394 return PTR_ERR(t);
1379 raw_spin_lock_irqsave(&rnp->lock, flags); 1395 raw_spin_lock_irqsave(&rnp->lock, flags);
1396 smp_mb__after_unlock_lock();
1380 rnp->boost_kthread_task = t; 1397 rnp->boost_kthread_task = t;
1381 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1398 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1382 sp.sched_priority = RCU_BOOST_PRIO; 1399 sp.sched_priority = RCU_BOOST_PRIO;
@@ -1632,7 +1649,7 @@ module_param(rcu_idle_gp_delay, int, 0644);
1632static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; 1649static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1633module_param(rcu_idle_lazy_gp_delay, int, 0644); 1650module_param(rcu_idle_lazy_gp_delay, int, 0644);
1634 1651
1635extern int tick_nohz_enabled; 1652extern int tick_nohz_active;
1636 1653
1637/* 1654/*
1638 * Try to advance callbacks for all flavors of RCU on the current CPU, but 1655 * Try to advance callbacks for all flavors of RCU on the current CPU, but
@@ -1729,7 +1746,7 @@ static void rcu_prepare_for_idle(int cpu)
1729 int tne; 1746 int tne;
1730 1747
1731 /* Handle nohz enablement switches conservatively. */ 1748 /* Handle nohz enablement switches conservatively. */
1732 tne = ACCESS_ONCE(tick_nohz_enabled); 1749 tne = ACCESS_ONCE(tick_nohz_active);
1733 if (tne != rdtp->tick_nohz_enabled_snap) { 1750 if (tne != rdtp->tick_nohz_enabled_snap) {
1734 if (rcu_cpu_has_callbacks(cpu, NULL)) 1751 if (rcu_cpu_has_callbacks(cpu, NULL))
1735 invoke_rcu_core(); /* force nohz to see update. */ 1752 invoke_rcu_core(); /* force nohz to see update. */
@@ -1769,6 +1786,7 @@ static void rcu_prepare_for_idle(int cpu)
1769 continue; 1786 continue;
1770 rnp = rdp->mynode; 1787 rnp = rdp->mynode;
1771 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1788 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1789 smp_mb__after_unlock_lock();
1772 rcu_accelerate_cbs(rsp, rnp, rdp); 1790 rcu_accelerate_cbs(rsp, rnp, rdp);
1773 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1774 } 1792 }
@@ -1852,6 +1870,7 @@ static int rcu_oom_notify(struct notifier_block *self,
1852 1870
1853 /* Wait for callbacks from earlier instance to complete. */ 1871 /* Wait for callbacks from earlier instance to complete. */
1854 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); 1872 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1873 smp_mb(); /* Ensure callback reuse happens after callback invocation. */
1855 1874
1856 /* 1875 /*
1857 * Prevent premature wakeup: ensure that all increments happen 1876 * Prevent premature wakeup: ensure that all increments happen
@@ -2101,7 +2120,8 @@ bool rcu_is_nocb_cpu(int cpu)
2101static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, 2120static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2102 struct rcu_head *rhp, 2121 struct rcu_head *rhp,
2103 struct rcu_head **rhtp, 2122 struct rcu_head **rhtp,
2104 int rhcount, int rhcount_lazy) 2123 int rhcount, int rhcount_lazy,
2124 unsigned long flags)
2105{ 2125{
2106 int len; 2126 int len;
2107 struct rcu_head **old_rhpp; 2127 struct rcu_head **old_rhpp;
@@ -2122,9 +2142,16 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2122 } 2142 }
2123 len = atomic_long_read(&rdp->nocb_q_count); 2143 len = atomic_long_read(&rdp->nocb_q_count);
2124 if (old_rhpp == &rdp->nocb_head) { 2144 if (old_rhpp == &rdp->nocb_head) {
2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2145 if (!irqs_disabled_flags(flags)) {
2146 wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
2147 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2148 TPS("WakeEmpty"));
2149 } else {
2150 rdp->nocb_defer_wakeup = true;
2151 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2152 TPS("WakeEmptyIsDeferred"));
2153 }
2126 rdp->qlen_last_fqs_check = 0; 2154 rdp->qlen_last_fqs_check = 0;
2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2155 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2129 wake_up_process(t); /* ... or if many callbacks queued. */ 2156 wake_up_process(t); /* ... or if many callbacks queued. */
2130 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2157 rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2145,12 +2172,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2145 * "rcuo" kthread can find it. 2172 * "rcuo" kthread can find it.
2146 */ 2173 */
2147static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2174static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2148 bool lazy) 2175 bool lazy, unsigned long flags)
2149{ 2176{
2150 2177
2151 if (!rcu_is_nocb_cpu(rdp->cpu)) 2178 if (!rcu_is_nocb_cpu(rdp->cpu))
2152 return 0; 2179 return 0;
2153 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2180 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2181 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2182 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2156 (unsigned long)rhp->func, 2183 (unsigned long)rhp->func,
@@ -2168,7 +2195,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2168 * not a no-CBs CPU. 2195 * not a no-CBs CPU.
2169 */ 2196 */
2170static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2197static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2171 struct rcu_data *rdp) 2198 struct rcu_data *rdp,
2199 unsigned long flags)
2172{ 2200{
2173 long ql = rsp->qlen; 2201 long ql = rsp->qlen;
2174 long qll = rsp->qlen_lazy; 2202 long qll = rsp->qlen_lazy;
@@ -2182,14 +2210,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2182 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 2210 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2183 if (rsp->orphan_donelist != NULL) { 2211 if (rsp->orphan_donelist != NULL) {
2184 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, 2212 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2185 rsp->orphan_donetail, ql, qll); 2213 rsp->orphan_donetail, ql, qll, flags);
2186 ql = qll = 0; 2214 ql = qll = 0;
2187 rsp->orphan_donelist = NULL; 2215 rsp->orphan_donelist = NULL;
2188 rsp->orphan_donetail = &rsp->orphan_donelist; 2216 rsp->orphan_donetail = &rsp->orphan_donelist;
2189 } 2217 }
2190 if (rsp->orphan_nxtlist != NULL) { 2218 if (rsp->orphan_nxtlist != NULL) {
2191 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, 2219 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2192 rsp->orphan_nxttail, ql, qll); 2220 rsp->orphan_nxttail, ql, qll, flags);
2193 ql = qll = 0; 2221 ql = qll = 0;
2194 rsp->orphan_nxtlist = NULL; 2222 rsp->orphan_nxtlist = NULL;
2195 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2223 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
@@ -2209,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2209 struct rcu_node *rnp = rdp->mynode; 2237 struct rcu_node *rnp = rdp->mynode;
2210 2238
2211 raw_spin_lock_irqsave(&rnp->lock, flags); 2239 raw_spin_lock_irqsave(&rnp->lock, flags);
2240 smp_mb__after_unlock_lock();
2212 c = rcu_start_future_gp(rnp, rdp); 2241 c = rcu_start_future_gp(rnp, rdp);
2213 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2242 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2214 2243
@@ -2250,6 +2279,7 @@ static int rcu_nocb_kthread(void *arg)
2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2279 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2251 TPS("Sleep")); 2280 TPS("Sleep"));
2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2281 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
 2282 /* Memory barrier provided by xchg() below. */
2253 } else if (firsttime) { 2283 } else if (firsttime) {
2254 firsttime = 0; 2284 firsttime = 0;
2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2285 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -2310,6 +2340,22 @@ static int rcu_nocb_kthread(void *arg)
2310 return 0; 2340 return 0;
2311} 2341}
2312 2342
2343/* Is a deferred wakeup of rcu_nocb_kthread() required? */
2344static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2345{
2346 return ACCESS_ONCE(rdp->nocb_defer_wakeup);
2347}
2348
2349/* Do a deferred wakeup of rcu_nocb_kthread(). */
2350static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2351{
2352 if (!rcu_nocb_need_deferred_wakeup(rdp))
2353 return;
2354 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
2355 wake_up(&rdp->nocb_wq);
2356 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
2357}
2358
2313/* Initialize per-rcu_data variables for no-CBs CPUs. */ 2359/* Initialize per-rcu_data variables for no-CBs CPUs. */
2314static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2360static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2315{ 2361{
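
The two helpers added above complete a defer-and-pay-later scheme: when __call_rcu_nocb_enqueue() runs with interrupts disabled it cannot safely wake the rcuo kthread, so it merely records that a wakeup is owed, and a later, safe context pays it off via do_nocb_deferred_wakeup(). A rough standalone model of the pattern using C11 atomics; the kernel uses ACCESS_ONCE() on a plain flag plus a wait queue instead:

#include <stdatomic.h>
#include <stdbool.h>

struct cb_queue {
	atomic_bool defer_wakeup;	/* models rdp->nocb_defer_wakeup */
	/* ... callback list, wait queue, ... */
};

/* Enqueue path: wake immediately only when it is safe to do so. */
void enqueue_cb(struct cb_queue *q, bool irqs_disabled)
{
	/* ... link the callback onto the queue ... */
	if (!irqs_disabled) {
		/* wake_up(&q->wq); */
	} else {
		atomic_store(&q->defer_wakeup, true);
	}
}

/* Later, from a context where waking is legal, pay off the deferred wakeup. */
void deferred_wakeup(struct cb_queue *q)
{
	if (!atomic_exchange(&q->defer_wakeup, false))
		return;
	/* wake_up(&q->wq); */
}
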
@@ -2365,13 +2411,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2365} 2411}
2366 2412
2367static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2413static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2368 bool lazy) 2414 bool lazy, unsigned long flags)
2369{ 2415{
2370 return 0; 2416 return 0;
2371} 2417}
2372 2418
2373static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2419static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2374 struct rcu_data *rdp) 2420 struct rcu_data *rdp,
2421 unsigned long flags)
2375{ 2422{
2376 return 0; 2423 return 0;
2377} 2424}
@@ -2380,6 +2427,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2380{ 2427{
2381} 2428}
2382 2429
2430static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2431{
2432 return false;
2433}
2434
2435static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2436{
2437}
2438
2383static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2439static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2384{ 2440{
2385} 2441}
@@ -2829,3 +2885,23 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2829} 2885}
2830 2886
2831#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 2887#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2888
2889/*
2890 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
2891 * grace-period kthread will do force_quiescent_state() processing?
2892 * The idea is to avoid waking up RCU core processing on such a
2893 * CPU unless the grace period has extended for too long.
2894 *
2895 * This code relies on the fact that all NO_HZ_FULL CPUs are also
2896 * CONFIG_RCU_NOCB_CPUs.
2897 */
2898static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2899{
2900#ifdef CONFIG_NO_HZ_FULL
2901 if (tick_nohz_full_cpu(smp_processor_id()) &&
2902 (!rcu_gp_in_progress(rsp) ||
2903 ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
2904 return 1;
2905#endif /* #ifdef CONFIG_NO_HZ_FULL */
2906 return 0;
2907}
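
rcu_nohz_full_cpu() leaves a NO_HZ_FULL CPU alone unless the current grace period has already been running for roughly HZ jiffies, using a wraparound-safe comparison. A small standalone illustration of that comparison style; the macro below is illustrative, standing in for the kernel's ULONG_CMP_LT():

#include <stdio.h>

/* Wraparound-safe "a < b" for free-running counters, in the spirit of the
 * kernel's ULONG_CMP_LT(): look at the sign of the modular difference. */
#define CMP_LT(a, b)	((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long hz = 250;				/* pretend CONFIG_HZ=250 */
	unsigned long jiffies = (unsigned long)-5;	/* counter about to wrap */
	unsigned long gp_start = jiffies - 100;		/* GP began 100 ticks ago */

	/* Still within the first HZ of the grace period, even across the wrap. */
	printf("%d\n", CMP_LT(jiffies, gp_start + hz));	/* prints 1 */
	return 0;
}
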
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3596797b7e46..4def475336d4 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -364,9 +364,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
364 rdp->n_rp_report_qs, 364 rdp->n_rp_report_qs,
365 rdp->n_rp_cb_ready, 365 rdp->n_rp_cb_ready,
366 rdp->n_rp_cpu_needs_gp); 366 rdp->n_rp_cpu_needs_gp);
367 seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", 367 seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw=%ld\n",
368 rdp->n_rp_gp_completed, 368 rdp->n_rp_gp_completed,
369 rdp->n_rp_gp_started, 369 rdp->n_rp_gp_started,
370 rdp->n_rp_nocb_defer_wakeup,
370 rdp->n_rp_need_nothing); 371 rdp->n_rp_need_nothing);
371} 372}
372 373
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 6cb3dff89e2b..802365ccd591 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -128,6 +128,11 @@ struct lockdep_map rcu_sched_lock_map =
128 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 128 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
129EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 129EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
130 130
131static struct lock_class_key rcu_callback_key;
132struct lockdep_map rcu_callback_map =
133 STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
134EXPORT_SYMBOL_GPL(rcu_callback_map);
135
131int notrace debug_lockdep_rcu_enabled(void) 136int notrace debug_lockdep_rcu_enabled(void)
132{ 137{
133 return rcu_scheduler_active && debug_locks && 138 return rcu_scheduler_active && debug_locks &&
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f813b3474646..662c83fc16b7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb)
104} 104}
105EXPORT_SYMBOL(unregister_reboot_notifier); 105EXPORT_SYMBOL(unregister_reboot_notifier);
106 106
107static void migrate_to_reboot_cpu(void) 107void migrate_to_reboot_cpu(void)
108{ 108{
109 /* The boot cpu is always logical cpu 0 */ 109 /* The boot cpu is always logical cpu 0 */
110 int cpu = reboot_cpu; 110 int cpu = reboot_cpu;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7b621409cf15..9a95c8c2af2a 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
15obj-y += wait.o completion.o 16obj-y += wait.o completion.o
16obj-$(CONFIG_SMP) += cpupri.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
18obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
19obj-$(CONFIG_SCHED_DEBUG) += debug.o 20obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c3ae1446461c..6bd6a6731b21 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -26,9 +26,10 @@
26 * at 0 on boot (but people really shouldn't rely on that). 26 * at 0 on boot (but people really shouldn't rely on that).
27 * 27 *
28 * cpu_clock(i) -- can be used from any context, including NMI. 28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu. 29 * local_clock() -- is cpu_clock() on the current cpu.
31 * 30 *
31 * sched_clock_cpu(i)
32 *
32 * How: 33 * How:
33 * 34 *
34 * The implementation either uses sched_clock() when 35 * The implementation either uses sched_clock() when
@@ -50,15 +51,6 @@
50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 51 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
51 * that is otherwise invisible (TSC gets stopped). 52 * that is otherwise invisible (TSC gets stopped).
52 * 53 *
53 *
54 * Notes:
55 *
56 * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
62 */ 54 */
63#include <linux/spinlock.h> 55#include <linux/spinlock.h>
64#include <linux/hardirq.h> 56#include <linux/hardirq.h>
@@ -66,6 +58,8 @@
66#include <linux/percpu.h> 58#include <linux/percpu.h>
67#include <linux/ktime.h> 59#include <linux/ktime.h>
68#include <linux/sched.h> 60#include <linux/sched.h>
61#include <linux/static_key.h>
62#include <linux/workqueue.h>
69 63
70/* 64/*
71 * Scheduler clock - returns current time in nanosec units. 65 * Scheduler clock - returns current time in nanosec units.
@@ -82,7 +76,37 @@ EXPORT_SYMBOL_GPL(sched_clock);
82__read_mostly int sched_clock_running; 76__read_mostly int sched_clock_running;
83 77
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 78#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 79static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
80
81int sched_clock_stable(void)
82{
83 if (static_key_false(&__sched_clock_stable))
84 return false;
85 return true;
86}
87
88void set_sched_clock_stable(void)
89{
90 if (!sched_clock_stable())
91 static_key_slow_dec(&__sched_clock_stable);
92}
93
94static void __clear_sched_clock_stable(struct work_struct *work)
95{
96 /* XXX worry about clock continuity */
97 if (sched_clock_stable())
98 static_key_slow_inc(&__sched_clock_stable);
99}
100
101static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
102
103void clear_sched_clock_stable(void)
104{
105 if (keventd_up())
106 schedule_work(&sched_clock_work);
107 else
108 __clear_sched_clock_stable(&sched_clock_work);
109}
86 110
87struct sched_clock_data { 111struct sched_clock_data {
88 u64 tick_raw; 112 u64 tick_raw;
@@ -242,20 +266,20 @@ u64 sched_clock_cpu(int cpu)
242 struct sched_clock_data *scd; 266 struct sched_clock_data *scd;
243 u64 clock; 267 u64 clock;
244 268
245 WARN_ON_ONCE(!irqs_disabled()); 269 if (sched_clock_stable())
246
247 if (sched_clock_stable)
248 return sched_clock(); 270 return sched_clock();
249 271
250 if (unlikely(!sched_clock_running)) 272 if (unlikely(!sched_clock_running))
251 return 0ull; 273 return 0ull;
252 274
275 preempt_disable();
253 scd = cpu_sdc(cpu); 276 scd = cpu_sdc(cpu);
254 277
255 if (cpu != smp_processor_id()) 278 if (cpu != smp_processor_id())
256 clock = sched_clock_remote(scd); 279 clock = sched_clock_remote(scd);
257 else 280 else
258 clock = sched_clock_local(scd); 281 clock = sched_clock_local(scd);
282 preempt_enable();
259 283
260 return clock; 284 return clock;
261} 285}
@@ -265,7 +289,7 @@ void sched_clock_tick(void)
265 struct sched_clock_data *scd; 289 struct sched_clock_data *scd;
266 u64 now, now_gtod; 290 u64 now, now_gtod;
267 291
268 if (sched_clock_stable) 292 if (sched_clock_stable())
269 return; 293 return;
270 294
271 if (unlikely(!sched_clock_running)) 295 if (unlikely(!sched_clock_running))
@@ -316,14 +340,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
316 */ 340 */
317u64 cpu_clock(int cpu) 341u64 cpu_clock(int cpu)
318{ 342{
319 u64 clock; 343 if (static_key_false(&__sched_clock_stable))
320 unsigned long flags; 344 return sched_clock_cpu(cpu);
321
322 local_irq_save(flags);
323 clock = sched_clock_cpu(cpu);
324 local_irq_restore(flags);
325 345
326 return clock; 346 return sched_clock();
327} 347}
328 348
329/* 349/*
@@ -335,14 +355,10 @@ u64 cpu_clock(int cpu)
335 */ 355 */
336u64 local_clock(void) 356u64 local_clock(void)
337{ 357{
338 u64 clock; 358 if (static_key_false(&__sched_clock_stable))
339 unsigned long flags; 359 return sched_clock_cpu(raw_smp_processor_id());
340 360
341 local_irq_save(flags); 361 return sched_clock();
342 clock = sched_clock_cpu(smp_processor_id());
343 local_irq_restore(flags);
344
345 return clock;
346} 362}
347 363
348#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 364#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
@@ -362,12 +378,12 @@ u64 sched_clock_cpu(int cpu)
362 378
363u64 cpu_clock(int cpu) 379u64 cpu_clock(int cpu)
364{ 380{
365 return sched_clock_cpu(cpu); 381 return sched_clock();
366} 382}
367 383
368u64 local_clock(void) 384u64 local_clock(void)
369{ 385{
370 return sched_clock_cpu(0); 386 return sched_clock();
371} 387}
372 388
373#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 389#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1808606ee5f..4d6964e49711 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running;
296 */ 296 */
297int sysctl_sched_rt_runtime = 950000; 297int sysctl_sched_rt_runtime = 950000;
298 298
299
300
301/* 299/*
302 * __task_rq_lock - lock the rq @p resides on. 300 * __task_rq_lock - lock the rq @p resides on.
303 */ 301 */
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)
899{ 897{
900 int prio; 898 int prio;
901 899
902 if (task_has_rt_policy(p)) 900 if (task_has_dl_policy(p))
901 prio = MAX_DL_PRIO-1;
902 else if (task_has_rt_policy(p))
903 prio = MAX_RT_PRIO-1 - p->rt_priority; 903 prio = MAX_RT_PRIO-1 - p->rt_priority;
904 else 904 else
905 prio = __normal_prio(p); 905 prio = __normal_prio(p);
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
945 if (prev_class->switched_from) 945 if (prev_class->switched_from)
946 prev_class->switched_from(rq, p); 946 prev_class->switched_from(rq, p);
947 p->sched_class->switched_to(rq, p); 947 p->sched_class->switched_to(rq, p);
948 } else if (oldprio != p->prio) 948 } else if (oldprio != p->prio || dl_task(p))
949 p->sched_class->prio_changed(rq, p, oldprio); 949 p->sched_class->prio_changed(rq, p, oldprio);
950} 950}
951 951
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out; 1109 goto out;
1110 1110
1111 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1112 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112 1113
1113out: 1114out:
@@ -1499,8 +1500,7 @@ void scheduler_ipi(void)
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send 1500 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI. 1501 * this IPI.
1501 */ 1502 */
1502 if (tif_need_resched()) 1503 preempt_fold_need_resched();
1503 set_preempt_need_resched();
1504 1504
1505 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1506 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
@@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif 1718#endif
1719 1719
1720 RB_CLEAR_NODE(&p->dl.rb_node);
1721 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1722 p->dl.dl_runtime = p->dl.runtime = 0;
1723 p->dl.dl_deadline = p->dl.deadline = 0;
1724 p->dl.dl_period = 0;
1725 p->dl.flags = 0;
1726
1720 INIT_LIST_HEAD(&p->rt.run_list); 1727 INIT_LIST_HEAD(&p->rt.run_list);
1721 1728
1722#ifdef CONFIG_PREEMPT_NOTIFIERS 1729#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1768,7 +1775,7 @@ void set_numabalancing_state(bool enabled)
1768/* 1775/*
1769 * fork()/clone()-time setup: 1776 * fork()/clone()-time setup:
1770 */ 1777 */
1771void sched_fork(unsigned long clone_flags, struct task_struct *p) 1778int sched_fork(unsigned long clone_flags, struct task_struct *p)
1772{ 1779{
1773 unsigned long flags; 1780 unsigned long flags;
1774 int cpu = get_cpu(); 1781 int cpu = get_cpu();
@@ -1790,7 +1797,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1790 * Revert to default priority/policy on fork if requested. 1797 * Revert to default priority/policy on fork if requested.
1791 */ 1798 */
1792 if (unlikely(p->sched_reset_on_fork)) { 1799 if (unlikely(p->sched_reset_on_fork)) {
1793 if (task_has_rt_policy(p)) { 1800 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1794 p->policy = SCHED_NORMAL; 1801 p->policy = SCHED_NORMAL;
1795 p->static_prio = NICE_TO_PRIO(0); 1802 p->static_prio = NICE_TO_PRIO(0);
1796 p->rt_priority = 0; 1803 p->rt_priority = 0;
@@ -1807,8 +1814,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1807 p->sched_reset_on_fork = 0; 1814 p->sched_reset_on_fork = 0;
1808 } 1815 }
1809 1816
1810 if (!rt_prio(p->prio)) 1817 if (dl_prio(p->prio)) {
1818 put_cpu();
1819 return -EAGAIN;
1820 } else if (rt_prio(p->prio)) {
1821 p->sched_class = &rt_sched_class;
1822 } else {
1811 p->sched_class = &fair_sched_class; 1823 p->sched_class = &fair_sched_class;
1824 }
1812 1825
1813 if (p->sched_class->task_fork) 1826 if (p->sched_class->task_fork)
1814 p->sched_class->task_fork(p); 1827 p->sched_class->task_fork(p);
@@ -1834,11 +1847,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1834 init_task_preempt_count(p); 1847 init_task_preempt_count(p);
1835#ifdef CONFIG_SMP 1848#ifdef CONFIG_SMP
1836 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1849 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1850 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1837#endif 1851#endif
1838 1852
1839 put_cpu(); 1853 put_cpu();
1854 return 0;
1855}
1856
1857unsigned long to_ratio(u64 period, u64 runtime)
1858{
1859 if (runtime == RUNTIME_INF)
1860 return 1ULL << 20;
1861
1862 /*
1863 * Doing this here saves a lot of checks in all
1864 * the calling paths, and returning zero seems
1865 * safe for them anyway.
1866 */
1867 if (period == 0)
1868 return 0;
1869
1870 return div64_u64(runtime << 20, period);
1871}
1872
1873#ifdef CONFIG_SMP
1874inline struct dl_bw *dl_bw_of(int i)
1875{
1876 return &cpu_rq(i)->rd->dl_bw;
1877}
1878
1879static inline int dl_bw_cpus(int i)
1880{
1881 struct root_domain *rd = cpu_rq(i)->rd;
1882 int cpus = 0;
1883
1884 for_each_cpu_and(i, rd->span, cpu_active_mask)
1885 cpus++;
1886
1887 return cpus;
1888}
1889#else
1890inline struct dl_bw *dl_bw_of(int i)
1891{
1892 return &cpu_rq(i)->dl.dl_bw;
1893}
1894
1895static inline int dl_bw_cpus(int i)
1896{
1897 return 1;
1898}
1899#endif
1900
1901static inline
1902void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
1903{
1904 dl_b->total_bw -= tsk_bw;
1905}
1906
1907static inline
1908void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
1909{
1910 dl_b->total_bw += tsk_bw;
1911}
1912
1913static inline
1914bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
1915{
1916 return dl_b->bw != -1 &&
1917 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
1918}
1919
1920/*
1921 * We must be sure that accepting a new task (or allowing changing the
1922 * parameters of an existing one) is consistent with the bandwidth
1923 * constraints. If yes, this function also accordingly updates the currently
1924 * allocated bandwidth to reflect the new situation.
1925 *
1926 * This function is called while holding p's rq->lock.
1927 */
1928static int dl_overflow(struct task_struct *p, int policy,
1929 const struct sched_attr *attr)
1930{
1931
1932 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1933 u64 period = attr->sched_period;
1934 u64 runtime = attr->sched_runtime;
1935 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1936 int cpus, err = -1;
1937
1938 if (new_bw == p->dl.dl_bw)
1939 return 0;
1940
1941 /*
 1942 * Whether a task enters, leaves, or stays -deadline but changes its
 1943 * parameters, we may need to update the total allocated bandwidth of
 1944 * the container accordingly.
1945 */
1946 raw_spin_lock(&dl_b->lock);
1947 cpus = dl_bw_cpus(task_cpu(p));
1948 if (dl_policy(policy) && !task_has_dl_policy(p) &&
1949 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1950 __dl_add(dl_b, new_bw);
1951 err = 0;
1952 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
1953 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1954 __dl_clear(dl_b, p->dl.dl_bw);
1955 __dl_add(dl_b, new_bw);
1956 err = 0;
1957 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1958 __dl_clear(dl_b, p->dl.dl_bw);
1959 err = 0;
1960 }
1961 raw_spin_unlock(&dl_b->lock);
1962
1963 return err;
1840} 1964}
1841 1965
1966extern void init_dl_bw(struct dl_bw *dl_b);
1967
1842/* 1968/*
1843 * wake_up_new_task - wake up a newly created task for the first time. 1969 * wake_up_new_task - wake up a newly created task for the first time.
1844 * 1970 *
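
dl_overflow() is the SCHED_DEADLINE admission test: each task's utilization is stored by to_ratio() as runtime/period in 20-bit fixed point, summed per root domain, and a request is refused when the sum would exceed the per-CPU cap times the number of CPUs. A simplified standalone sketch of the arithmetic; the 95% cap and the task parameters below are illustrative:

#include <stdint.h>
#include <stdio.h>

/* Simplified to_ratio(): utilization as runtime/period in <<20 fixed point. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (period == 0)
		return 0;
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t cap = to_ratio(1000000000ULL, 950000000ULL);	/* 95% per CPU */
	uint64_t task_bw = to_ratio(100000000ULL, 30000000ULL);	/* 30ms every 100ms */
	uint64_t total = 0;
	int cpus = 4;

	/* Admit tasks one by one, mimicking __dl_overflow()/__dl_add(). */
	for (int i = 0; ; i++) {
		if (cap * cpus < total + task_bw) {
			printf("task %d rejected (-EBUSY)\n", i);
			break;
		}
		total += task_bw;
	}
	printf("admitted %llu of %llu fixed-point bandwidth\n",
	       (unsigned long long)total, (unsigned long long)(cap * cpus));
	return 0;
}
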
@@ -2003,6 +2129,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2003 if (unlikely(prev_state == TASK_DEAD)) { 2129 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev); 2130 task_numa_free(prev);
2005 2131
2132 if (prev->sched_class->task_dead)
2133 prev->sched_class->task_dead(prev);
2134
2006 /* 2135 /*
2007 * Remove function-return probe instances associated with this 2136 * Remove function-return probe instances associated with this
2008 * task and put them back on the free list. 2137 * task and put them back on the free list.
@@ -2296,7 +2425,7 @@ void scheduler_tick(void)
2296 2425
2297#ifdef CONFIG_SMP 2426#ifdef CONFIG_SMP
2298 rq->idle_balance = idle_cpu(cpu); 2427 rq->idle_balance = idle_cpu(cpu);
2299 trigger_load_balance(rq, cpu); 2428 trigger_load_balance(rq);
2300#endif 2429#endif
2301 rq_last_tick_reset(rq); 2430 rq_last_tick_reset(rq);
2302} 2431}
@@ -2414,10 +2543,10 @@ static inline void schedule_debug(struct task_struct *prev)
2414{ 2543{
2415 /* 2544 /*
2416 * Test if we are atomic. Since do_exit() needs to call into 2545 * Test if we are atomic. Since do_exit() needs to call into
2417 * schedule() atomically, we ignore that path for now. 2546 * schedule() atomically, we ignore that path. Otherwise whine
2418 * Otherwise, whine if we are scheduling when we should not be. 2547 * if we are scheduling when we should not.
2419 */ 2548 */
2420 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2549 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2421 __schedule_bug(prev); 2550 __schedule_bug(prev);
2422 rcu_sleep_check(); 2551 rcu_sleep_check();
2423 2552
@@ -2660,6 +2789,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
2660 } while (need_resched()); 2789 } while (need_resched());
2661} 2790}
2662EXPORT_SYMBOL(preempt_schedule); 2791EXPORT_SYMBOL(preempt_schedule);
2792#endif /* CONFIG_PREEMPT */
2663 2793
2664/* 2794/*
2665 * this is the entry point to schedule() from kernel preemption 2795 * this is the entry point to schedule() from kernel preemption
@@ -2693,8 +2823,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
2693 exception_exit(prev_state); 2823 exception_exit(prev_state);
2694} 2824}
2695 2825
2696#endif /* CONFIG_PREEMPT */
2697
2698int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2826int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2699 void *key) 2827 void *key)
2700{ 2828{
@@ -2762,11 +2890,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
2762 */ 2890 */
2763void rt_mutex_setprio(struct task_struct *p, int prio) 2891void rt_mutex_setprio(struct task_struct *p, int prio)
2764{ 2892{
2765 int oldprio, on_rq, running; 2893 int oldprio, on_rq, running, enqueue_flag = 0;
2766 struct rq *rq; 2894 struct rq *rq;
2767 const struct sched_class *prev_class; 2895 const struct sched_class *prev_class;
2768 2896
2769 BUG_ON(prio < 0 || prio > MAX_PRIO); 2897 BUG_ON(prio > MAX_PRIO);
2770 2898
2771 rq = __task_rq_lock(p); 2899 rq = __task_rq_lock(p);
2772 2900
@@ -2789,6 +2917,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2789 } 2917 }
2790 2918
2791 trace_sched_pi_setprio(p, prio); 2919 trace_sched_pi_setprio(p, prio);
2920 p->pi_top_task = rt_mutex_get_top_task(p);
2792 oldprio = p->prio; 2921 oldprio = p->prio;
2793 prev_class = p->sched_class; 2922 prev_class = p->sched_class;
2794 on_rq = p->on_rq; 2923 on_rq = p->on_rq;
@@ -2798,23 +2927,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2798 if (running) 2927 if (running)
2799 p->sched_class->put_prev_task(rq, p); 2928 p->sched_class->put_prev_task(rq, p);
2800 2929
2801 if (rt_prio(prio)) 2930 /*
 2931 * Boosting conditions are:
2932 * 1. -rt task is running and holds mutex A
2933 * --> -dl task blocks on mutex A
2934 *
2935 * 2. -dl task is running and holds mutex A
2936 * --> -dl task blocks on mutex A and could preempt the
2937 * running task
2938 */
2939 if (dl_prio(prio)) {
2940 if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2941 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2942 p->dl.dl_boosted = 1;
2943 p->dl.dl_throttled = 0;
2944 enqueue_flag = ENQUEUE_REPLENISH;
2945 } else
2946 p->dl.dl_boosted = 0;
2947 p->sched_class = &dl_sched_class;
2948 } else if (rt_prio(prio)) {
2949 if (dl_prio(oldprio))
2950 p->dl.dl_boosted = 0;
2951 if (oldprio < prio)
2952 enqueue_flag = ENQUEUE_HEAD;
2802 p->sched_class = &rt_sched_class; 2953 p->sched_class = &rt_sched_class;
2803 else 2954 } else {
2955 if (dl_prio(oldprio))
2956 p->dl.dl_boosted = 0;
2804 p->sched_class = &fair_sched_class; 2957 p->sched_class = &fair_sched_class;
2958 }
2805 2959
2806 p->prio = prio; 2960 p->prio = prio;
2807 2961
2808 if (running) 2962 if (running)
2809 p->sched_class->set_curr_task(rq); 2963 p->sched_class->set_curr_task(rq);
2810 if (on_rq) 2964 if (on_rq)
2811 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 2965 enqueue_task(rq, p, enqueue_flag);
2812 2966
2813 check_class_changed(rq, p, prev_class, oldprio); 2967 check_class_changed(rq, p, prev_class, oldprio);
2814out_unlock: 2968out_unlock:
2815 __task_rq_unlock(rq); 2969 __task_rq_unlock(rq);
2816} 2970}
2817#endif 2971#endif
2972
2818void set_user_nice(struct task_struct *p, long nice) 2973void set_user_nice(struct task_struct *p, long nice)
2819{ 2974{
2820 int old_prio, delta, on_rq; 2975 int old_prio, delta, on_rq;
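
The rt_mutex_setprio() changes extend priority inheritance to SCHED_DEADLINE: a lock holder is moved to the dl class and marked dl_boosted when its top waiter is a -deadline task that would otherwise preempt it, i.e. one with the earlier absolute deadline. A toy standalone model of that decision only; the real code leaves the parameter juggling to the dl scheduling class and dl_entity_preempt():

#include <stdbool.h>
#include <stdint.h>

struct toy_task {
	bool	 is_dl;
	uint64_t deadline;	/* absolute; only meaningful when is_dl */
};

/* Earlier absolute deadline wins; a -dl task always beats a non-dl one. */
bool toy_dl_preempts(const struct toy_task *a, const struct toy_task *b)
{
	return a->is_dl && (!b->is_dl || (int64_t)(a->deadline - b->deadline) < 0);
}

/* Boost a lock holder with the deadline of its top waiter when needed. */
void toy_boost(struct toy_task *holder, const struct toy_task *top_waiter)
{
	if (toy_dl_preempts(top_waiter, holder)) {
		holder->is_dl = true;
		holder->deadline = top_waiter->deadline;
	}
}
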
@@ -2832,9 +2987,9 @@ void set_user_nice(struct task_struct *p, long nice)
2832 * The RT priorities are set via sched_setscheduler(), but we still 2987 * The RT priorities are set via sched_setscheduler(), but we still
2833 * allow the 'normal' nice value to be set - but as expected 2988 * allow the 'normal' nice value to be set - but as expected
2834 * it wont have any effect on scheduling until the task is 2989 * it wont have any effect on scheduling until the task is
2835 * SCHED_FIFO/SCHED_RR: 2990 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
2836 */ 2991 */
2837 if (task_has_rt_policy(p)) { 2992 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2838 p->static_prio = NICE_TO_PRIO(nice); 2993 p->static_prio = NICE_TO_PRIO(nice);
2839 goto out_unlock; 2994 goto out_unlock;
2840 } 2995 }
@@ -2989,22 +3144,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)
2989 return pid ? find_task_by_vpid(pid) : current; 3144 return pid ? find_task_by_vpid(pid) : current;
2990} 3145}
2991 3146
2992/* Actually do priority change: must hold rq lock. */ 3147/*
3148 * This function initializes the sched_dl_entity of a newly becoming
3149 * SCHED_DEADLINE task.
3150 *
3151 * Only the static values are considered here, the actual runtime and the
3152 * absolute deadline will be properly calculated when the task is enqueued
3153 * for the first time with its new policy.
3154 */
2993static void 3155static void
2994__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3156__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3157{
3158 struct sched_dl_entity *dl_se = &p->dl;
3159
3160 init_dl_task_timer(dl_se);
3161 dl_se->dl_runtime = attr->sched_runtime;
3162 dl_se->dl_deadline = attr->sched_deadline;
3163 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3164 dl_se->flags = attr->sched_flags;
3165 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3166 dl_se->dl_throttled = 0;
3167 dl_se->dl_new = 1;
3168}
3169
3170/* Actually do priority change: must hold pi & rq lock. */
3171static void __setscheduler(struct rq *rq, struct task_struct *p,
3172 const struct sched_attr *attr)
2995{ 3173{
3174 int policy = attr->sched_policy;
3175
3176 if (policy == -1) /* setparam */
3177 policy = p->policy;
3178
2996 p->policy = policy; 3179 p->policy = policy;
2997 p->rt_priority = prio; 3180
3181 if (dl_policy(policy))
3182 __setparam_dl(p, attr);
3183 else if (fair_policy(policy))
3184 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3185
3186 /*
3187 * __sched_setscheduler() ensures attr->sched_priority == 0 when
3188 * !rt_policy. Always setting this ensures that things like
3189 * getparam()/getattr() don't report silly values for !rt tasks.
3190 */
3191 p->rt_priority = attr->sched_priority;
3192
2998 p->normal_prio = normal_prio(p); 3193 p->normal_prio = normal_prio(p);
2999 /* we are holding p->pi_lock already */
3000 p->prio = rt_mutex_getprio(p); 3194 p->prio = rt_mutex_getprio(p);
3001 if (rt_prio(p->prio)) 3195
3196 if (dl_prio(p->prio))
3197 p->sched_class = &dl_sched_class;
3198 else if (rt_prio(p->prio))
3002 p->sched_class = &rt_sched_class; 3199 p->sched_class = &rt_sched_class;
3003 else 3200 else
3004 p->sched_class = &fair_sched_class; 3201 p->sched_class = &fair_sched_class;
3202
3005 set_load_weight(p); 3203 set_load_weight(p);
3006} 3204}
3007 3205
3206static void
3207__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3208{
3209 struct sched_dl_entity *dl_se = &p->dl;
3210
3211 attr->sched_priority = p->rt_priority;
3212 attr->sched_runtime = dl_se->dl_runtime;
3213 attr->sched_deadline = dl_se->dl_deadline;
3214 attr->sched_period = dl_se->dl_period;
3215 attr->sched_flags = dl_se->flags;
3216}
3217
3218/*
3219 * This function validates the new parameters of a -deadline task.
 3220 * We require the deadline to be non-zero and greater than or equal
 3221 * to the runtime, and the period to be either zero or greater than
 3222 * or equal to the deadline. Furthermore, we have to be sure that
3223 * user parameters are above the internal resolution (1us); we
3224 * check sched_runtime only since it is always the smaller one.
3225 */
3226static bool
3227__checkparam_dl(const struct sched_attr *attr)
3228{
3229 return attr && attr->sched_deadline != 0 &&
3230 (attr->sched_period == 0 ||
3231 (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
3232 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
3233 attr->sched_runtime >= (2 << (DL_SCALE - 1));
3234}
3235
3008/* 3236/*
3009 * check the target process has a UID that matches the current process's 3237 * check the target process has a UID that matches the current process's
3010 */ 3238 */
@@ -3021,10 +3249,12 @@ static bool check_same_owner(struct task_struct *p)
3021 return match; 3249 return match;
3022} 3250}
3023 3251
3024static int __sched_setscheduler(struct task_struct *p, int policy, 3252static int __sched_setscheduler(struct task_struct *p,
3025 const struct sched_param *param, bool user) 3253 const struct sched_attr *attr,
3254 bool user)
3026{ 3255{
3027 int retval, oldprio, oldpolicy = -1, on_rq, running; 3256 int retval, oldprio, oldpolicy = -1, on_rq, running;
3257 int policy = attr->sched_policy;
3028 unsigned long flags; 3258 unsigned long flags;
3029 const struct sched_class *prev_class; 3259 const struct sched_class *prev_class;
3030 struct rq *rq; 3260 struct rq *rq;
@@ -3038,31 +3268,40 @@ recheck:
3038 reset_on_fork = p->sched_reset_on_fork; 3268 reset_on_fork = p->sched_reset_on_fork;
3039 policy = oldpolicy = p->policy; 3269 policy = oldpolicy = p->policy;
3040 } else { 3270 } else {
3041 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3271 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3042 policy &= ~SCHED_RESET_ON_FORK;
3043 3272
3044 if (policy != SCHED_FIFO && policy != SCHED_RR && 3273 if (policy != SCHED_DEADLINE &&
3274 policy != SCHED_FIFO && policy != SCHED_RR &&
3045 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3275 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3046 policy != SCHED_IDLE) 3276 policy != SCHED_IDLE)
3047 return -EINVAL; 3277 return -EINVAL;
3048 } 3278 }
3049 3279
3280 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3281 return -EINVAL;
3282
3050 /* 3283 /*
3051 * Valid priorities for SCHED_FIFO and SCHED_RR are 3284 * Valid priorities for SCHED_FIFO and SCHED_RR are
3052 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3285 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3053 * SCHED_BATCH and SCHED_IDLE is 0. 3286 * SCHED_BATCH and SCHED_IDLE is 0.
3054 */ 3287 */
3055 if (param->sched_priority < 0 || 3288 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3056 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3289 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3057 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3058 return -EINVAL; 3290 return -EINVAL;
3059 if (rt_policy(policy) != (param->sched_priority != 0)) 3291 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3292 (rt_policy(policy) != (attr->sched_priority != 0)))
3060 return -EINVAL; 3293 return -EINVAL;
3061 3294
3062 /* 3295 /*
3063 * Allow unprivileged RT tasks to decrease priority: 3296 * Allow unprivileged RT tasks to decrease priority:
3064 */ 3297 */
3065 if (user && !capable(CAP_SYS_NICE)) { 3298 if (user && !capable(CAP_SYS_NICE)) {
3299 if (fair_policy(policy)) {
3300 if (attr->sched_nice < TASK_NICE(p) &&
3301 !can_nice(p, attr->sched_nice))
3302 return -EPERM;
3303 }
3304
3066 if (rt_policy(policy)) { 3305 if (rt_policy(policy)) {
3067 unsigned long rlim_rtprio = 3306 unsigned long rlim_rtprio =
3068 task_rlimit(p, RLIMIT_RTPRIO); 3307 task_rlimit(p, RLIMIT_RTPRIO);
@@ -3072,8 +3311,8 @@ recheck:
3072 return -EPERM; 3311 return -EPERM;
3073 3312
3074 /* can't increase priority */ 3313 /* can't increase priority */
3075 if (param->sched_priority > p->rt_priority && 3314 if (attr->sched_priority > p->rt_priority &&
3076 param->sched_priority > rlim_rtprio) 3315 attr->sched_priority > rlim_rtprio)
3077 return -EPERM; 3316 return -EPERM;
3078 } 3317 }
3079 3318
@@ -3121,14 +3360,21 @@ recheck:
3121 /* 3360 /*
3122 * If not changing anything there's no need to proceed further: 3361 * If not changing anything there's no need to proceed further:
3123 */ 3362 */
3124 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3363 if (unlikely(policy == p->policy)) {
3125 param->sched_priority == p->rt_priority))) { 3364 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
3365 goto change;
3366 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3367 goto change;
3368 if (dl_policy(policy))
3369 goto change;
3370
3126 task_rq_unlock(rq, p, &flags); 3371 task_rq_unlock(rq, p, &flags);
3127 return 0; 3372 return 0;
3128 } 3373 }
3374change:
3129 3375
3130#ifdef CONFIG_RT_GROUP_SCHED
3131 if (user) { 3376 if (user) {
3377#ifdef CONFIG_RT_GROUP_SCHED
3132 /* 3378 /*
3133 * Do not allow realtime tasks into groups that have no runtime 3379 * Do not allow realtime tasks into groups that have no runtime
3134 * assigned. 3380 * assigned.
@@ -3139,8 +3385,24 @@ recheck:
3139 task_rq_unlock(rq, p, &flags); 3385 task_rq_unlock(rq, p, &flags);
3140 return -EPERM; 3386 return -EPERM;
3141 } 3387 }
3142 }
3143#endif 3388#endif
3389#ifdef CONFIG_SMP
3390 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3391 cpumask_t *span = rq->rd->span;
3392
3393 /*
3394 * Don't allow tasks with an affinity mask smaller than
3395 * the entire root_domain to become SCHED_DEADLINE. We
3396 * will also fail if there's no bandwidth available.
3397 */
3398 if (!cpumask_subset(span, &p->cpus_allowed) ||
3399 rq->rd->dl_bw.bw == 0) {
3400 task_rq_unlock(rq, p, &flags);
3401 return -EPERM;
3402 }
3403 }
3404#endif
3405 }
3144 3406
3145 /* recheck policy now with rq lock held */ 3407 /* recheck policy now with rq lock held */
3146 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3408 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3148,6 +3410,17 @@ recheck:
3148 task_rq_unlock(rq, p, &flags); 3410 task_rq_unlock(rq, p, &flags);
3149 goto recheck; 3411 goto recheck;
3150 } 3412 }
3413
3414 /*
3415 * If setscheduling to SCHED_DEADLINE (or changing the parameters
3416 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
3417 * is available.
3418 */
3419 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3420 task_rq_unlock(rq, p, &flags);
3421 return -EBUSY;
3422 }
3423
3151 on_rq = p->on_rq; 3424 on_rq = p->on_rq;
3152 running = task_current(rq, p); 3425 running = task_current(rq, p);
3153 if (on_rq) 3426 if (on_rq)
@@ -3159,7 +3432,7 @@ recheck:
3159 3432
3160 oldprio = p->prio; 3433 oldprio = p->prio;
3161 prev_class = p->sched_class; 3434 prev_class = p->sched_class;
3162 __setscheduler(rq, p, policy, param->sched_priority); 3435 __setscheduler(rq, p, attr);
3163 3436
3164 if (running) 3437 if (running)
3165 p->sched_class->set_curr_task(rq); 3438 p->sched_class->set_curr_task(rq);
@@ -3174,6 +3447,26 @@ recheck:
3174 return 0; 3447 return 0;
3175} 3448}
3176 3449
3450static int _sched_setscheduler(struct task_struct *p, int policy,
3451 const struct sched_param *param, bool check)
3452{
3453 struct sched_attr attr = {
3454 .sched_policy = policy,
3455 .sched_priority = param->sched_priority,
3456 .sched_nice = PRIO_TO_NICE(p->static_prio),
3457 };
3458
3459 /*
3460 * Fixup the legacy SCHED_RESET_ON_FORK hack
3461 */
3462 if (policy & SCHED_RESET_ON_FORK) {
3463 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3464 policy &= ~SCHED_RESET_ON_FORK;
3465 attr.sched_policy = policy;
3466 }
3467
3468 return __sched_setscheduler(p, &attr, check);
3469}
3177/** 3470/**
3178 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3471 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3179 * @p: the task in question. 3472 * @p: the task in question.
@@ -3187,10 +3480,16 @@ recheck:
3187int sched_setscheduler(struct task_struct *p, int policy, 3480int sched_setscheduler(struct task_struct *p, int policy,
3188 const struct sched_param *param) 3481 const struct sched_param *param)
3189{ 3482{
3190 return __sched_setscheduler(p, policy, param, true); 3483 return _sched_setscheduler(p, policy, param, true);
3191} 3484}
3192EXPORT_SYMBOL_GPL(sched_setscheduler); 3485EXPORT_SYMBOL_GPL(sched_setscheduler);
3193 3486
3487int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3488{
3489 return __sched_setscheduler(p, attr, true);
3490}
3491EXPORT_SYMBOL_GPL(sched_setattr);
3492
3194/** 3493/**
3195 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3494 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3196 * @p: the task in question. 3495 * @p: the task in question.
@@ -3207,7 +3506,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3207int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3506int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3208 const struct sched_param *param) 3507 const struct sched_param *param)
3209{ 3508{
3210 return __sched_setscheduler(p, policy, param, false); 3509 return _sched_setscheduler(p, policy, param, false);
3211} 3510}
3212 3511
3213static int 3512static int
@@ -3232,6 +3531,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3232 return retval; 3531 return retval;
3233} 3532}
3234 3533
3534/*
3535 * Mimics kernel/events/core.c perf_copy_attr().
3536 */
3537static int sched_copy_attr(struct sched_attr __user *uattr,
3538 struct sched_attr *attr)
3539{
3540 u32 size;
3541 int ret;
3542
3543 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3544 return -EFAULT;
3545
3546 /*
 3547 * zero the full structure, so that a short copy leaves the rest zeroed.
3548 */
3549 memset(attr, 0, sizeof(*attr));
3550
3551 ret = get_user(size, &uattr->size);
3552 if (ret)
3553 return ret;
3554
3555 if (size > PAGE_SIZE) /* silly large */
3556 goto err_size;
3557
3558 if (!size) /* abi compat */
3559 size = SCHED_ATTR_SIZE_VER0;
3560
3561 if (size < SCHED_ATTR_SIZE_VER0)
3562 goto err_size;
3563
3564 /*
3565 * If we're handed a bigger struct than we know of,
3566 * ensure all the unknown bits are 0 - i.e. new
3567 * user-space does not rely on any kernel feature
 3568 * extensions we don't know about yet.
3569 */
3570 if (size > sizeof(*attr)) {
3571 unsigned char __user *addr;
3572 unsigned char __user *end;
3573 unsigned char val;
3574
3575 addr = (void __user *)uattr + sizeof(*attr);
3576 end = (void __user *)uattr + size;
3577
3578 for (; addr < end; addr++) {
3579 ret = get_user(val, addr);
3580 if (ret)
3581 return ret;
3582 if (val)
3583 goto err_size;
3584 }
3585 size = sizeof(*attr);
3586 }
3587
3588 ret = copy_from_user(attr, uattr, size);
3589 if (ret)
3590 return -EFAULT;
3591
3592 /*
3593 * XXX: do we want to be lenient like existing syscalls; or do we want
3594 * to be strict and return an error on out-of-bounds values?
3595 */
3596 attr->sched_nice = clamp(attr->sched_nice, -20, 19);
3597
3598out:
3599 return ret;
3600
3601err_size:
3602 put_user(sizeof(*attr), &uattr->size);
3603 ret = -E2BIG;
3604 goto out;
3605}
3606
3235/** 3607/**
3236 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3608 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3237 * @pid: the pid in question. 3609 * @pid: the pid in question.
@@ -3263,6 +3635,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3263} 3635}
3264 3636
3265/** 3637/**
3638 * sys_sched_setattr - same as above, but with extended sched_attr
3639 * @pid: the pid in question.
3640 * @uattr: structure containing the extended parameters.
3641 */
3642SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
3643{
3644 struct sched_attr attr;
3645 struct task_struct *p;
3646 int retval;
3647
3648 if (!uattr || pid < 0)
3649 return -EINVAL;
3650
3651 if (sched_copy_attr(uattr, &attr))
3652 return -EFAULT;
3653
3654 rcu_read_lock();
3655 retval = -ESRCH;
3656 p = find_process_by_pid(pid);
3657 if (p != NULL)
3658 retval = sched_setattr(p, &attr);
3659 rcu_read_unlock();
3660
3661 return retval;
3662}
3663
3664/**
3266 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3665 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3267 * @pid: the pid in question. 3666 * @pid: the pid in question.
3268 * 3667 *
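
For reference, this is roughly how the new syscall would be exercised from user space. There is no libc wrapper, so the structure layout and the SCHED_DEADLINE policy value are spelled out locally; those values, and the availability of __NR_sched_setattr on a given architecture, are assumptions. Switching to SCHED_DEADLINE normally requires privilege (CAP_SYS_NICE), and later kernels added a flags argument that must be zero, which is why an explicit third 0 is passed below (harmless against the two-argument prototype shown in this hunk).

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Local copy of the layout introduced here (48 bytes = SCHED_ATTR_SIZE_VER0). */
struct sched_attr_user {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

#define SCHED_DEADLINE_POLICY 6		/* assumed value of SCHED_DEADLINE */

int main(void)
{
	struct sched_attr_user attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_DEADLINE_POLICY;
	attr.sched_runtime  = 10 * 1000 * 1000;		/* 10ms of budget ... */
	attr.sched_deadline = 30 * 1000 * 1000;		/* ... within 30ms ... */
	attr.sched_period   = 100 * 1000 * 1000;	/* ... every 100ms */

#ifdef __NR_sched_setattr
	if (syscall(__NR_sched_setattr, 0, &attr, 0))	/* pid 0 = calling thread */
		perror("sched_setattr");
	else
		puts("now running as SCHED_DEADLINE");
#else
	fprintf(stderr, "__NR_sched_setattr not known on this libc/arch\n");
#endif
	return 0;
}
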
@@ -3317,6 +3716,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3317 if (retval) 3716 if (retval)
3318 goto out_unlock; 3717 goto out_unlock;
3319 3718
3719 if (task_has_dl_policy(p)) {
3720 retval = -EINVAL;
3721 goto out_unlock;
3722 }
3320 lp.sched_priority = p->rt_priority; 3723 lp.sched_priority = p->rt_priority;
3321 rcu_read_unlock(); 3724 rcu_read_unlock();
3322 3725
@@ -3332,6 +3735,96 @@ out_unlock:
3332 return retval; 3735 return retval;
3333} 3736}
3334 3737
3738static int sched_read_attr(struct sched_attr __user *uattr,
3739 struct sched_attr *attr,
3740 unsigned int usize)
3741{
3742 int ret;
3743
3744 if (!access_ok(VERIFY_WRITE, uattr, usize))
3745 return -EFAULT;
3746
3747 /*
3748 * If we're handed a smaller struct than we know of,
3749 * ensure all the unknown bits are 0 - i.e. old
3750 * user-space does not get incomplete information.
3751 */
3752 if (usize < sizeof(*attr)) {
3753 unsigned char *addr;
3754 unsigned char *end;
3755
3756 addr = (void *)attr + usize;
3757 end = (void *)attr + sizeof(*attr);
3758
3759 for (; addr < end; addr++) {
3760 if (*addr)
3761 goto err_size;
3762 }
3763
3764 attr->size = usize;
3765 }
3766
3767 ret = copy_to_user(uattr, attr, usize);
3768 if (ret)
3769 return -EFAULT;
3770
3771out:
3772 return ret;
3773
3774err_size:
3775 ret = -E2BIG;
3776 goto out;
3777}
3778
3779/**
3780 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
3781 * @pid: the pid in question.
3782 * @uattr: structure containing the extended parameters.
3783 * @size: sizeof(attr) for fwd/bwd comp.
3784 */
3785SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3786 unsigned int, size)
3787{
3788 struct sched_attr attr = {
3789 .size = sizeof(struct sched_attr),
3790 };
3791 struct task_struct *p;
3792 int retval;
3793
3794 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3795 size < SCHED_ATTR_SIZE_VER0)
3796 return -EINVAL;
3797
3798 rcu_read_lock();
3799 p = find_process_by_pid(pid);
3800 retval = -ESRCH;
3801 if (!p)
3802 goto out_unlock;
3803
3804 retval = security_task_getscheduler(p);
3805 if (retval)
3806 goto out_unlock;
3807
3808 attr.sched_policy = p->policy;
3809 if (p->sched_reset_on_fork)
3810 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3811 if (task_has_dl_policy(p))
3812 __getparam_dl(p, &attr);
3813 else if (task_has_rt_policy(p))
3814 attr.sched_priority = p->rt_priority;
3815 else
3816 attr.sched_nice = TASK_NICE(p);
3817
3818 rcu_read_unlock();
3819
3820 retval = sched_read_attr(uattr, &attr, size);
3821 return retval;
3822
3823out_unlock:
3824 rcu_read_unlock();
3825 return retval;
3826}
3827
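The read side is symmetric: the caller passes the size of its own struct sched_attr and sched_read_attr() above trims or rejects accordingly. A companion sketch, under the same assumptions and helper definitions as the sched_setattr() sketch above:

	static int sched_getattr(pid_t pid, struct sched_attr *attr,
				 unsigned int size)
	{
		return syscall(__NR_sched_getattr, pid, attr, size);
	}

	/* Read back what the kernel currently thinks of the calling task. */
	static void dump_attr(void)
	{
		struct sched_attr attr;

		if (sched_getattr(0, &attr, sizeof(attr)) == 0)
			printf("policy=%u runtime=%llu deadline=%llu period=%llu\n",
			       attr.sched_policy,
			       (unsigned long long)attr.sched_runtime,
			       (unsigned long long)attr.sched_deadline,
			       (unsigned long long)attr.sched_period);
	}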
3335long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3828long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3336{ 3829{
3337 cpumask_var_t cpus_allowed, new_mask; 3830 cpumask_var_t cpus_allowed, new_mask;
@@ -3376,8 +3869,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3376 if (retval) 3869 if (retval)
3377 goto out_unlock; 3870 goto out_unlock;
3378 3871
3872
3379 cpuset_cpus_allowed(p, cpus_allowed); 3873 cpuset_cpus_allowed(p, cpus_allowed);
3380 cpumask_and(new_mask, in_mask, cpus_allowed); 3874 cpumask_and(new_mask, in_mask, cpus_allowed);
3875
3876 /*
3877 * Since bandwidth control happens on root_domain basis,
3878 * if admission test is enabled, we only admit -deadline
3879 * tasks allowed to run on all the CPUs in the task's
3880 * root_domain.
3881 */
3882#ifdef CONFIG_SMP
3883 if (task_has_dl_policy(p)) {
3884 const struct cpumask *span = task_rq(p)->rd->span;
3885
3886 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3887 retval = -EBUSY;
3888 goto out_unlock;
3889 }
3890 }
3891#endif
3381again: 3892again:
3382 retval = set_cpus_allowed_ptr(p, new_mask); 3893 retval = set_cpus_allowed_ptr(p, new_mask);
3383 3894
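The hunk above ties -deadline tasks into sched_setaffinity(): since bandwidth is accounted per root_domain, an already-admitted SCHED_DEADLINE task may not be confined to a strict subset of that domain while admission control is enabled, and the request is refused with -EBUSY. A short fragment of the expected userspace-visible behaviour (assumes _GNU_SOURCE, <sched.h>, <errno.h>, <stdio.h> and a task already running as SCHED_DEADLINE):

	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);	/* a strict subset of the root domain */

	if (sched_setaffinity(0, sizeof(set), &set) == -1 && errno == EBUSY)
		fprintf(stderr,
			"affinity of a -deadline task must span its root domain\n");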
@@ -3654,7 +4165,7 @@ again:
3654 } 4165 }
3655 4166
3656 double_rq_lock(rq, p_rq); 4167 double_rq_lock(rq, p_rq);
3657 while (task_rq(p) != p_rq) { 4168 if (task_rq(p) != p_rq) {
3658 double_rq_unlock(rq, p_rq); 4169 double_rq_unlock(rq, p_rq);
3659 goto again; 4170 goto again;
3660 } 4171 }
@@ -3743,6 +4254,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3743 case SCHED_RR: 4254 case SCHED_RR:
3744 ret = MAX_USER_RT_PRIO-1; 4255 ret = MAX_USER_RT_PRIO-1;
3745 break; 4256 break;
4257 case SCHED_DEADLINE:
3746 case SCHED_NORMAL: 4258 case SCHED_NORMAL:
3747 case SCHED_BATCH: 4259 case SCHED_BATCH:
3748 case SCHED_IDLE: 4260 case SCHED_IDLE:
@@ -3769,6 +4281,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
3769 case SCHED_RR: 4281 case SCHED_RR:
3770 ret = 1; 4282 ret = 1;
3771 break; 4283 break;
4284 case SCHED_DEADLINE:
3772 case SCHED_NORMAL: 4285 case SCHED_NORMAL:
3773 case SCHED_BATCH: 4286 case SCHED_BATCH:
3774 case SCHED_IDLE: 4287 case SCHED_IDLE:
@@ -4091,6 +4604,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
4091 4604
4092 /* TODO: This is not properly updating schedstats */ 4605 /* TODO: This is not properly updating schedstats */
4093 4606
4607 trace_sched_move_numa(p, curr_cpu, target_cpu);
4094 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4608 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4095} 4609}
4096 4610
@@ -4515,13 +5029,31 @@ static int sched_cpu_active(struct notifier_block *nfb,
4515static int sched_cpu_inactive(struct notifier_block *nfb, 5029static int sched_cpu_inactive(struct notifier_block *nfb,
4516 unsigned long action, void *hcpu) 5030 unsigned long action, void *hcpu)
4517{ 5031{
5032 unsigned long flags;
5033 long cpu = (long)hcpu;
5034
4518 switch (action & ~CPU_TASKS_FROZEN) { 5035 switch (action & ~CPU_TASKS_FROZEN) {
4519 case CPU_DOWN_PREPARE: 5036 case CPU_DOWN_PREPARE:
4520 set_cpu_active((long)hcpu, false); 5037 set_cpu_active(cpu, false);
5038
5039 /* explicitly allow suspend */
5040 if (!(action & CPU_TASKS_FROZEN)) {
5041 struct dl_bw *dl_b = dl_bw_of(cpu);
5042 bool overflow;
5043 int cpus;
5044
5045 raw_spin_lock_irqsave(&dl_b->lock, flags);
5046 cpus = dl_bw_cpus(cpu);
5047 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5048 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5049
5050 if (overflow)
5051 return notifier_from_errno(-EBUSY);
5052 }
4521 return NOTIFY_OK; 5053 return NOTIFY_OK;
4522 default:
4523 return NOTIFY_DONE;
4524 } 5054 }
5055
5056 return NOTIFY_DONE;
4525} 5057}
4526 5058
4527static int __init migration_init(void) 5059static int __init migration_init(void)
@@ -4740,6 +5272,8 @@ static void free_rootdomain(struct rcu_head *rcu)
4740 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5272 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
4741 5273
4742 cpupri_cleanup(&rd->cpupri); 5274 cpupri_cleanup(&rd->cpupri);
5275 cpudl_cleanup(&rd->cpudl);
5276 free_cpumask_var(rd->dlo_mask);
4743 free_cpumask_var(rd->rto_mask); 5277 free_cpumask_var(rd->rto_mask);
4744 free_cpumask_var(rd->online); 5278 free_cpumask_var(rd->online);
4745 free_cpumask_var(rd->span); 5279 free_cpumask_var(rd->span);
@@ -4762,7 +5296,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
4762 cpumask_clear_cpu(rq->cpu, old_rd->span); 5296 cpumask_clear_cpu(rq->cpu, old_rd->span);
4763 5297
4764 /* 5298 /*
4765 * If we dont want to free the old_rt yet then 5299 * If we dont want to free the old_rd yet then
4766 * set old_rd to NULL to skip the freeing later 5300 * set old_rd to NULL to skip the freeing later
4767 * in this function: 5301 * in this function:
4768 */ 5302 */
@@ -4791,8 +5325,14 @@ static int init_rootdomain(struct root_domain *rd)
4791 goto out; 5325 goto out;
4792 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5326 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
4793 goto free_span; 5327 goto free_span;
4794 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5328 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
4795 goto free_online; 5329 goto free_online;
5330 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5331 goto free_dlo_mask;
5332
5333 init_dl_bw(&rd->dl_bw);
5334 if (cpudl_init(&rd->cpudl) != 0)
5335 goto free_dlo_mask;
4796 5336
4797 if (cpupri_init(&rd->cpupri) != 0) 5337 if (cpupri_init(&rd->cpupri) != 0)
4798 goto free_rto_mask; 5338 goto free_rto_mask;
@@ -4800,6 +5340,8 @@ static int init_rootdomain(struct root_domain *rd)
4800 5340
4801free_rto_mask: 5341free_rto_mask:
4802 free_cpumask_var(rd->rto_mask); 5342 free_cpumask_var(rd->rto_mask);
5343free_dlo_mask:
5344 free_cpumask_var(rd->dlo_mask);
4803free_online: 5345free_online:
4804 free_cpumask_var(rd->online); 5346 free_cpumask_var(rd->online);
4805free_span: 5347free_span:
@@ -4903,6 +5445,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym);
4903static void update_top_cache_domain(int cpu) 5445static void update_top_cache_domain(int cpu)
4904{ 5446{
4905 struct sched_domain *sd; 5447 struct sched_domain *sd;
5448 struct sched_domain *busy_sd = NULL;
4906 int id = cpu; 5449 int id = cpu;
4907 int size = 1; 5450 int size = 1;
4908 5451
@@ -4910,8 +5453,9 @@ static void update_top_cache_domain(int cpu)
4910 if (sd) { 5453 if (sd) {
4911 id = cpumask_first(sched_domain_span(sd)); 5454 id = cpumask_first(sched_domain_span(sd));
4912 size = cpumask_weight(sched_domain_span(sd)); 5455 size = cpumask_weight(sched_domain_span(sd));
4913 rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); 5456 busy_sd = sd->parent; /* sd_busy */
4914 } 5457 }
5458 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
4915 5459
4916 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5460 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
4917 per_cpu(sd_llc_size, cpu) = size; 5461 per_cpu(sd_llc_size, cpu) = size;
@@ -5112,6 +5656,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5112 * die on a /0 trap. 5656 * die on a /0 trap.
5113 */ 5657 */
5114 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5658 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5659 sg->sgp->power_orig = sg->sgp->power;
5115 5660
5116 /* 5661 /*
5117 * Make sure the first group of this domain contains the 5662 * Make sure the first group of this domain contains the
@@ -6148,6 +6693,7 @@ void __init sched_init_smp(void)
6148 free_cpumask_var(non_isolated_cpus); 6693 free_cpumask_var(non_isolated_cpus);
6149 6694
6150 init_sched_rt_class(); 6695 init_sched_rt_class();
6696 init_sched_dl_class();
6151} 6697}
6152#else 6698#else
6153void __init sched_init_smp(void) 6699void __init sched_init_smp(void)
@@ -6217,13 +6763,15 @@ void __init sched_init(void)
6217#endif /* CONFIG_CPUMASK_OFFSTACK */ 6763#endif /* CONFIG_CPUMASK_OFFSTACK */
6218 } 6764 }
6219 6765
6766 init_rt_bandwidth(&def_rt_bandwidth,
6767 global_rt_period(), global_rt_runtime());
6768 init_dl_bandwidth(&def_dl_bandwidth,
6769 global_rt_period(), global_rt_runtime());
6770
6220#ifdef CONFIG_SMP 6771#ifdef CONFIG_SMP
6221 init_defrootdomain(); 6772 init_defrootdomain();
6222#endif 6773#endif
6223 6774
6224 init_rt_bandwidth(&def_rt_bandwidth,
6225 global_rt_period(), global_rt_runtime());
6226
6227#ifdef CONFIG_RT_GROUP_SCHED 6775#ifdef CONFIG_RT_GROUP_SCHED
6228 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6776 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6229 global_rt_period(), global_rt_runtime()); 6777 global_rt_period(), global_rt_runtime());
@@ -6247,6 +6795,7 @@ void __init sched_init(void)
6247 rq->calc_load_update = jiffies + LOAD_FREQ; 6795 rq->calc_load_update = jiffies + LOAD_FREQ;
6248 init_cfs_rq(&rq->cfs); 6796 init_cfs_rq(&rq->cfs);
6249 init_rt_rq(&rq->rt, rq); 6797 init_rt_rq(&rq->rt, rq);
6798 init_dl_rq(&rq->dl, rq);
6250#ifdef CONFIG_FAIR_GROUP_SCHED 6799#ifdef CONFIG_FAIR_GROUP_SCHED
6251 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6800 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6252 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6801 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6318,10 +6867,6 @@ void __init sched_init(void)
6318 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6867 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6319#endif 6868#endif
6320 6869
6321#ifdef CONFIG_RT_MUTEXES
6322 plist_head_init(&init_task.pi_waiters);
6323#endif
6324
6325 /* 6870 /*
6326 * The boot idle thread does lazy MMU switching as well: 6871 * The boot idle thread does lazy MMU switching as well:
6327 */ 6872 */
@@ -6395,13 +6940,16 @@ EXPORT_SYMBOL(__might_sleep);
6395static void normalize_task(struct rq *rq, struct task_struct *p) 6940static void normalize_task(struct rq *rq, struct task_struct *p)
6396{ 6941{
6397 const struct sched_class *prev_class = p->sched_class; 6942 const struct sched_class *prev_class = p->sched_class;
6943 struct sched_attr attr = {
6944 .sched_policy = SCHED_NORMAL,
6945 };
6398 int old_prio = p->prio; 6946 int old_prio = p->prio;
6399 int on_rq; 6947 int on_rq;
6400 6948
6401 on_rq = p->on_rq; 6949 on_rq = p->on_rq;
6402 if (on_rq) 6950 if (on_rq)
6403 dequeue_task(rq, p, 0); 6951 dequeue_task(rq, p, 0);
6404 __setscheduler(rq, p, SCHED_NORMAL, 0); 6952 __setscheduler(rq, p, &attr);
6405 if (on_rq) { 6953 if (on_rq) {
6406 enqueue_task(rq, p, 0); 6954 enqueue_task(rq, p, 0);
6407 resched_task(rq->curr); 6955 resched_task(rq->curr);
@@ -6431,7 +6979,7 @@ void normalize_rt_tasks(void)
6431 p->se.statistics.block_start = 0; 6979 p->se.statistics.block_start = 0;
6432#endif 6980#endif
6433 6981
6434 if (!rt_task(p)) { 6982 if (!dl_task(p) && !rt_task(p)) {
6435 /* 6983 /*
6436 * Renice negative nice level userspace 6984 * Renice negative nice level userspace
6437 * tasks back to 0: 6985 * tasks back to 0:
@@ -6626,16 +7174,6 @@ void sched_move_task(struct task_struct *tsk)
6626} 7174}
6627#endif /* CONFIG_CGROUP_SCHED */ 7175#endif /* CONFIG_CGROUP_SCHED */
6628 7176
6629#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
6630static unsigned long to_ratio(u64 period, u64 runtime)
6631{
6632 if (runtime == RUNTIME_INF)
6633 return 1ULL << 20;
6634
6635 return div64_u64(runtime << 20, period);
6636}
6637#endif
6638
6639#ifdef CONFIG_RT_GROUP_SCHED 7177#ifdef CONFIG_RT_GROUP_SCHED
6640/* 7178/*
6641 * Ensure that the real time constraints are schedulable. 7179 * Ensure that the real time constraints are schedulable.
@@ -6809,24 +7347,13 @@ static long sched_group_rt_period(struct task_group *tg)
6809 do_div(rt_period_us, NSEC_PER_USEC); 7347 do_div(rt_period_us, NSEC_PER_USEC);
6810 return rt_period_us; 7348 return rt_period_us;
6811} 7349}
7350#endif /* CONFIG_RT_GROUP_SCHED */
6812 7351
7352#ifdef CONFIG_RT_GROUP_SCHED
6813static int sched_rt_global_constraints(void) 7353static int sched_rt_global_constraints(void)
6814{ 7354{
6815 u64 runtime, period;
6816 int ret = 0; 7355 int ret = 0;
6817 7356
6818 if (sysctl_sched_rt_period <= 0)
6819 return -EINVAL;
6820
6821 runtime = global_rt_runtime();
6822 period = global_rt_period();
6823
6824 /*
6825 * Sanity check on the sysctl variables.
6826 */
6827 if (runtime > period && runtime != RUNTIME_INF)
6828 return -EINVAL;
6829
6830 mutex_lock(&rt_constraints_mutex); 7357 mutex_lock(&rt_constraints_mutex);
6831 read_lock(&tasklist_lock); 7358 read_lock(&tasklist_lock);
6832 ret = __rt_schedulable(NULL, 0, 0); 7359 ret = __rt_schedulable(NULL, 0, 0);
@@ -6849,17 +7376,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
6849static int sched_rt_global_constraints(void) 7376static int sched_rt_global_constraints(void)
6850{ 7377{
6851 unsigned long flags; 7378 unsigned long flags;
6852 int i; 7379 int i, ret = 0;
6853
6854 if (sysctl_sched_rt_period <= 0)
6855 return -EINVAL;
6856
6857 /*
6858 * There's always some RT tasks in the root group
6859 * -- migration, kstopmachine etc..
6860 */
6861 if (sysctl_sched_rt_runtime == 0)
6862 return -EBUSY;
6863 7380
6864 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7381 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
6865 for_each_possible_cpu(i) { 7382 for_each_possible_cpu(i) {
@@ -6871,36 +7388,88 @@ static int sched_rt_global_constraints(void)
6871 } 7388 }
6872 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7389 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
6873 7390
6874 return 0; 7391 return ret;
6875} 7392}
6876#endif /* CONFIG_RT_GROUP_SCHED */ 7393#endif /* CONFIG_RT_GROUP_SCHED */
6877 7394
6878int sched_rr_handler(struct ctl_table *table, int write, 7395static int sched_dl_global_constraints(void)
6879 void __user *buffer, size_t *lenp,
6880 loff_t *ppos)
6881{ 7396{
6882 int ret; 7397 u64 runtime = global_rt_runtime();
6883 static DEFINE_MUTEX(mutex); 7398 u64 period = global_rt_period();
7399 u64 new_bw = to_ratio(period, runtime);
7400 int cpu, ret = 0;
6884 7401
6885 mutex_lock(&mutex); 7402 /*
6886 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7403 * Here we want to check the bandwidth not being set to some
6887 /* make sure that internally we keep jiffies */ 7404 * value smaller than the currently allocated bandwidth in
6888 /* also, writing zero resets timeslice to default */ 7405 * any of the root_domains.
6889 if (!ret && write) { 7406 *
6890 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7407 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
6891 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7408 * cycling on root_domains... Discussion on different/better
7409 * solutions is welcome!
7410 */
7411 for_each_possible_cpu(cpu) {
7412 struct dl_bw *dl_b = dl_bw_of(cpu);
7413
7414 raw_spin_lock(&dl_b->lock);
7415 if (new_bw < dl_b->total_bw)
7416 ret = -EBUSY;
7417 raw_spin_unlock(&dl_b->lock);
7418
7419 if (ret)
7420 break;
6892 } 7421 }
6893 mutex_unlock(&mutex); 7422
6894 return ret; 7423 return ret;
6895} 7424}
6896 7425
7426static void sched_dl_do_global(void)
7427{
7428 u64 new_bw = -1;
7429 int cpu;
7430
7431 def_dl_bandwidth.dl_period = global_rt_period();
7432 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7433
7434 if (global_rt_runtime() != RUNTIME_INF)
7435 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7436
7437 /*
7438 * FIXME: As above...
7439 */
7440 for_each_possible_cpu(cpu) {
7441 struct dl_bw *dl_b = dl_bw_of(cpu);
7442
7443 raw_spin_lock(&dl_b->lock);
7444 dl_b->bw = new_bw;
7445 raw_spin_unlock(&dl_b->lock);
7446 }
7447}
7448
7449static int sched_rt_global_validate(void)
7450{
7451 if (sysctl_sched_rt_period <= 0)
7452 return -EINVAL;
7453
7454 if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
7455 return -EINVAL;
7456
7457 return 0;
7458}
7459
7460static void sched_rt_do_global(void)
7461{
7462 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7463 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7464}
7465
6897int sched_rt_handler(struct ctl_table *table, int write, 7466int sched_rt_handler(struct ctl_table *table, int write,
6898 void __user *buffer, size_t *lenp, 7467 void __user *buffer, size_t *lenp,
6899 loff_t *ppos) 7468 loff_t *ppos)
6900{ 7469{
6901 int ret;
6902 int old_period, old_runtime; 7470 int old_period, old_runtime;
6903 static DEFINE_MUTEX(mutex); 7471 static DEFINE_MUTEX(mutex);
7472 int ret;
6904 7473
6905 mutex_lock(&mutex); 7474 mutex_lock(&mutex);
6906 old_period = sysctl_sched_rt_period; 7475 old_period = sysctl_sched_rt_period;
@@ -6909,21 +7478,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
6909 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7478 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6910 7479
6911 if (!ret && write) { 7480 if (!ret && write) {
7481 ret = sched_rt_global_validate();
7482 if (ret)
7483 goto undo;
7484
6912 ret = sched_rt_global_constraints(); 7485 ret = sched_rt_global_constraints();
6913 if (ret) { 7486 if (ret)
6914 sysctl_sched_rt_period = old_period; 7487 goto undo;
6915 sysctl_sched_rt_runtime = old_runtime; 7488
6916 } else { 7489 ret = sched_dl_global_constraints();
6917 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7490 if (ret)
6918 def_rt_bandwidth.rt_period = 7491 goto undo;
6919 ns_to_ktime(global_rt_period()); 7492
6920 } 7493 sched_rt_do_global();
7494 sched_dl_do_global();
7495 }
7496 if (0) {
7497undo:
7498 sysctl_sched_rt_period = old_period;
7499 sysctl_sched_rt_runtime = old_runtime;
6921 } 7500 }
6922 mutex_unlock(&mutex); 7501 mutex_unlock(&mutex);
6923 7502
6924 return ret; 7503 return ret;
6925} 7504}
6926 7505
7506int sched_rr_handler(struct ctl_table *table, int write,
7507 void __user *buffer, size_t *lenp,
7508 loff_t *ppos)
7509{
7510 int ret;
7511 static DEFINE_MUTEX(mutex);
7512
7513 mutex_lock(&mutex);
7514 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7515 /* make sure that internally we keep jiffies */
7516 /* also, writing zero resets timeslice to default */
7517 if (!ret && write) {
7518 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7519 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7520 }
7521 mutex_unlock(&mutex);
7522 return ret;
7523}
7524
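sched_dl_global_constraints() above compares the would-be global cap, to_ratio(period, runtime) = (runtime << 20) / period, against the -deadline bandwidth already allocated in every root domain, and sched_dl_do_global() then propagates the new cap. A small worked example of that fixed-point ratio, as a userspace restatement with the usual 950000/1000000 us sysctl defaults (the units cancel, so us vs. ns makes no difference):

	#include <stdint.h>
	#include <stdio.h>

	/* Userspace restatement of kernel/sched/core.c:to_ratio(). */
	static uint64_t to_ratio(uint64_t period, uint64_t runtime)
	{
		return (runtime << 20) / period;
	}

	int main(void)
	{
		/* sched_rt_runtime_us = 950000 out of sched_rt_period_us = 1000000 */
		uint64_t new_bw = to_ratio(1000000, 950000);

		/* 996147 / 1048576, i.e. ~0.95: a root_domain whose dl_b->total_bw
		 * already exceeds this makes the sysctl write fail with -EBUSY. */
		printf("global -deadline cap = %llu / %u\n",
		       (unsigned long long)new_bw, 1 << 20);
		return 0;
	}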
6927#ifdef CONFIG_CGROUP_SCHED 7525#ifdef CONFIG_CGROUP_SCHED
6928 7526
6929static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7527static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -7256,15 +7854,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7256 return ret; 7854 return ret;
7257} 7855}
7258 7856
7259static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7857static int cpu_stats_show(struct seq_file *sf, void *v)
7260 struct cgroup_map_cb *cb)
7261{ 7858{
7262 struct task_group *tg = css_tg(css); 7859 struct task_group *tg = css_tg(seq_css(sf));
7263 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7860 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7264 7861
7265 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7862 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7266 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7863 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7267 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7864 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7268 7865
7269 return 0; 7866 return 0;
7270} 7867}
@@ -7318,7 +7915,7 @@ static struct cftype cpu_files[] = {
7318 }, 7915 },
7319 { 7916 {
7320 .name = "stat", 7917 .name = "stat",
7321 .read_map = cpu_stats_show, 7918 .seq_show = cpu_stats_show,
7322 }, 7919 },
7323#endif 7920#endif
7324#ifdef CONFIG_RT_GROUP_SCHED 7921#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f64722ff0299..622e0818f905 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -163,10 +163,9 @@ out:
163 return err; 163 return err;
164} 164}
165 165
166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, 166static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
167 struct cftype *cft, struct seq_file *m)
168{ 167{
169 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(seq_css(m));
170 u64 percpu; 169 u64 percpu;
171 int i; 170 int i;
172 171
@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {
183 [CPUACCT_STAT_SYSTEM] = "system", 182 [CPUACCT_STAT_SYSTEM] = "system",
184}; 183};
185 184
186static int cpuacct_stats_show(struct cgroup_subsys_state *css, 185static int cpuacct_stats_show(struct seq_file *sf, void *v)
187 struct cftype *cft, struct cgroup_map_cb *cb)
188{ 186{
189 struct cpuacct *ca = css_ca(css); 187 struct cpuacct *ca = css_ca(seq_css(sf));
190 int cpu; 188 int cpu;
191 s64 val = 0; 189 s64 val = 0;
192 190
@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 val += kcpustat->cpustat[CPUTIME_NICE]; 194 val += kcpustat->cpustat[CPUTIME_NICE];
197 } 195 }
198 val = cputime64_to_clock_t(val); 196 val = cputime64_to_clock_t(val);
199 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 197 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
200 198
201 val = 0; 199 val = 0;
202 for_each_online_cpu(cpu) { 200 for_each_online_cpu(cpu) {
@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
207 } 205 }
208 206
209 val = cputime64_to_clock_t(val); 207 val = cputime64_to_clock_t(val);
210 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 208 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
211 209
212 return 0; 210 return 0;
213} 211}
@@ -220,11 +218,11 @@ static struct cftype files[] = {
220 }, 218 },
221 { 219 {
222 .name = "usage_percpu", 220 .name = "usage_percpu",
223 .read_seq_string = cpuacct_percpu_seq_read, 221 .seq_show = cpuacct_percpu_seq_show,
224 }, 222 },
225 { 223 {
226 .name = "stat", 224 .name = "stat",
227 .read_map = cpuacct_stats_show, 225 .seq_show = cpuacct_stats_show,
228 }, 226 },
229 { } /* terminate */ 227 { } /* terminate */
230}; 228};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 000000000000..045fc74e3f09
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,216 @@
1/*
2 * kernel/sched/cpudl.c
3 *
4 * Global CPU deadline management
5 *
6 * Author: Juri Lelli <j.lelli@sssup.it>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 */
13
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include "cpudeadline.h"
17
18static inline int parent(int i)
19{
20 return (i - 1) >> 1;
21}
22
23static inline int left_child(int i)
24{
25 return (i << 1) + 1;
26}
27
28static inline int right_child(int i)
29{
30 return (i << 1) + 2;
31}
32
33static inline int dl_time_before(u64 a, u64 b)
34{
35 return (s64)(a - b) < 0;
36}
37
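dl_time_before() above is the usual wraparound-safe ordering test: the unsigned difference is reinterpreted as signed, so the comparison stays correct across a u64 wrap as long as the two values are less than 2^63 apart. A quick standalone check of the edge case (illustration only):

	#include <stdint.h>
	#include <stdio.h>

	static int dl_time_before(uint64_t a, uint64_t b)
	{
		return (int64_t)(a - b) < 0;
	}

	int main(void)
	{
		uint64_t near_wrap = UINT64_MAX - 5;

		/* A plain "<" would get the first case wrong once the clock wraps. */
		printf("%d %d\n",
		       dl_time_before(near_wrap, near_wrap + 10),	/* prints 1 */
		       dl_time_before(near_wrap + 10, near_wrap));	/* prints 0 */
		return 0;
	}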
38static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41
42 swap(cp->elements[a], cp->elements[b]);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
44}
45
46static void cpudl_heapify(struct cpudl *cp, int idx)
47{
48 int l, r, largest;
49
50 /* adapted from lib/prio_heap.c */
51 while (1) {
52 l = left_child(idx);
53 r = right_child(idx);
54 largest = idx;
55
56 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
57 cp->elements[l].dl))
58 largest = l;
59 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
60 cp->elements[r].dl))
61 largest = r;
62 if (largest == idx)
63 break;
64
65 /* Push idx down the heap one level and bump one up */
66 cpudl_exchange(cp, largest, idx);
67 idx = largest;
68 }
69}
70
71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72{
73 WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID);
74
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl;
77 cpudl_heapify(cp, idx);
78 } else {
79 cp->elements[idx].dl = new_dl;
80 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
81 cp->elements[idx].dl)) {
82 cpudl_exchange(cp, idx, parent(idx));
83 idx = parent(idx);
84 }
85 }
86}
87
88static inline int cpudl_maximum(struct cpudl *cp)
89{
90 return cp->elements[0].cpu;
91}
92
93/*
94 * cpudl_find - find the best (later-dl) CPU in the system
95 * @cp: the cpudl max-heap context
96 * @p: the task
97 * @later_mask: a mask to fill in with the selected CPUs (or NULL)
98 *
99 * Returns: int - best CPU (heap maximum if suitable)
100 */
101int cpudl_find(struct cpudl *cp, struct task_struct *p,
102 struct cpumask *later_mask)
103{
104 int best_cpu = -1;
105 const struct sched_dl_entity *dl_se = &p->dl;
106
107 if (later_mask && cpumask_and(later_mask, cp->free_cpus,
108 &p->cpus_allowed) && cpumask_and(later_mask,
109 later_mask, cpu_active_mask)) {
110 best_cpu = cpumask_any(later_mask);
111 goto out;
112 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
113 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
114 best_cpu = cpudl_maximum(cp);
115 if (later_mask)
116 cpumask_set_cpu(best_cpu, later_mask);
117 }
118
119out:
120 WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1);
121
122 return best_cpu;
123}
124
125/*
126 * cpudl_set - update the cpudl max-heap
127 * @cp: the cpudl max-heap context
128 * @cpu: the target cpu
129 * @dl: the new earliest deadline for this cpu
130 *
131 * Notes: assumes cpu_rq(cpu)->lock is locked
132 *
133 * Returns: (void)
134 */
135void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
136{
137 int old_idx, new_cpu;
138 unsigned long flags;
139
140 WARN_ON(cpu > num_present_cpus());
141
142 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu];
144 if (!is_valid) {
145 /* remove item */
146 if (old_idx == IDX_INVALID) {
147 /*
148 * Nothing to remove if old_idx was invalid.
149 * This could happen if a rq_offline_dl is
150 * called for a CPU without -dl tasks running.
151 */
152 goto out;
153 }
154 new_cpu = cp->elements[cp->size - 1].cpu;
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) {
163 cpudl_exchange(cp, old_idx, parent(old_idx));
164 old_idx = parent(old_idx);
165 }
166 cpumask_set_cpu(cpu, cp->free_cpus);
167 cpudl_heapify(cp, old_idx);
168
169 goto out;
170 }
171
172 if (old_idx == IDX_INVALID) {
173 cp->size++;
174 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else {
180 cpudl_change_key(cp, old_idx, dl);
181 }
182
183out:
184 raw_spin_unlock_irqrestore(&cp->lock, flags);
185}
186
187/*
188 * cpudl_init - initialize the cpudl structure
189 * @cp: the cpudl max-heap context
190 */
191int cpudl_init(struct cpudl *cp)
192{
193 int i;
194
195 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock);
197 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++)
199 cp->cpu_to_idx[i] = IDX_INVALID;
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
201 return -ENOMEM;
202 cpumask_setall(cp->free_cpus);
203
204 return 0;
205}
206
207/*
208 * cpudl_cleanup - clean up the cpudl structure
209 * @cp: the cpudl max-heap context
210 */
211void cpudl_cleanup(struct cpudl *cp)
212{
213 /*
214 * nothing to do for the moment
215 */
216}
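cpudl is an array-backed binary max-heap keyed by each CPU's earliest deadline, with the cpu_to_idx[] reverse map letting cpudl_set() locate and fix up a CPU's slot; the root is always the CPU whose earliest deadline is latest, which is exactly what cpudl_find() hands to the push path. A standalone sketch of the same index arithmetic and sift-down, with the reverse map omitted (illustration only, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	struct item { uint64_t dl; int cpu; };

	static int left_child(int i)	{ return (i << 1) + 1; }
	static int right_child(int i)	{ return (i << 1) + 2; }

	/* Same wraparound-safe "a is earlier than b" test as dl_time_before(). */
	static int dl_before(uint64_t a, uint64_t b)
	{
		return (int64_t)(a - b) < 0;
	}

	/* Sift the element at idx down until the latest deadline sits on top,
	 * mirroring cpudl_heapify() above. */
	static void heapify(struct item *e, int size, int idx)
	{
		for (;;) {
			int l = left_child(idx), r = right_child(idx), largest = idx;

			if (l < size && dl_before(e[largest].dl, e[l].dl))
				largest = l;
			if (r < size && dl_before(e[largest].dl, e[r].dl))
				largest = r;
			if (largest == idx)
				break;

			struct item tmp = e[idx];
			e[idx] = e[largest];
			e[largest] = tmp;
			idx = largest;
		}
	}

	int main(void)
	{
		struct item e[] = { { 100, 0 }, { 400, 1 }, { 250, 2 } };

		heapify(e, 3, 0);
		printf("latest earliest-deadline on cpu %d (dl=%llu)\n",
		       e[0].cpu, (unsigned long long)e[0].dl);	/* cpu 1, dl=400 */
		return 0;
	}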
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 000000000000..a202789a412c
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_CPUDL_H
2#define _LINUX_CPUDL_H
3
4#include <linux/sched.h>
5
6#define IDX_INVALID -1
7
8struct array_item {
9 u64 dl;
10 int cpu;
11};
12
13struct cpudl {
14 raw_spinlock_t lock;
15 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus;
19};
20
21
22#ifdef CONFIG_SMP
23int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl) do { } while (0)
30#define cpudl_init() do { } while (0)
31#endif /* CONFIG_SMP */
32
33#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 000000000000..0de248202879
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,1640 @@
1/*
2 * Deadline Scheduling Class (SCHED_DEADLINE)
3 *
4 * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
5 *
6 * Tasks that periodically execute their instances for less than their
7 * runtime won't miss any of their deadlines.
8 * Tasks that are not periodic or sporadic or that try to execute more
9 * than their reserved bandwidth will be slowed down (and may potentially
10 * miss some of their deadlines), and won't affect any other task.
11 *
12 * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
13 * Juri Lelli <juri.lelli@gmail.com>,
14 * Michael Trimarchi <michael@amarulasolutions.com>,
15 * Fabio Checconi <fchecconi@gmail.com>
16 */
17#include "sched.h"
18
19#include <linux/slab.h>
20
21struct dl_bandwidth def_dl_bandwidth;
22
23static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
24{
25 return container_of(dl_se, struct task_struct, dl);
26}
27
28static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
29{
30 return container_of(dl_rq, struct rq, dl);
31}
32
33static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
34{
35 struct task_struct *p = dl_task_of(dl_se);
36 struct rq *rq = task_rq(p);
37
38 return &rq->dl;
39}
40
41static inline int on_dl_rq(struct sched_dl_entity *dl_se)
42{
43 return !RB_EMPTY_NODE(&dl_se->rb_node);
44}
45
46static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
47{
48 struct sched_dl_entity *dl_se = &p->dl;
49
50 return dl_rq->rb_leftmost == &dl_se->rb_node;
51}
52
53void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
54{
55 raw_spin_lock_init(&dl_b->dl_runtime_lock);
56 dl_b->dl_period = period;
57 dl_b->dl_runtime = runtime;
58}
59
60extern unsigned long to_ratio(u64 period, u64 runtime);
61
62void init_dl_bw(struct dl_bw *dl_b)
63{
64 raw_spin_lock_init(&dl_b->lock);
65 raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
66 if (global_rt_runtime() == RUNTIME_INF)
67 dl_b->bw = -1;
68 else
69 dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
70 raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
71 dl_b->total_bw = 0;
72}
73
74void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
75{
76 dl_rq->rb_root = RB_ROOT;
77
78#ifdef CONFIG_SMP
79 /* zero means no -deadline tasks */
80 dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
81
82 dl_rq->dl_nr_migratory = 0;
83 dl_rq->overloaded = 0;
84 dl_rq->pushable_dl_tasks_root = RB_ROOT;
85#else
86 init_dl_bw(&dl_rq->dl_bw);
87#endif
88}
89
90#ifdef CONFIG_SMP
91
92static inline int dl_overloaded(struct rq *rq)
93{
94 return atomic_read(&rq->rd->dlo_count);
95}
96
97static inline void dl_set_overload(struct rq *rq)
98{
99 if (!rq->online)
100 return;
101
102 cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
103 /*
104 * Must be visible before the overload count is
105 * set (as in sched_rt.c).
106 *
107 * Matched by the barrier in pull_dl_task().
108 */
109 smp_wmb();
110 atomic_inc(&rq->rd->dlo_count);
111}
112
113static inline void dl_clear_overload(struct rq *rq)
114{
115 if (!rq->online)
116 return;
117
118 atomic_dec(&rq->rd->dlo_count);
119 cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
120}
121
122static void update_dl_migration(struct dl_rq *dl_rq)
123{
124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) {
125 if (!dl_rq->overloaded) {
126 dl_set_overload(rq_of_dl_rq(dl_rq));
127 dl_rq->overloaded = 1;
128 }
129 } else if (dl_rq->overloaded) {
130 dl_clear_overload(rq_of_dl_rq(dl_rq));
131 dl_rq->overloaded = 0;
132 }
133}
134
135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
136{
137 struct task_struct *p = dl_task_of(dl_se);
138 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
139
140 dl_rq->dl_nr_total++;
141 if (p->nr_cpus_allowed > 1)
142 dl_rq->dl_nr_migratory++;
143
144 update_dl_migration(dl_rq);
145}
146
147static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
148{
149 struct task_struct *p = dl_task_of(dl_se);
150 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
151
152 dl_rq->dl_nr_total--;
153 if (p->nr_cpus_allowed > 1)
154 dl_rq->dl_nr_migratory--;
155
156 update_dl_migration(dl_rq);
157}
158
159/*
160 * The list of pushable -deadline task is not a plist, like in
161 * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
162 */
163static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
164{
165 struct dl_rq *dl_rq = &rq->dl;
166 struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
167 struct rb_node *parent = NULL;
168 struct task_struct *entry;
169 int leftmost = 1;
170
171 BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
172
173 while (*link) {
174 parent = *link;
175 entry = rb_entry(parent, struct task_struct,
176 pushable_dl_tasks);
177 if (dl_entity_preempt(&p->dl, &entry->dl))
178 link = &parent->rb_left;
179 else {
180 link = &parent->rb_right;
181 leftmost = 0;
182 }
183 }
184
185 if (leftmost)
186 dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
187
188 rb_link_node(&p->pushable_dl_tasks, parent, link);
189 rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
190}
191
192static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
193{
194 struct dl_rq *dl_rq = &rq->dl;
195
196 if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
197 return;
198
199 if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
200 struct rb_node *next_node;
201
202 next_node = rb_next(&p->pushable_dl_tasks);
203 dl_rq->pushable_dl_tasks_leftmost = next_node;
204 }
205
206 rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
207 RB_CLEAR_NODE(&p->pushable_dl_tasks);
208}
209
210static inline int has_pushable_dl_tasks(struct rq *rq)
211{
212 return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
213}
214
215static int push_dl_task(struct rq *rq);
216
217#else
218
219static inline
220void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
221{
222}
223
224static inline
225void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
226{
227}
228
229static inline
230void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
231{
232}
233
234static inline
235void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
236{
237}
238
239#endif /* CONFIG_SMP */
240
241static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
242static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
243static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
244 int flags);
245
246/*
247 * We are being explicitly informed that a new instance is starting,
248 * and this means that:
249 * - the absolute deadline of the entity has to be placed at
250 * current time + relative deadline;
251 * - the runtime of the entity has to be set to the maximum value.
252 *
253 * The capability of specifying such an event is useful whenever a -deadline
254 * entity wants to (try to!) synchronize its behaviour with the scheduler's
255 * one, and to (try to!) reconcile itself with its own scheduling
256 * parameters.
257 */
258static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
259 struct sched_dl_entity *pi_se)
260{
261 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
262 struct rq *rq = rq_of_dl_rq(dl_rq);
263
264 WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
265
266 /*
267 * We use the regular wall clock time to set deadlines in the
268 * future; in fact, we must consider execution overheads (time
269 * spent on hardirq context, etc.).
270 */
271 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
272 dl_se->runtime = pi_se->dl_runtime;
273 dl_se->dl_new = 0;
274}
275
276/*
277 * Pure Earliest Deadline First (EDF) scheduling does not deal with the
278 * possibility of an entity lasting more than what it declared, and thus
279 * exhausting its runtime.
280 *
281 * Here we are interested in making runtime overrun possible, but we do
282 * not want an entity which is misbehaving to affect the scheduling of all
283 * other entities.
284 * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
285 * is used, in order to confine each entity within its own bandwidth.
286 *
287 * This function deals exactly with that, and ensures that when the runtime
288 * of an entity is replenished, its deadline is also postponed. That ensures
289 * the overrunning entity can't interfere with other entity in the system and
290 * can't make them miss their deadlines. Reasons why this kind of overruns
291 * could happen are, typically, an entity voluntarily trying to overcome its
292 * runtime, or it just underestimated it during sched_setscheduler_ex().
293 */
294static void replenish_dl_entity(struct sched_dl_entity *dl_se,
295 struct sched_dl_entity *pi_se)
296{
297 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
298 struct rq *rq = rq_of_dl_rq(dl_rq);
299
300 BUG_ON(pi_se->dl_runtime <= 0);
301
302 /*
303 * This could be the case for a !-dl task that is boosted.
304 * Just go with full inherited parameters.
305 */
306 if (dl_se->dl_deadline == 0) {
307 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
308 dl_se->runtime = pi_se->dl_runtime;
309 }
310
311 /*
312 * We keep moving the deadline away until we get some
313 * available runtime for the entity. This ensures correct
314 * handling of situations where the runtime overrun is
315 * arbitrarily large.
316 */
317 while (dl_se->runtime <= 0) {
318 dl_se->deadline += pi_se->dl_period;
319 dl_se->runtime += pi_se->dl_runtime;
320 }
321
322 /*
323 * At this point, the deadline really should be "in
324 * the future" with respect to rq->clock. If it's
325 * not, we are, for some reason, lagging too much!
326 * Anyway, after having warned userspace about that,
327 * we still try to keep things running by
328 * resetting the deadline and the budget of the
329 * entity.
330 */
331 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
332 static bool lag_once = false;
333
334 if (!lag_once) {
335 lag_once = true;
336 printk_sched("sched: DL replenish lagged too much\n");
337 }
338 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
339 dl_se->runtime = pi_se->dl_runtime;
340 }
341}
342
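replenish_dl_entity() walks the deadline forward one period at a time, refunding one runtime's worth of budget per step, so even an arbitrarily large overrun is paid back rather than forgotten. A tiny numeric illustration of that loop, with plain integers standing in for nanosecond values (not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* A (runtime=10, period=100) reservation that overran by 25:
		 * update_curr_dl() left its runtime at -25. */
		int64_t runtime = -25, deadline = 300;
		const int64_t dl_runtime = 10, dl_period = 100;

		while (runtime <= 0) {		/* same loop as replenish_dl_entity() */
			deadline += dl_period;
			runtime += dl_runtime;
		}

		/* Three rounds later runtime=5 and deadline=600: the overrunning
		 * entity has paid for everything it consumed. */
		printf("runtime=%lld deadline=%lld\n",
		       (long long)runtime, (long long)deadline);
		return 0;
	}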
343/*
344 * Here we check if --at time t-- an entity (which is probably being
345 * [re]activated or, in general, enqueued) can use its remaining runtime
346 * and its current deadline _without_ exceeding the bandwidth it is
347 * assigned (function returns true if it can't). We are in fact applying
348 * one of the CBS rules: when a task wakes up, if the residual runtime
349 * over residual deadline fits within the allocated bandwidth, then we
350 * can keep the current (absolute) deadline and residual budget without
351 * disrupting the schedulability of the system. Otherwise, we should
352 * refill the runtime and set the deadline a period in the future,
353 * because keeping the current (absolute) deadline of the task would
354 * result in breaking guarantees promised to other tasks.
355 *
356 * This function returns true if:
357 *
358 * runtime / (deadline - t) > dl_runtime / dl_period ,
359 *
360 * IOW we can't recycle current parameters.
361 *
362 * Notice that the bandwidth check is done against the period. For
363 * tasks with deadline equal to period this is the same as using
364 * dl_deadline instead of dl_period in the equation above.
365 */
366static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
367 struct sched_dl_entity *pi_se, u64 t)
368{
369 u64 left, right;
370
371 /*
372 * left and right are the two sides of the equation above,
373 * after a bit of shuffling to use multiplications instead
374 * of divisions.
375 *
376 * Note that none of the time values involved in the two
377 * multiplications are absolute: dl_deadline and dl_runtime
378 * are the relative deadline and the maximum runtime of each
379 * instance, runtime is the runtime left for the last instance
380 * and (deadline - t), since t is rq->clock, is the time left
381 * to the (absolute) deadline. Even if overflowing the u64 type
382 * is very unlikely to occur in both cases, here we scale down
383 * as we want to avoid that risk at all. Scaling down by 10
384 * means that we reduce granularity to 1us. We are fine with it,
385 * since this is only a true/false check and, anyway, thinking
386 * of anything below microsecond resolution is actually fiction
387 * (but still we want to give the user that illusion >;).
388 */
389 left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
390 right = ((dl_se->deadline - t) >> DL_SCALE) *
391 (pi_se->dl_runtime >> DL_SCALE);
392
393 return dl_time_before(right, left);
394}
395
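dl_entity_overflow() decides at wakeup whether keeping the current (runtime, deadline) pair would exceed the reserved bandwidth, i.e. whether runtime / (deadline - t) > dl_runtime / dl_period; the comparison is cross-multiplied and both sides are shifted down by the 10 bits mentioned in the comment above (DL_SCALE elsewhere in this series) so the u64 products cannot overflow. A standalone restatement with one worked data point (a sketch under those assumptions, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	#define DL_SCALE	10	/* "scaling down by 10" per the comment above */

	static int dl_time_before(uint64_t a, uint64_t b)
	{
		return (int64_t)(a - b) < 0;
	}

	/* Returns 1 when the current deadline and leftover runtime cannot be
	 * kept, mirroring dl_entity_overflow(). All times in nanoseconds. */
	static int dl_overflow_check(uint64_t runtime, uint64_t deadline, uint64_t t,
				     uint64_t dl_runtime, uint64_t dl_period)
	{
		uint64_t left  = (dl_period >> DL_SCALE) * (runtime >> DL_SCALE);
		uint64_t right = ((deadline - t) >> DL_SCALE) *
				 (dl_runtime >> DL_SCALE);

		return dl_time_before(right, left);
	}

	int main(void)
	{
		/* 5ms of runtime left with 8ms to the deadline, against a
		 * 10ms/100ms reservation: 5/8 > 10/100, so the parameters
		 * must be reset rather than recycled (prints 1). */
		printf("%d\n", dl_overflow_check(5000000ULL, 8000000ULL, 0,
						 10000000ULL, 100000000ULL));
		return 0;
	}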
396/*
397 * When a -deadline entity is queued back on the runqueue, its runtime and
398 * deadline might need updating.
399 *
400 * The policy here is that we update the deadline of the entity only if:
401 * - the current deadline is in the past,
402 * - using the remaining runtime with the current deadline would make
403 * the entity exceed its bandwidth.
404 */
405static void update_dl_entity(struct sched_dl_entity *dl_se,
406 struct sched_dl_entity *pi_se)
407{
408 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
409 struct rq *rq = rq_of_dl_rq(dl_rq);
410
411 /*
412 * The arrival of a new instance needs special treatment, i.e.,
413 * the actual scheduling parameters have to be "renewed".
414 */
415 if (dl_se->dl_new) {
416 setup_new_dl_entity(dl_se, pi_se);
417 return;
418 }
419
420 if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
421 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
422 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
423 dl_se->runtime = pi_se->dl_runtime;
424 }
425}
426
427/*
428 * If the entity depleted all its runtime, and if we want it to sleep
429 * while waiting for some new execution time to become available, we
430 * set the bandwidth enforcement timer to the replenishment instant
431 * and try to activate it.
432 *
433 * Notice that it is important for the caller to know if the timer
434 * actually started or not (i.e., the replenishment instant is in
435 * the future or in the past).
436 */
437static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
438{
439 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
440 struct rq *rq = rq_of_dl_rq(dl_rq);
441 ktime_t now, act;
442 ktime_t soft, hard;
443 unsigned long range;
444 s64 delta;
445
446 if (boosted)
447 return 0;
448 /*
449 * We want the timer to fire at the deadline, but considering
450 * that it is actually coming from rq->clock and not from
451 * hrtimer's time base reading.
452 */
453 act = ns_to_ktime(dl_se->deadline);
454 now = hrtimer_cb_get_time(&dl_se->dl_timer);
455 delta = ktime_to_ns(now) - rq_clock(rq);
456 act = ktime_add_ns(act, delta);
457
458 /*
459 * If the expiry time already passed, e.g., because the value
460 * chosen as the deadline is too small, don't even try to
461 * start the timer in the past!
462 */
463 if (ktime_us_delta(act, now) < 0)
464 return 0;
465
466 hrtimer_set_expires(&dl_se->dl_timer, act);
467
468 soft = hrtimer_get_softexpires(&dl_se->dl_timer);
469 hard = hrtimer_get_expires(&dl_se->dl_timer);
470 range = ktime_to_ns(ktime_sub(hard, soft));
471 __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
472 range, HRTIMER_MODE_ABS, 0);
473
474 return hrtimer_active(&dl_se->dl_timer);
475}
476
477/*
478 * This is the bandwidth enforcement timer callback. If here, we know
479 * a task is not on its dl_rq, since the fact that the timer was running
480 * means the task is throttled and needs a runtime replenishment.
481 *
482 * However, what we actually do depends on whether the task is active
483 * (it is on its rq) or has been removed from there by a call to
484 * dequeue_task_dl(). In the former case we must issue the runtime
485 * replenishment and add the task back to the dl_rq; in the latter, we just
486 * do nothing but clearing dl_throttled, so that runtime and deadline
487 * updating (and the queueing back to dl_rq) will be done by the
488 * next call to enqueue_task_dl().
489 */
490static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
491{
492 struct sched_dl_entity *dl_se = container_of(timer,
493 struct sched_dl_entity,
494 dl_timer);
495 struct task_struct *p = dl_task_of(dl_se);
496 struct rq *rq = task_rq(p);
497 raw_spin_lock(&rq->lock);
498
499 /*
500 * We need to take care of possible races here. In fact, the
501 * task might have changed its scheduling policy to something
502 * different from SCHED_DEADLINE or changed its reservation
503 * parameters (through sched_setscheduler()).
504 */
505 if (!dl_task(p) || dl_se->dl_new)
506 goto unlock;
507
508 sched_clock_tick();
509 update_rq_clock(rq);
510 dl_se->dl_throttled = 0;
511 if (p->on_rq) {
512 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
513 if (task_has_dl_policy(rq->curr))
514 check_preempt_curr_dl(rq, p, 0);
515 else
516 resched_task(rq->curr);
517#ifdef CONFIG_SMP
518 /*
519 * Queueing this task back might have overloaded rq,
520 * check if we need to kick someone away.
521 */
522 if (has_pushable_dl_tasks(rq))
523 push_dl_task(rq);
524#endif
525 }
526unlock:
527 raw_spin_unlock(&rq->lock);
528
529 return HRTIMER_NORESTART;
530}
531
532void init_dl_task_timer(struct sched_dl_entity *dl_se)
533{
534 struct hrtimer *timer = &dl_se->dl_timer;
535
536 if (hrtimer_active(timer)) {
537 hrtimer_try_to_cancel(timer);
538 return;
539 }
540
541 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
542 timer->function = dl_task_timer;
543}
544
545static
546int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
547{
548 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
549 int rorun = dl_se->runtime <= 0;
550
551 if (!rorun && !dmiss)
552 return 0;
553
554 /*
555 * If we are beyond our current deadline and we are still
556 * executing, then we have already used some of the runtime of
557 * the next instance. Thus, if we do not account for that, we are
558 * stealing bandwidth from the system at each deadline miss!
559 */
560 if (dmiss) {
561 dl_se->runtime = rorun ? dl_se->runtime : 0;
562 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
563 }
564
565 return 1;
566}
567
568/*
569 * Update the current task's runtime statistics (provided it is still
570 * a -deadline task and has not been removed from the dl_rq).
571 */
572static void update_curr_dl(struct rq *rq)
573{
574 struct task_struct *curr = rq->curr;
575 struct sched_dl_entity *dl_se = &curr->dl;
576 u64 delta_exec;
577
578 if (!dl_task(curr) || !on_dl_rq(dl_se))
579 return;
580
581 /*
582 * Consumed budget is computed considering the time as
583 * observed by schedulable tasks (excluding time spent
584 * in hardirq context, etc.). Deadlines are instead
585 * computed using hard walltime. This seems to be the more
586 * natural solution, but the full ramifications of this
587 * approach need further study.
588 */
589 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
590 if (unlikely((s64)delta_exec < 0))
591 delta_exec = 0;
592
593 schedstat_set(curr->se.statistics.exec_max,
594 max(curr->se.statistics.exec_max, delta_exec));
595
596 curr->se.sum_exec_runtime += delta_exec;
597 account_group_exec_runtime(curr, delta_exec);
598
599 curr->se.exec_start = rq_clock_task(rq);
600 cpuacct_charge(curr, delta_exec);
601
602 sched_rt_avg_update(rq, delta_exec);
603
604 dl_se->runtime -= delta_exec;
605 if (dl_runtime_exceeded(rq, dl_se)) {
606 __dequeue_task_dl(rq, curr, 0);
607 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
608 dl_se->dl_throttled = 1;
609 else
610 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
611
612 if (!is_leftmost(curr, &rq->dl))
613 resched_task(curr);
614 }
615
616 /*
617 * Because -- for now -- we share the rt bandwidth, we need to
618 * account our runtime there too, otherwise actual rt tasks
619 * would be able to exceed the shared quota.
620 *
621 * Account to the root rt group for now.
622 *
623 * The solution we're working towards is having the RT groups scheduled
624 * using deadline servers -- however there's a few nasties to figure
625 * out before that can happen.
626 */
627 if (rt_bandwidth_enabled()) {
628 struct rt_rq *rt_rq = &rq->rt;
629
630 raw_spin_lock(&rt_rq->rt_runtime_lock);
631 rt_rq->rt_time += delta_exec;
632 /*
633 * We'll let actual RT tasks worry about the overflow here, we
634 * have our own CBS to keep us in line -- see above.
635 */
636 raw_spin_unlock(&rt_rq->rt_runtime_lock);
637 }
638}
639
640#ifdef CONFIG_SMP
641
642static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
643
644static inline u64 next_deadline(struct rq *rq)
645{
646 struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
647
648 if (next && dl_prio(next->prio))
649 return next->dl.deadline;
650 else
651 return 0;
652}
653
654static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
655{
656 struct rq *rq = rq_of_dl_rq(dl_rq);
657
658 if (dl_rq->earliest_dl.curr == 0 ||
659 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
660 /*
661 * If the dl_rq had no -deadline tasks, or if the new task
662 * has shorter deadline than the current one on dl_rq, we
663 * know that the previous earliest becomes our next earliest,
664 * as the new task becomes the earliest itself.
665 */
666 dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
667 dl_rq->earliest_dl.curr = deadline;
668 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
669 } else if (dl_rq->earliest_dl.next == 0 ||
670 dl_time_before(deadline, dl_rq->earliest_dl.next)) {
671 /*
672 * On the other hand, if the new -deadline task has a
673 * later deadline than the earliest one on dl_rq, but
674 * it is earlier than the next (if any), we must
675 * recompute the next-earliest.
676 */
677 dl_rq->earliest_dl.next = next_deadline(rq);
678 }
679}
680
681static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
682{
683 struct rq *rq = rq_of_dl_rq(dl_rq);
684
685 /*
686 * Since we may have removed our earliest (and/or next earliest)
687 * task we must recompute them.
688 */
689 if (!dl_rq->dl_nr_running) {
690 dl_rq->earliest_dl.curr = 0;
691 dl_rq->earliest_dl.next = 0;
692 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
693 } else {
694 struct rb_node *leftmost = dl_rq->rb_leftmost;
695 struct sched_dl_entity *entry;
696
697 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
698 dl_rq->earliest_dl.curr = entry->deadline;
699 dl_rq->earliest_dl.next = next_deadline(rq);
700 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
701 }
702}
703
704#else
705
706static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
707static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
708
709#endif /* CONFIG_SMP */
710
711static inline
712void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
713{
714 int prio = dl_task_of(dl_se)->prio;
715 u64 deadline = dl_se->deadline;
716
717 WARN_ON(!dl_prio(prio));
718 dl_rq->dl_nr_running++;
719
720 inc_dl_deadline(dl_rq, deadline);
721 inc_dl_migration(dl_se, dl_rq);
722}
723
724static inline
725void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
726{
727 int prio = dl_task_of(dl_se)->prio;
728
729 WARN_ON(!dl_prio(prio));
730 WARN_ON(!dl_rq->dl_nr_running);
731 dl_rq->dl_nr_running--;
732
733 dec_dl_deadline(dl_rq, dl_se->deadline);
734 dec_dl_migration(dl_se, dl_rq);
735}
736
737static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
738{
739 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
740 struct rb_node **link = &dl_rq->rb_root.rb_node;
741 struct rb_node *parent = NULL;
742 struct sched_dl_entity *entry;
743 int leftmost = 1;
744
745 BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
746
747 while (*link) {
748 parent = *link;
749 entry = rb_entry(parent, struct sched_dl_entity, rb_node);
750 if (dl_time_before(dl_se->deadline, entry->deadline))
751 link = &parent->rb_left;
752 else {
753 link = &parent->rb_right;
754 leftmost = 0;
755 }
756 }
757
758 if (leftmost)
759 dl_rq->rb_leftmost = &dl_se->rb_node;
760
761 rb_link_node(&dl_se->rb_node, parent, link);
762 rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
763
764 inc_dl_tasks(dl_se, dl_rq);
765}
766
767static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
768{
769 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
770
771 if (RB_EMPTY_NODE(&dl_se->rb_node))
772 return;
773
774 if (dl_rq->rb_leftmost == &dl_se->rb_node) {
775 struct rb_node *next_node;
776
777 next_node = rb_next(&dl_se->rb_node);
778 dl_rq->rb_leftmost = next_node;
779 }
780
781 rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
782 RB_CLEAR_NODE(&dl_se->rb_node);
783
784 dec_dl_tasks(dl_se, dl_rq);
785}
786
787static void
788enqueue_dl_entity(struct sched_dl_entity *dl_se,
789 struct sched_dl_entity *pi_se, int flags)
790{
791 BUG_ON(on_dl_rq(dl_se));
792
793 /*
794 * If this is a wakeup or a new instance, the scheduling
795 * parameters of the task might need updating. Otherwise,
796 * we want a replenishment of its runtime.
797 */
798 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
799 replenish_dl_entity(dl_se, pi_se);
800 else
801 update_dl_entity(dl_se, pi_se);
802
803 __enqueue_dl_entity(dl_se);
804}
805
806static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
807{
808 __dequeue_dl_entity(dl_se);
809}
810
811static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
812{
813 struct task_struct *pi_task = rt_mutex_get_top_task(p);
814 struct sched_dl_entity *pi_se = &p->dl;
815
816 /*
817 * Use the scheduling parameters of the top pi-waiter
818 * task if we have one and its (relative) deadline is
819 * smaller than ours... otherwise we keep our runtime and
820 * deadline.
821 */
822 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
823 pi_se = &pi_task->dl;
824
825 /*
826 * If p is throttled, we do nothing. In fact, if it exhausted
827 * its budget it needs a replenishment and, since it now is on
828 * its rq, the bandwidth timer callback (which clearly has not
829 * run yet) will take care of this.
830 */
831 if (p->dl.dl_throttled)
832 return;
833
834 enqueue_dl_entity(&p->dl, pi_se, flags);
835
836 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
837 enqueue_pushable_dl_task(rq, p);
838
839 inc_nr_running(rq);
840}
841
842static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
843{
844 dequeue_dl_entity(&p->dl);
845 dequeue_pushable_dl_task(rq, p);
846}
847
848static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
849{
850 update_curr_dl(rq);
851 __dequeue_task_dl(rq, p, flags);
852
853 dec_nr_running(rq);
854}
855
856/*
857 * Yield task semantic for -deadline tasks is:
858 *
 859 * give up the CPU until our next instance, with
 860 * a fresh runtime. This is of little use now, since we
 861 * don't have a bandwidth reclaiming mechanism yet. Once
 862 * bandwidth reclaiming lands, yield_task_dl() will
 863 * indicate that some spare budget is available for
 864 * other task instances to use.
865 */
866static void yield_task_dl(struct rq *rq)
867{
868 struct task_struct *p = rq->curr;
869
870 /*
871 * We make the task go to sleep until its current deadline by
872 * forcing its runtime to zero. This way, update_curr_dl() stops
873 * it and the bandwidth timer will wake it up and will give it
874 * new scheduling parameters (thanks to dl_new=1).
875 */
876 if (p->dl.runtime > 0) {
877 rq->curr->dl.dl_new = 1;
878 p->dl.runtime = 0;
879 }
880 update_curr_dl(rq);
881}
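
Seen from user space, the semantic described above boils down to this: a -deadline task that finishes an instance early can call sched_yield() to hand back the rest of its budget and sleep until the next replenishment. A minimal sketch, assuming the task was already admitted to SCHED_DEADLINE (e.g. via sched_setattr(), sketched after the end of this file's listing); do_instance_work() is a hypothetical workload, not part of this patch:

#include <sched.h>

extern void do_instance_work(void);     /* hypothetical per-instance job */

static void dl_periodic_loop(void)
{
        for (;;) {
                do_instance_work();
                /*
                 * Instance done early: yield_task_dl() above zeroes the
                 * remaining runtime, so the task is throttled until its
                 * next replenishment instead of spinning on leftover budget.
                 */
                sched_yield();
        }
}
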
882
883#ifdef CONFIG_SMP
884
885static int find_later_rq(struct task_struct *task);
886
887static int
888select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
889{
890 struct task_struct *curr;
891 struct rq *rq;
892
893 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
894 goto out;
895
896 rq = cpu_rq(cpu);
897
898 rcu_read_lock();
899 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
900
901 /*
902 * If we are dealing with a -deadline task, we must
903 * decide where to wake it up.
904 * If it has a later deadline and the current task
905 * on this rq can't move (provided the waking task
906 * can!) we prefer to send it somewhere else. On the
907 * other hand, if it has a shorter deadline, we
908 * try to make it stay here, it might be important.
909 */
910 if (unlikely(dl_task(curr)) &&
911 (curr->nr_cpus_allowed < 2 ||
912 !dl_entity_preempt(&p->dl, &curr->dl)) &&
913 (p->nr_cpus_allowed > 1)) {
914 int target = find_later_rq(p);
915
916 if (target != -1)
917 cpu = target;
918 }
919 rcu_read_unlock();
920
921out:
922 return cpu;
923}
924
925static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
926{
927 /*
928 * Current can't be migrated, useless to reschedule,
929 * let's hope p can move out.
930 */
931 if (rq->curr->nr_cpus_allowed == 1 ||
932 cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
933 return;
934
935 /*
936 * p is migratable, so let's not schedule it and
937 * see if it is pushed or pulled somewhere else.
938 */
939 if (p->nr_cpus_allowed != 1 &&
940 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
941 return;
942
943 resched_task(rq->curr);
944}
945
946#endif /* CONFIG_SMP */
947
948/*
949 * Only called when both the current and waking task are -deadline
950 * tasks.
951 */
952static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
953 int flags)
954{
955 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
956 resched_task(rq->curr);
957 return;
958 }
959
960#ifdef CONFIG_SMP
961 /*
962 * In the unlikely case current and p have the same deadline
963 * let us try to decide what's the best thing to do...
964 */
965 if ((p->dl.deadline == rq->curr->dl.deadline) &&
966 !test_tsk_need_resched(rq->curr))
967 check_preempt_equal_dl(rq, p);
968#endif /* CONFIG_SMP */
969}
970
971#ifdef CONFIG_SCHED_HRTICK
972static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
973{
974 s64 delta = p->dl.dl_runtime - p->dl.runtime;
975
976 if (delta > 10000)
977 hrtick_start(rq, p->dl.runtime);
978}
979#endif
980
981static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
982 struct dl_rq *dl_rq)
983{
984 struct rb_node *left = dl_rq->rb_leftmost;
985
986 if (!left)
987 return NULL;
988
989 return rb_entry(left, struct sched_dl_entity, rb_node);
990}
991
992struct task_struct *pick_next_task_dl(struct rq *rq)
993{
994 struct sched_dl_entity *dl_se;
995 struct task_struct *p;
996 struct dl_rq *dl_rq;
997
998 dl_rq = &rq->dl;
999
1000 if (unlikely(!dl_rq->dl_nr_running))
1001 return NULL;
1002
1003 dl_se = pick_next_dl_entity(rq, dl_rq);
1004 BUG_ON(!dl_se);
1005
1006 p = dl_task_of(dl_se);
1007 p->se.exec_start = rq_clock_task(rq);
1008
1009 /* Running task will never be pushed. */
1010 dequeue_pushable_dl_task(rq, p);
1011
1012#ifdef CONFIG_SCHED_HRTICK
1013 if (hrtick_enabled(rq))
1014 start_hrtick_dl(rq, p);
1015#endif
1016
1017#ifdef CONFIG_SMP
1018 rq->post_schedule = has_pushable_dl_tasks(rq);
1019#endif /* CONFIG_SMP */
1020
1021 return p;
1022}
1023
1024static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1025{
1026 update_curr_dl(rq);
1027
1028 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1029 enqueue_pushable_dl_task(rq, p);
1030}
1031
1032static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1033{
1034 update_curr_dl(rq);
1035
1036#ifdef CONFIG_SCHED_HRTICK
1037 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1038 start_hrtick_dl(rq, p);
1039#endif
1040}
1041
1042static void task_fork_dl(struct task_struct *p)
1043{
1044 /*
1045	 * A SCHED_DEADLINE task cannot fork a -deadline child; sched_fork()
1046	 * enforces that, so there is nothing to do here.
1047 */
1048}
1049
1050static void task_dead_dl(struct task_struct *p)
1051{
1052 struct hrtimer *timer = &p->dl.dl_timer;
1053 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1054
1055 /*
1056 * Since we are TASK_DEAD we won't slip out of the domain!
1057 */
1058 raw_spin_lock_irq(&dl_b->lock);
1059 dl_b->total_bw -= p->dl.dl_bw;
1060 raw_spin_unlock_irq(&dl_b->lock);
1061
1062 hrtimer_cancel(timer);
1063}
1064
1065static void set_curr_task_dl(struct rq *rq)
1066{
1067 struct task_struct *p = rq->curr;
1068
1069 p->se.exec_start = rq_clock_task(rq);
1070
1071 /* You can't push away the running task */
1072 dequeue_pushable_dl_task(rq, p);
1073}
1074
1075#ifdef CONFIG_SMP
1076
1077/* Only try algorithms three times */
1078#define DL_MAX_TRIES 3
1079
1080static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1081{
1082 if (!task_running(rq, p) &&
1083 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1084 (p->nr_cpus_allowed > 1))
1085 return 1;
1086
1087 return 0;
1088}
1089
1090/* Return the earliest -deadline task after the leftmost that can run on @cpu, or NULL */
1091static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
1092{
1093 struct rb_node *next_node = rq->dl.rb_leftmost;
1094 struct sched_dl_entity *dl_se;
1095 struct task_struct *p = NULL;
1096
1097next_node:
1098 next_node = rb_next(next_node);
1099 if (next_node) {
1100 dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
1101 p = dl_task_of(dl_se);
1102
1103 if (pick_dl_task(rq, p, cpu))
1104 return p;
1105
1106 goto next_node;
1107 }
1108
1109 return NULL;
1110}
1111
1112static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1113
1114static int find_later_rq(struct task_struct *task)
1115{
1116 struct sched_domain *sd;
1117 struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
1118 int this_cpu = smp_processor_id();
1119 int best_cpu, cpu = task_cpu(task);
1120
1121 /* Make sure the mask is initialized first */
1122 if (unlikely(!later_mask))
1123 return -1;
1124
1125 if (task->nr_cpus_allowed == 1)
1126 return -1;
1127
1128 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1129 task, later_mask);
1130 if (best_cpu == -1)
1131 return -1;
1132
1133 /*
1134 * If we are here, some target has been found,
1135 * the most suitable of which is cached in best_cpu.
1136	 * That is, among the runqueues whose current tasks have
1137	 * later deadlines than this task's, best_cpu points to
1138	 * the one whose current task has the latest deadline.
1139 *
1140 * Now we check how well this matches with task's
1141 * affinity and system topology.
1142 *
1143	 * The last CPU where the task ran is our first
1144 * guess, since it is most likely cache-hot there.
1145 */
1146 if (cpumask_test_cpu(cpu, later_mask))
1147 return cpu;
1148 /*
1149 * Check if this_cpu is to be skipped (i.e., it is
1150 * not in the mask) or not.
1151 */
1152 if (!cpumask_test_cpu(this_cpu, later_mask))
1153 this_cpu = -1;
1154
1155 rcu_read_lock();
1156 for_each_domain(cpu, sd) {
1157 if (sd->flags & SD_WAKE_AFFINE) {
1158
1159 /*
1160 * If possible, preempting this_cpu is
1161 * cheaper than migrating.
1162 */
1163 if (this_cpu != -1 &&
1164 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1165 rcu_read_unlock();
1166 return this_cpu;
1167 }
1168
1169 /*
1170 * Last chance: if best_cpu is valid and is
1171 * in the mask, that becomes our choice.
1172 */
1173 if (best_cpu < nr_cpu_ids &&
1174 cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
1175 rcu_read_unlock();
1176 return best_cpu;
1177 }
1178 }
1179 }
1180 rcu_read_unlock();
1181
1182 /*
1183	 * At this point all our guesses have failed: just return
1184	 * some CPU from the mask and let the caller sort things out.
1185 */
1186 if (this_cpu != -1)
1187 return this_cpu;
1188
1189 cpu = cpumask_any(later_mask);
1190 if (cpu < nr_cpu_ids)
1191 return cpu;
1192
1193 return -1;
1194}
1195
1196/* Locks the rq it finds */
1197static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1198{
1199 struct rq *later_rq = NULL;
1200 int tries;
1201 int cpu;
1202
1203 for (tries = 0; tries < DL_MAX_TRIES; tries++) {
1204 cpu = find_later_rq(task);
1205
1206 if ((cpu == -1) || (cpu == rq->cpu))
1207 break;
1208
1209 later_rq = cpu_rq(cpu);
1210
1211 /* Retry if something changed. */
1212 if (double_lock_balance(rq, later_rq)) {
1213 if (unlikely(task_rq(task) != rq ||
1214 !cpumask_test_cpu(later_rq->cpu,
1215 &task->cpus_allowed) ||
1216 task_running(rq, task) || !task->on_rq)) {
1217 double_unlock_balance(rq, later_rq);
1218 later_rq = NULL;
1219 break;
1220 }
1221 }
1222
1223 /*
1224 * If the rq we found has no -deadline task, or
1225 * its earliest one has a later deadline than our
1226 * task, the rq is a good one.
1227 */
1228 if (!later_rq->dl.dl_nr_running ||
1229 dl_time_before(task->dl.deadline,
1230 later_rq->dl.earliest_dl.curr))
1231 break;
1232
1233 /* Otherwise we try again. */
1234 double_unlock_balance(rq, later_rq);
1235 later_rq = NULL;
1236 }
1237
1238 return later_rq;
1239}
1240
1241static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1242{
1243 struct task_struct *p;
1244
1245 if (!has_pushable_dl_tasks(rq))
1246 return NULL;
1247
1248 p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
1249 struct task_struct, pushable_dl_tasks);
1250
1251 BUG_ON(rq->cpu != task_cpu(p));
1252 BUG_ON(task_current(rq, p));
1253 BUG_ON(p->nr_cpus_allowed <= 1);
1254
1255 BUG_ON(!p->on_rq);
1256 BUG_ON(!dl_task(p));
1257
1258 return p;
1259}
1260
1261/*
1262 * See if the non-running -deadline tasks on this rq
1263 * can be sent to some other CPU where they can preempt
1264 * and start executing.
1265 */
1266static int push_dl_task(struct rq *rq)
1267{
1268 struct task_struct *next_task;
1269 struct rq *later_rq;
1270
1271 if (!rq->dl.overloaded)
1272 return 0;
1273
1274 next_task = pick_next_pushable_dl_task(rq);
1275 if (!next_task)
1276 return 0;
1277
1278retry:
1279 if (unlikely(next_task == rq->curr)) {
1280 WARN_ON(1);
1281 return 0;
1282 }
1283
1284 /*
1285 * If next_task preempts rq->curr, and rq->curr
1286 * can move away, it makes sense to just reschedule
1287 * without going further in pushing next_task.
1288 */
1289 if (dl_task(rq->curr) &&
1290 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1291 rq->curr->nr_cpus_allowed > 1) {
1292 resched_task(rq->curr);
1293 return 0;
1294 }
1295
1296 /* We might release rq lock */
1297 get_task_struct(next_task);
1298
1299 /* Will lock the rq it'll find */
1300 later_rq = find_lock_later_rq(next_task, rq);
1301 if (!later_rq) {
1302 struct task_struct *task;
1303
1304 /*
1305 * We must check all this again, since
1306 * find_lock_later_rq releases rq->lock and it is
1307 * then possible that next_task has migrated.
1308 */
1309 task = pick_next_pushable_dl_task(rq);
1310 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1311 /*
1312 * The task is still there. We don't try
1313 * again, some other cpu will pull it when ready.
1314 */
1315 dequeue_pushable_dl_task(rq, next_task);
1316 goto out;
1317 }
1318
1319 if (!task)
1320 /* No more tasks */
1321 goto out;
1322
1323 put_task_struct(next_task);
1324 next_task = task;
1325 goto retry;
1326 }
1327
1328 deactivate_task(rq, next_task, 0);
1329 set_task_cpu(next_task, later_rq->cpu);
1330 activate_task(later_rq, next_task, 0);
1331
1332 resched_task(later_rq->curr);
1333
1334 double_unlock_balance(rq, later_rq);
1335
1336out:
1337 put_task_struct(next_task);
1338
1339 return 1;
1340}
1341
1342static void push_dl_tasks(struct rq *rq)
1343{
1344	/* Terminates: each successful push moves one -deadline task away */
1345 while (push_dl_task(rq))
1346 ;
1347}
1348
1349static int pull_dl_task(struct rq *this_rq)
1350{
1351 int this_cpu = this_rq->cpu, ret = 0, cpu;
1352 struct task_struct *p;
1353 struct rq *src_rq;
1354 u64 dmin = LONG_MAX;
1355
1356 if (likely(!dl_overloaded(this_rq)))
1357 return 0;
1358
1359 /*
1360	 * Match the barrier from dl_set_overload(); this guarantees that if we
1361 * see overloaded we must also see the dlo_mask bit.
1362 */
1363 smp_rmb();
1364
1365 for_each_cpu(cpu, this_rq->rd->dlo_mask) {
1366 if (this_cpu == cpu)
1367 continue;
1368
1369 src_rq = cpu_rq(cpu);
1370
1371 /*
1372		 * It looks racy, and it is! However, as in sched_rt.c,
1373 * we are fine with this.
1374 */
1375 if (this_rq->dl.dl_nr_running &&
1376 dl_time_before(this_rq->dl.earliest_dl.curr,
1377 src_rq->dl.earliest_dl.next))
1378 continue;
1379
1380 /* Might drop this_rq->lock */
1381 double_lock_balance(this_rq, src_rq);
1382
1383 /*
1384 * If there are no more pullable tasks on the
1385 * rq, we're done with it.
1386 */
1387 if (src_rq->dl.dl_nr_running <= 1)
1388 goto skip;
1389
1390 p = pick_next_earliest_dl_task(src_rq, this_cpu);
1391
1392 /*
1393 * We found a task to be pulled if:
1394 * - it preempts our current (if there's one),
1395 * - it will preempt the last one we pulled (if any).
1396 */
1397 if (p && dl_time_before(p->dl.deadline, dmin) &&
1398 (!this_rq->dl.dl_nr_running ||
1399 dl_time_before(p->dl.deadline,
1400 this_rq->dl.earliest_dl.curr))) {
1401 WARN_ON(p == src_rq->curr);
1402 WARN_ON(!p->on_rq);
1403
1404 /*
1405			 * Then we pull iff p actually has an earlier
1406 * deadline than the current task of its runqueue.
1407 */
1408 if (dl_time_before(p->dl.deadline,
1409 src_rq->curr->dl.deadline))
1410 goto skip;
1411
1412 ret = 1;
1413
1414 deactivate_task(src_rq, p, 0);
1415 set_task_cpu(p, this_cpu);
1416 activate_task(this_rq, p, 0);
1417 dmin = p->dl.deadline;
1418
1419 /* Is there any other task even earlier? */
1420 }
1421skip:
1422 double_unlock_balance(this_rq, src_rq);
1423 }
1424
1425 return ret;
1426}
1427
1428static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1429{
1430 /* Try to pull other tasks here */
1431 if (dl_task(prev))
1432 pull_dl_task(rq);
1433}
1434
1435static void post_schedule_dl(struct rq *rq)
1436{
1437 push_dl_tasks(rq);
1438}
1439
1440/*
1441 * Since the task is not running and a reschedule is not going to happen
1442 * anytime soon on its runqueue, we try pushing it away now.
1443 */
1444static void task_woken_dl(struct rq *rq, struct task_struct *p)
1445{
1446 if (!task_running(rq, p) &&
1447 !test_tsk_need_resched(rq->curr) &&
1448 has_pushable_dl_tasks(rq) &&
1449 p->nr_cpus_allowed > 1 &&
1450 dl_task(rq->curr) &&
1451 (rq->curr->nr_cpus_allowed < 2 ||
1452 dl_entity_preempt(&rq->curr->dl, &p->dl))) {
1453 push_dl_tasks(rq);
1454 }
1455}
1456
1457static void set_cpus_allowed_dl(struct task_struct *p,
1458 const struct cpumask *new_mask)
1459{
1460 struct rq *rq;
1461 int weight;
1462
1463 BUG_ON(!dl_task(p));
1464
1465 /*
1466 * Update only if the task is actually running (i.e.,
1467 * it is on the rq AND it is not throttled).
1468 */
1469 if (!on_dl_rq(&p->dl))
1470 return;
1471
1472 weight = cpumask_weight(new_mask);
1473
1474 /*
1475	 * Only update if the task changes state between being able
1476	 * to migrate and not being able to.
1477 */
1478 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1479 return;
1480
1481 rq = task_rq(p);
1482
1483 /*
1484 * The process used to be able to migrate OR it can now migrate
1485 */
1486 if (weight <= 1) {
1487 if (!task_current(rq, p))
1488 dequeue_pushable_dl_task(rq, p);
1489 BUG_ON(!rq->dl.dl_nr_migratory);
1490 rq->dl.dl_nr_migratory--;
1491 } else {
1492 if (!task_current(rq, p))
1493 enqueue_pushable_dl_task(rq, p);
1494 rq->dl.dl_nr_migratory++;
1495 }
1496
1497 update_dl_migration(&rq->dl);
1498}
1499
1500/* Assumes rq->lock is held */
1501static void rq_online_dl(struct rq *rq)
1502{
1503 if (rq->dl.overloaded)
1504 dl_set_overload(rq);
1505
1506 if (rq->dl.dl_nr_running > 0)
1507 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1508}
1509
1510/* Assumes rq->lock is held */
1511static void rq_offline_dl(struct rq *rq)
1512{
1513 if (rq->dl.overloaded)
1514 dl_clear_overload(rq);
1515
1516 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1517}
1518
1519void init_sched_dl_class(void)
1520{
1521 unsigned int i;
1522
1523 for_each_possible_cpu(i)
1524 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
1525 GFP_KERNEL, cpu_to_node(i));
1526}
1527
1528#endif /* CONFIG_SMP */
1529
1530static void switched_from_dl(struct rq *rq, struct task_struct *p)
1531{
1532 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1533 hrtimer_try_to_cancel(&p->dl.dl_timer);
1534
1535#ifdef CONFIG_SMP
1536 /*
1537 * Since this might be the only -deadline task on the rq,
1538 * this is the right place to try to pull some other one
1539 * from an overloaded cpu, if any.
1540 */
1541 if (!rq->dl.dl_nr_running)
1542 pull_dl_task(rq);
1543#endif
1544}
1545
1546/*
1547 * When switching to -deadline, we may overload the rq; if so,
1548 * we try to push some other task away, if possible.
1549 */
1550static void switched_to_dl(struct rq *rq, struct task_struct *p)
1551{
1552 int check_resched = 1;
1553
1554 /*
1555 * If p is throttled, don't consider the possibility
1556	 * of preempting rq->curr; the check will be done right
1557	 * after its runtime gets replenished.
1558 */
1559 if (unlikely(p->dl.dl_throttled))
1560 return;
1561
1562 if (p->on_rq || rq->curr != p) {
1563#ifdef CONFIG_SMP
1564 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1565 /* Only reschedule if pushing failed */
1566 check_resched = 0;
1567#endif /* CONFIG_SMP */
1568 if (check_resched && task_has_dl_policy(rq->curr))
1569 check_preempt_curr_dl(rq, p, 0);
1570 }
1571}
1572
1573/*
1574 * If the scheduling parameters of a -deadline task changed,
1575 * a push or pull operation might be needed.
1576 */
1577static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1578 int oldprio)
1579{
1580 if (p->on_rq || rq->curr == p) {
1581#ifdef CONFIG_SMP
1582 /*
1583 * This might be too much, but unfortunately
1584 * we don't have the old deadline value, and
1585		 * we can't tell whether the task's deadline got
1586		 * shorter or longer, so...
1587 */
1588 if (!rq->dl.overloaded)
1589 pull_dl_task(rq);
1590
1591 /*
1592		 * If we now have an earlier deadline task than p,
1593 * then reschedule, provided p is still on this
1594 * runqueue.
1595 */
1596 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1597 rq->curr == p)
1598 resched_task(p);
1599#else
1600 /*
1601		 * Again, we don't know if p has an earlier
1602 * or later deadline, so let's blindly set a
1603 * (maybe not needed) rescheduling point.
1604 */
1605 resched_task(p);
1606#endif /* CONFIG_SMP */
1607 } else
1608 switched_to_dl(rq, p);
1609}
1610
1611const struct sched_class dl_sched_class = {
1612 .next = &rt_sched_class,
1613 .enqueue_task = enqueue_task_dl,
1614 .dequeue_task = dequeue_task_dl,
1615 .yield_task = yield_task_dl,
1616
1617 .check_preempt_curr = check_preempt_curr_dl,
1618
1619 .pick_next_task = pick_next_task_dl,
1620 .put_prev_task = put_prev_task_dl,
1621
1622#ifdef CONFIG_SMP
1623 .select_task_rq = select_task_rq_dl,
1624 .set_cpus_allowed = set_cpus_allowed_dl,
1625 .rq_online = rq_online_dl,
1626 .rq_offline = rq_offline_dl,
1627 .pre_schedule = pre_schedule_dl,
1628 .post_schedule = post_schedule_dl,
1629 .task_woken = task_woken_dl,
1630#endif
1631
1632 .set_curr_task = set_curr_task_dl,
1633 .task_tick = task_tick_dl,
1634 .task_fork = task_fork_dl,
1635 .task_dead = task_dead_dl,
1636
1637 .prio_changed = prio_changed_dl,
1638 .switched_from = switched_from_dl,
1639 .switched_to = switched_to_dl,
1640};
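
That completes the new scheduling class. For context, here is a minimal user-space sketch of how a task would request it through the new sched_setattr() syscall this series introduces. The struct mirrors the uapi layout but is renamed to avoid clashing with newer libc headers; SCHED_DEADLINE is assumed to be 6 and __NR_sched_setattr to be provided by the system headers (314 on x86-64), so treat those constants as assumptions rather than as part of this patch:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6       /* uapi value at the time of this series */
#endif

/* Mirrors the kernel's struct sched_attr layout (48 bytes, version 0). */
struct dl_sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;            /* SCHED_NORMAL/BATCH */
        uint32_t sched_priority;        /* SCHED_FIFO/RR */
        uint64_t sched_runtime;         /* SCHED_DEADLINE, in nanoseconds */
        uint64_t sched_deadline;
        uint64_t sched_period;
};

static int dl_setattr(pid_t pid, const struct dl_sched_attr *attr)
{
        /* __NR_sched_setattr is 314 on x86-64; supply it by hand if your
         * headers predate the syscall. */
        return syscall(__NR_sched_setattr, pid, attr, 0);
}

int main(void)
{
        struct dl_sched_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_runtime  = 10 * 1000 * 1000,     /*  10 ms budget   */
                .sched_deadline = 30 * 1000 * 1000,     /*  30 ms deadline */
                .sched_period   = 100 * 1000 * 1000,    /* 100 ms period   */
        };

        if (dl_setattr(0, &attr)) {
                perror("sched_setattr");
                return 1;
        }

        /* From here on the task is scheduled by dl_sched_class. */
        pause();
        return 0;
}

Running this without sufficient privilege typically fails with EPERM, and it fails with EBUSY when the dl_bw admission control rejects the requested 10ms/100ms bandwidth.
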
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5c34d1817e8f..dd52e7ffb10e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
140#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING 141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); 142 SEQ_printf(m, " %d", task_node(p));
143#endif 143#endif
144#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
145 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m)
371 PN(cpu_clk); 371 PN(cpu_clk);
372 P(jiffies); 372 P(jiffies);
373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
374 P(sched_clock_stable); 374 P(sched_clock_stable());
375#endif 375#endif
376#undef PN 376#undef PN
377#undef P 377#undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8b652ebe027..867b0a4b0893 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
178 update_sysctl(); 178 update_sysctl();
179} 179}
180 180
181#if BITS_PER_LONG == 32 181#define WMULT_CONST (~0U)
182# define WMULT_CONST (~0UL)
183#else
184# define WMULT_CONST (1UL << 32)
185#endif
186
187#define WMULT_SHIFT 32 182#define WMULT_SHIFT 32
188 183
189/* 184static void __update_inv_weight(struct load_weight *lw)
190 * Shift right and round: 185{
191 */ 186 unsigned long w;
192#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 187
188 if (likely(lw->inv_weight))
189 return;
190
191 w = scale_load_down(lw->weight);
192
193 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
194 lw->inv_weight = 1;
195 else if (unlikely(!w))
196 lw->inv_weight = WMULT_CONST;
197 else
198 lw->inv_weight = WMULT_CONST / w;
199}
193 200
194/* 201/*
195 * delta *= weight / lw 202 * delta_exec * weight / lw.weight
203 * OR
204 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
205 *
206 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
207 * we're guaranteed shift stays positive because inv_weight is guaranteed to
208 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
209 *
210 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
211 * weight/lw.weight <= 1, and therefore our shift will also be positive.
196 */ 212 */
197static unsigned long 213static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
198calc_delta_mine(unsigned long delta_exec, unsigned long weight,
199 struct load_weight *lw)
200{ 214{
201 u64 tmp; 215 u64 fact = scale_load_down(weight);
216 int shift = WMULT_SHIFT;
202 217
203 /* 218 __update_inv_weight(lw);
204 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
205 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
206 * 2^SCHED_LOAD_RESOLUTION.
207 */
208 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
209 tmp = (u64)delta_exec * scale_load_down(weight);
210 else
211 tmp = (u64)delta_exec;
212
213 if (!lw->inv_weight) {
214 unsigned long w = scale_load_down(lw->weight);
215 219
216 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 220 if (unlikely(fact >> 32)) {
217 lw->inv_weight = 1; 221 while (fact >> 32) {
218 else if (unlikely(!w)) 222 fact >>= 1;
219 lw->inv_weight = WMULT_CONST; 223 shift--;
220 else 224 }
221 lw->inv_weight = WMULT_CONST / w;
222 } 225 }
223 226
224 /* 227 /* hint to use a 32x32->64 mul */
225 * Check whether we'd overflow the 64-bit multiplication: 228 fact = (u64)(u32)fact * lw->inv_weight;
226 */ 229
227 if (unlikely(tmp > WMULT_CONST)) 230 while (fact >> 32) {
228 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 231 fact >>= 1;
229 WMULT_SHIFT/2); 232 shift--;
230 else 233 }
231 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
232 234
233 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 235 return mul_u64_u32_shr(delta_exec, fact, shift);
234} 236}
235 237
236 238
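
The block comment in this hunk is easier to follow with the arithmetic spelled out: delta * weight / lw.weight is evaluated as (delta * (weight * inv_weight)) >> 32, where inv_weight is roughly 2^32 / lw.weight, and fact is shifted down until it fits in 32 bits so the final multiply cannot overflow. A stand-alone sketch of the same scheme, simplified under a couple of assumptions: weight fits in 32 bits, lw_weight is non-zero, and the compiler provides __uint128_t (the kernel instead uses mul_u64_u32_shr(), which also works on 32-bit builds):

#include <stdint.h>
#include <stdio.h>

/* delta * weight / lw_weight via the 2^32 fixed-point inverse. */
static uint64_t calc_delta(uint64_t delta, uint32_t weight, uint32_t lw_weight)
{
        uint32_t inv_weight = (uint32_t)(~0U / lw_weight);  /* ~ 2^32 / lw_weight */
        uint64_t fact = (uint64_t)weight * inv_weight;
        int shift = 32;

        /* keep fact within 32 bits so the multiply below cannot overflow */
        while (fact >> 32) {
                fact >>= 1;
                shift--;
        }

        return (uint64_t)(((__uint128_t)delta * fact) >> shift);
}

int main(void)
{
        /* A nice-0 entity (weight 1024) on a runqueue of total weight 3072:
         * 6 ms of wall time becomes about 2 ms of weighted vruntime. */
        printf("%llu\n", (unsigned long long)calc_delta(6000000, 1024, 3072));
        return 0;
}
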
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
443#endif /* CONFIG_FAIR_GROUP_SCHED */ 445#endif /* CONFIG_FAIR_GROUP_SCHED */
444 446
445static __always_inline 447static __always_inline
446void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); 448void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
447 449
448/************************************************************** 450/**************************************************************
449 * Scheduling class tree data structure manipulation methods: 451 * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
612/* 614/*
613 * delta /= w 615 * delta /= w
614 */ 616 */
615static inline unsigned long 617static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
616calc_delta_fair(unsigned long delta, struct sched_entity *se)
617{ 618{
618 if (unlikely(se->load.weight != NICE_0_LOAD)) 619 if (unlikely(se->load.weight != NICE_0_LOAD))
619 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); 620 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
620 621
621 return delta; 622 return delta;
622} 623}
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665 update_load_add(&lw, se->load.weight); 666 update_load_add(&lw, se->load.weight);
666 load = &lw; 667 load = &lw;
667 } 668 }
668 slice = calc_delta_mine(slice, se->load.weight, load); 669 slice = __calc_delta(slice, se->load.weight, load);
669 } 670 }
670 return slice; 671 return slice;
671} 672}
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
703#endif 704#endif
704 705
705/* 706/*
706 * Update the current task's runtime statistics. Skip current tasks that 707 * Update the current task's runtime statistics.
707 * are not in our scheduling class.
708 */ 708 */
709static inline void
710__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
711 unsigned long delta_exec)
712{
713 unsigned long delta_exec_weighted;
714
715 schedstat_set(curr->statistics.exec_max,
716 max((u64)delta_exec, curr->statistics.exec_max));
717
718 curr->sum_exec_runtime += delta_exec;
719 schedstat_add(cfs_rq, exec_clock, delta_exec);
720 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
721
722 curr->vruntime += delta_exec_weighted;
723 update_min_vruntime(cfs_rq);
724}
725
726static void update_curr(struct cfs_rq *cfs_rq) 709static void update_curr(struct cfs_rq *cfs_rq)
727{ 710{
728 struct sched_entity *curr = cfs_rq->curr; 711 struct sched_entity *curr = cfs_rq->curr;
729 u64 now = rq_clock_task(rq_of(cfs_rq)); 712 u64 now = rq_clock_task(rq_of(cfs_rq));
730 unsigned long delta_exec; 713 u64 delta_exec;
731 714
732 if (unlikely(!curr)) 715 if (unlikely(!curr))
733 return; 716 return;
734 717
735 /* 718 delta_exec = now - curr->exec_start;
736 * Get the amount of time the current task was running 719 if (unlikely((s64)delta_exec <= 0))
737 * since the last time we changed load (this cannot
738 * overflow on 32 bits):
739 */
740 delta_exec = (unsigned long)(now - curr->exec_start);
741 if (!delta_exec)
742 return; 720 return;
743 721
744 __update_curr(cfs_rq, curr, delta_exec);
745 curr->exec_start = now; 722 curr->exec_start = now;
746 723
724 schedstat_set(curr->statistics.exec_max,
725 max(delta_exec, curr->statistics.exec_max));
726
727 curr->sum_exec_runtime += delta_exec;
728 schedstat_add(cfs_rq, exec_clock, delta_exec);
729
730 curr->vruntime += calc_delta_fair(delta_exec, curr);
731 update_min_vruntime(cfs_rq);
732
747 if (entity_is_task(curr)) { 733 if (entity_is_task(curr)) {
748 struct task_struct *curtask = task_of(curr); 734 struct task_struct *curtask = task_of(curr);
749 735
@@ -886,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
886 return max(smin, smax); 872 return max(smin, smax);
887} 873}
888 874
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
894 * the nodes CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 875static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{ 876{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1); 877 rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -944,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
944 if (!p->numa_group) 921 if (!p->numa_group)
945 return 0; 922 return 0;
946 923
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; 924 return p->numa_group->faults[task_faults_idx(nid, 0)] +
925 p->numa_group->faults[task_faults_idx(nid, 1)];
948} 926}
949 927
950/* 928/*
@@ -1037,7 +1015,7 @@ struct task_numa_env {
1037 1015
1038 struct numa_stats src_stats, dst_stats; 1016 struct numa_stats src_stats, dst_stats;
1039 1017
1040 int imbalance_pct, idx; 1018 int imbalance_pct;
1041 1019
1042 struct task_struct *best_task; 1020 struct task_struct *best_task;
1043 long best_imp; 1021 long best_imp;
@@ -1225,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
1225 * elsewhere, so there is no point in (re)trying. 1203 * elsewhere, so there is no point in (re)trying.
1226 */ 1204 */
1227 if (unlikely(!sd)) { 1205 if (unlikely(!sd)) {
1228 p->numa_preferred_nid = cpu_to_node(task_cpu(p)); 1206 p->numa_preferred_nid = task_node(p);
1229 return -EINVAL; 1207 return -EINVAL;
1230 } 1208 }
1231 1209
@@ -1272,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
1272 p->numa_scan_period = task_scan_min(p); 1250 p->numa_scan_period = task_scan_min(p);
1273 1251
1274 if (env.best_task == NULL) { 1252 if (env.best_task == NULL) {
1275 int ret = migrate_task_to(p, env.best_cpu); 1253 ret = migrate_task_to(p, env.best_cpu);
1254 if (ret != 0)
1255 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1276 return ret; 1256 return ret;
1277 } 1257 }
1278 1258
1279 ret = migrate_swap(p, env.best_task); 1259 ret = migrate_swap(p, env.best_task);
1260 if (ret != 0)
1261 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1280 put_task_struct(env.best_task); 1262 put_task_struct(env.best_task);
1281 return ret; 1263 return ret;
1282} 1264}
@@ -1292,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1292 p->numa_migrate_retry = jiffies + HZ; 1274 p->numa_migrate_retry = jiffies + HZ;
1293 1275
1294 /* Success if task is already running on preferred CPU */ 1276 /* Success if task is already running on preferred CPU */
1295 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) 1277 if (task_node(p) == p->numa_preferred_nid)
1296 return; 1278 return;
1297 1279
1298 /* Otherwise, try migrate to a CPU on the preferred node */ 1280 /* Otherwise, try migrate to a CPU on the preferred node */
@@ -1364,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p,
1364 * scanning faster if shared accesses dominate as it may 1346 * scanning faster if shared accesses dominate as it may
1365 * simply bounce migrations uselessly 1347 * simply bounce migrations uselessly
1366 */ 1348 */
1367 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1368 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1349 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1369 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1350 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1370 } 1351 }
@@ -1752,6 +1733,13 @@ void task_numa_work(struct callback_head *work)
1752 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) 1733 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
1753 continue; 1734 continue;
1754 1735
1736 /*
1737 * Skip inaccessible VMAs to avoid any confusion between
1738 * PROT_NONE and NUMA hinting ptes
1739 */
1740 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1741 continue;
1742
1755 do { 1743 do {
1756 start = max(start, vma->vm_start); 1744 start = max(start, vma->vm_start);
1757 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1745 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -3015,8 +3003,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3015 } 3003 }
3016} 3004}
3017 3005
3018static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3006static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3019 unsigned long delta_exec)
3020{ 3007{
3021 /* dock delta_exec before expiring quota (as it could span periods) */ 3008 /* dock delta_exec before expiring quota (as it could span periods) */
3022 cfs_rq->runtime_remaining -= delta_exec; 3009 cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3021,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3034} 3021}
3035 3022
3036static __always_inline 3023static __always_inline
3037void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) 3024void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3038{ 3025{
3039 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 3026 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3040 return; 3027 return;
@@ -3574,8 +3561,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3574 return rq_clock_task(rq_of(cfs_rq)); 3561 return rq_clock_task(rq_of(cfs_rq));
3575} 3562}
3576 3563
3577static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3564static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3578 unsigned long delta_exec) {}
3579static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3565static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3580static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3566static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3581static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3567static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -3932,7 +3918,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3932{ 3918{
3933 struct sched_entity *se = tg->se[cpu]; 3919 struct sched_entity *se = tg->se[cpu];
3934 3920
3935 if (!tg->parent || !wl) /* the trivial, non-cgroup case */ 3921 if (!tg->parent) /* the trivial, non-cgroup case */
3936 return wl; 3922 return wl;
3937 3923
3938 for_each_sched_entity(se) { 3924 for_each_sched_entity(se) {
@@ -4110,12 +4096,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4110 */ 4096 */
4111static struct sched_group * 4097static struct sched_group *
4112find_idlest_group(struct sched_domain *sd, struct task_struct *p, 4098find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4113 int this_cpu, int load_idx) 4099 int this_cpu, int sd_flag)
4114{ 4100{
4115 struct sched_group *idlest = NULL, *group = sd->groups; 4101 struct sched_group *idlest = NULL, *group = sd->groups;
4116 unsigned long min_load = ULONG_MAX, this_load = 0; 4102 unsigned long min_load = ULONG_MAX, this_load = 0;
4103 int load_idx = sd->forkexec_idx;
4117 int imbalance = 100 + (sd->imbalance_pct-100)/2; 4104 int imbalance = 100 + (sd->imbalance_pct-100)/2;
4118 4105
4106 if (sd_flag & SD_BALANCE_WAKE)
4107 load_idx = sd->wake_idx;
4108
4119 do { 4109 do {
4120 unsigned long load, avg_load; 4110 unsigned long load, avg_load;
4121 int local_group; 4111 int local_group;
@@ -4283,7 +4273,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4283 } 4273 }
4284 4274
4285 while (sd) { 4275 while (sd) {
4286 int load_idx = sd->forkexec_idx;
4287 struct sched_group *group; 4276 struct sched_group *group;
4288 int weight; 4277 int weight;
4289 4278
@@ -4292,10 +4281,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4292 continue; 4281 continue;
4293 } 4282 }
4294 4283
4295 if (sd_flag & SD_BALANCE_WAKE) 4284 group = find_idlest_group(sd, p, cpu, sd_flag);
4296 load_idx = sd->wake_idx;
4297
4298 group = find_idlest_group(sd, p, cpu, load_idx);
4299 if (!group) { 4285 if (!group) {
4300 sd = sd->child; 4286 sd = sd->child;
4301 continue; 4287 continue;
@@ -5379,10 +5365,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
5379 */ 5365 */
5380 5366
5381 for_each_cpu(cpu, sched_group_cpus(sdg)) { 5367 for_each_cpu(cpu, sched_group_cpus(sdg)) {
5382 struct sched_group *sg = cpu_rq(cpu)->sd->groups; 5368 struct sched_group_power *sgp;
5369 struct rq *rq = cpu_rq(cpu);
5370
5371 /*
5372 * build_sched_domains() -> init_sched_groups_power()
5373 * gets here before we've attached the domains to the
5374 * runqueues.
5375 *
5376 * Use power_of(), which is set irrespective of domains
5377 * in update_cpu_power().
5378 *
5379 * This avoids power/power_orig from being 0 and
5380 * causing divide-by-zero issues on boot.
5381 *
5382 * Runtime updates will correct power_orig.
5383 */
5384 if (unlikely(!rq->sd)) {
5385 power_orig += power_of(cpu);
5386 power += power_of(cpu);
5387 continue;
5388 }
5383 5389
5384 power_orig += sg->sgp->power_orig; 5390 sgp = rq->sd->groups->sgp;
5385 power += sg->sgp->power; 5391 power_orig += sgp->power_orig;
5392 power += sgp->power;
5386 } 5393 }
5387 } else { 5394 } else {
5388 /* 5395 /*
@@ -5500,7 +5507,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5500 struct sched_group *group, int load_idx, 5507 struct sched_group *group, int load_idx,
5501 int local_group, struct sg_lb_stats *sgs) 5508 int local_group, struct sg_lb_stats *sgs)
5502{ 5509{
5503 unsigned long nr_running;
5504 unsigned long load; 5510 unsigned long load;
5505 int i; 5511 int i;
5506 5512
@@ -5509,8 +5515,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5509 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5515 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5510 struct rq *rq = cpu_rq(i); 5516 struct rq *rq = cpu_rq(i);
5511 5517
5512 nr_running = rq->nr_running;
5513
5514 /* Bias balancing toward cpus of our domain */ 5518 /* Bias balancing toward cpus of our domain */
5515 if (local_group) 5519 if (local_group)
5516 load = target_load(i, load_idx); 5520 load = target_load(i, load_idx);
@@ -5518,7 +5522,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5518 load = source_load(i, load_idx); 5522 load = source_load(i, load_idx);
5519 5523
5520 sgs->group_load += load; 5524 sgs->group_load += load;
5521 sgs->sum_nr_running += nr_running; 5525 sgs->sum_nr_running += rq->nr_running;
5522#ifdef CONFIG_NUMA_BALANCING 5526#ifdef CONFIG_NUMA_BALANCING
5523 sgs->nr_numa_running += rq->nr_numa_running; 5527 sgs->nr_numa_running += rq->nr_numa_running;
5524 sgs->nr_preferred_running += rq->nr_preferred_running; 5528 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6509,7 +6513,7 @@ static struct {
6509 unsigned long next_balance; /* in jiffy units */ 6513 unsigned long next_balance; /* in jiffy units */
6510} nohz ____cacheline_aligned; 6514} nohz ____cacheline_aligned;
6511 6515
6512static inline int find_new_ilb(int call_cpu) 6516static inline int find_new_ilb(void)
6513{ 6517{
6514 int ilb = cpumask_first(nohz.idle_cpus_mask); 6518 int ilb = cpumask_first(nohz.idle_cpus_mask);
6515 6519
@@ -6524,13 +6528,13 @@ static inline int find_new_ilb(int call_cpu)
6524 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle 6528 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6525 * CPU (if there is one). 6529 * CPU (if there is one).
6526 */ 6530 */
6527static void nohz_balancer_kick(int cpu) 6531static void nohz_balancer_kick(void)
6528{ 6532{
6529 int ilb_cpu; 6533 int ilb_cpu;
6530 6534
6531 nohz.next_balance++; 6535 nohz.next_balance++;
6532 6536
6533 ilb_cpu = find_new_ilb(cpu); 6537 ilb_cpu = find_new_ilb();
6534 6538
6535 if (ilb_cpu >= nr_cpu_ids) 6539 if (ilb_cpu >= nr_cpu_ids)
6536 return; 6540 return;
@@ -6640,10 +6644,10 @@ void update_max_interval(void)
6640 * 6644 *
6641 * Balancing parameters are set up in init_sched_domains. 6645 * Balancing parameters are set up in init_sched_domains.
6642 */ 6646 */
6643static void rebalance_domains(int cpu, enum cpu_idle_type idle) 6647static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
6644{ 6648{
6645 int continue_balancing = 1; 6649 int continue_balancing = 1;
6646 struct rq *rq = cpu_rq(cpu); 6650 int cpu = rq->cpu;
6647 unsigned long interval; 6651 unsigned long interval;
6648 struct sched_domain *sd; 6652 struct sched_domain *sd;
6649 /* Earliest time when we have to do rebalance again */ 6653 /* Earliest time when we have to do rebalance again */
@@ -6740,9 +6744,9 @@ out:
6740 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 6744 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
6741 * rebalancing for all the cpus for whom scheduler ticks are stopped. 6745 * rebalancing for all the cpus for whom scheduler ticks are stopped.
6742 */ 6746 */
6743static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 6747static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
6744{ 6748{
6745 struct rq *this_rq = cpu_rq(this_cpu); 6749 int this_cpu = this_rq->cpu;
6746 struct rq *rq; 6750 struct rq *rq;
6747 int balance_cpu; 6751 int balance_cpu;
6748 6752
@@ -6769,7 +6773,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
6769 update_idle_cpu_load(rq); 6773 update_idle_cpu_load(rq);
6770 raw_spin_unlock_irq(&rq->lock); 6774 raw_spin_unlock_irq(&rq->lock);
6771 6775
6772 rebalance_domains(balance_cpu, CPU_IDLE); 6776 rebalance_domains(rq, CPU_IDLE);
6773 6777
6774 if (time_after(this_rq->next_balance, rq->next_balance)) 6778 if (time_after(this_rq->next_balance, rq->next_balance))
6775 this_rq->next_balance = rq->next_balance; 6779 this_rq->next_balance = rq->next_balance;
@@ -6788,14 +6792,14 @@ end:
6788 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 6792 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
6789 * domain span are idle. 6793 * domain span are idle.
6790 */ 6794 */
6791static inline int nohz_kick_needed(struct rq *rq, int cpu) 6795static inline int nohz_kick_needed(struct rq *rq)
6792{ 6796{
6793 unsigned long now = jiffies; 6797 unsigned long now = jiffies;
6794 struct sched_domain *sd; 6798 struct sched_domain *sd;
6795 struct sched_group_power *sgp; 6799 struct sched_group_power *sgp;
6796 int nr_busy; 6800 int nr_busy, cpu = rq->cpu;
6797 6801
6798 if (unlikely(idle_cpu(cpu))) 6802 if (unlikely(rq->idle_balance))
6799 return 0; 6803 return 0;
6800 6804
6801 /* 6805 /*
@@ -6844,7 +6848,7 @@ need_kick:
6844 return 1; 6848 return 1;
6845} 6849}
6846#else 6850#else
6847static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 6851static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
6848#endif 6852#endif
6849 6853
6850/* 6854/*
@@ -6853,38 +6857,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
6853 */ 6857 */
6854static void run_rebalance_domains(struct softirq_action *h) 6858static void run_rebalance_domains(struct softirq_action *h)
6855{ 6859{
6856 int this_cpu = smp_processor_id(); 6860 struct rq *this_rq = this_rq();
6857 struct rq *this_rq = cpu_rq(this_cpu);
6858 enum cpu_idle_type idle = this_rq->idle_balance ? 6861 enum cpu_idle_type idle = this_rq->idle_balance ?
6859 CPU_IDLE : CPU_NOT_IDLE; 6862 CPU_IDLE : CPU_NOT_IDLE;
6860 6863
6861 rebalance_domains(this_cpu, idle); 6864 rebalance_domains(this_rq, idle);
6862 6865
6863 /* 6866 /*
6864 * If this cpu has a pending nohz_balance_kick, then do the 6867 * If this cpu has a pending nohz_balance_kick, then do the
6865 * balancing on behalf of the other idle cpus whose ticks are 6868 * balancing on behalf of the other idle cpus whose ticks are
6866 * stopped. 6869 * stopped.
6867 */ 6870 */
6868 nohz_idle_balance(this_cpu, idle); 6871 nohz_idle_balance(this_rq, idle);
6869} 6872}
6870 6873
6871static inline int on_null_domain(int cpu) 6874static inline int on_null_domain(struct rq *rq)
6872{ 6875{
6873 return !rcu_dereference_sched(cpu_rq(cpu)->sd); 6876 return !rcu_dereference_sched(rq->sd);
6874} 6877}
6875 6878
6876/* 6879/*
6877 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 6880 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6878 */ 6881 */
6879void trigger_load_balance(struct rq *rq, int cpu) 6882void trigger_load_balance(struct rq *rq)
6880{ 6883{
6881 /* Don't need to rebalance while attached to NULL domain */ 6884 /* Don't need to rebalance while attached to NULL domain */
6882 if (time_after_eq(jiffies, rq->next_balance) && 6885 if (unlikely(on_null_domain(rq)))
6883 likely(!on_null_domain(cpu))) 6886 return;
6887
6888 if (time_after_eq(jiffies, rq->next_balance))
6884 raise_softirq(SCHED_SOFTIRQ); 6889 raise_softirq(SCHED_SOFTIRQ);
6885#ifdef CONFIG_NO_HZ_COMMON 6890#ifdef CONFIG_NO_HZ_COMMON
6886 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 6891 if (nohz_kick_needed(rq))
6887 nohz_balancer_kick(cpu); 6892 nohz_balancer_kick();
6888#endif 6893#endif
6889} 6894}
6890 6895
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d57275fc396..a2740b775b45 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -901,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
901{ 901{
902 struct rq *rq = rq_of_rt_rq(rt_rq); 902 struct rq *rq = rq_of_rt_rq(rt_rq);
903 903
904#ifdef CONFIG_RT_GROUP_SCHED
905 /*
906 * Change rq's cpupri only if rt_rq is the top queue.
907 */
908 if (&rq->rt != rt_rq)
909 return;
910#endif
904 if (rq->online && prio < prev_prio) 911 if (rq->online && prio < prev_prio)
905 cpupri_set(&rq->rd->cpupri, rq->cpu, prio); 912 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
906} 913}
@@ -910,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
910{ 917{
911 struct rq *rq = rq_of_rt_rq(rt_rq); 918 struct rq *rq = rq_of_rt_rq(rt_rq);
912 919
920#ifdef CONFIG_RT_GROUP_SCHED
921 /*
922 * Change rq's cpupri only if rt_rq is the top queue.
923 */
924 if (&rq->rt != rt_rq)
925 return;
926#endif
913 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 927 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
914 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 928 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
915} 929}
@@ -1724,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1724 !test_tsk_need_resched(rq->curr) && 1738 !test_tsk_need_resched(rq->curr) &&
1725 has_pushable_tasks(rq) && 1739 has_pushable_tasks(rq) &&
1726 p->nr_cpus_allowed > 1 && 1740 p->nr_cpus_allowed > 1 &&
1727 rt_task(rq->curr) && 1741 (dl_task(rq->curr) || rt_task(rq->curr)) &&
1728 (rq->curr->nr_cpus_allowed < 2 || 1742 (rq->curr->nr_cpus_allowed < 2 ||
1729 rq->curr->prio <= p->prio)) 1743 rq->curr->prio <= p->prio))
1730 push_rt_tasks(rq); 1744 push_rt_tasks(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 88c85b21d633..c2119fd20f8b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/sched/deadline.h>
5#include <linux/mutex.h> 6#include <linux/mutex.h>
6#include <linux/spinlock.h> 7#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 8#include <linux/stop_machine.h>
@@ -9,6 +10,7 @@
9#include <linux/slab.h> 10#include <linux/slab.h>
10 11
11#include "cpupri.h" 12#include "cpupri.h"
13#include "cpudeadline.h"
12#include "cpuacct.h" 14#include "cpuacct.h"
13 15
14struct rq; 16struct rq;
@@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
73#define NICE_0_SHIFT SCHED_LOAD_SHIFT 75#define NICE_0_SHIFT SCHED_LOAD_SHIFT
74 76
75/* 77/*
78 * Single value that decides SCHED_DEADLINE internal math precision.
79 * 10 -> just above 1us
80 * 9 -> just above 0.5us
81 */
82#define DL_SCALE (10)
83
84/*
76 * These are the 'tuning knobs' of the scheduler: 85 * These are the 'tuning knobs' of the scheduler:
77 */ 86 */
78 87
@@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq);
81 */ 90 */
82#define RUNTIME_INF ((u64)~0ULL) 91#define RUNTIME_INF ((u64)~0ULL)
83 92
93static inline int fair_policy(int policy)
94{
95 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
96}
97
84static inline int rt_policy(int policy) 98static inline int rt_policy(int policy)
85{ 99{
86 if (policy == SCHED_FIFO || policy == SCHED_RR) 100 return policy == SCHED_FIFO || policy == SCHED_RR;
87 return 1; 101}
88 return 0; 102
103static inline int dl_policy(int policy)
104{
105 return policy == SCHED_DEADLINE;
89} 106}
90 107
91static inline int task_has_rt_policy(struct task_struct *p) 108static inline int task_has_rt_policy(struct task_struct *p)
@@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
93 return rt_policy(p->policy); 110 return rt_policy(p->policy);
94} 111}
95 112
113static inline int task_has_dl_policy(struct task_struct *p)
114{
115 return dl_policy(p->policy);
116}
117
118static inline bool dl_time_before(u64 a, u64 b)
119{
120 return (s64)(a - b) < 0;
121}
122
123/*
124 * Tells if entity @a should preempt entity @b.
125 */
126static inline bool
127dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
128{
129 return dl_time_before(a->deadline, b->deadline);
130}
131
96/* 132/*
97 * This is the priority-queue data structure of the RT scheduling class: 133 * This is the priority-queue data structure of the RT scheduling class:
98 */ 134 */
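
dl_time_before() above uses the classic wrap-safe comparison: subtract in unsigned arithmetic, then look at the sign of the result, much like the jiffies time_before() helpers. A stand-alone check of why the signed cast matters (a sketch, not kernel code):

#include <assert.h>
#include <stdint.h>

static int dl_time_before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t near_wrap  = UINT64_MAX - 100; /* deadline just before the clock wraps */
        uint64_t after_wrap = 50;               /* deadline just after the wrap */

        assert(dl_time_before(near_wrap, after_wrap));  /* ordering survives the wrap */
        assert(!(near_wrap < after_wrap));              /* a naive compare gets it wrong */
        return 0;
}
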
@@ -108,6 +144,47 @@ struct rt_bandwidth {
108 u64 rt_runtime; 144 u64 rt_runtime;
109 struct hrtimer rt_period_timer; 145 struct hrtimer rt_period_timer;
110}; 146};
147/*
148 * To keep the bandwidth of -deadline tasks and groups under control
149 * we need some place where:
150 * - store the maximum -deadline bandwidth of the system (the group);
151 * - cache the fraction of that bandwidth that is currently allocated.
152 *
153 * This is all done in the data structure below. It is similar to the
154 * one used for RT-throttling (rt_bandwidth), with the main difference
155 * that, since here we are only interested in admission control, we
156 * do not decrease any runtime while the group "executes", nor do we
157 * need a timer to replenish it.
158 *
159 * With respect to SMP, the bandwidth is given on a per-CPU basis,
160 * meaning that:
161 * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
162 * - the dl_total_bw array contains, in its i-th element, the currently
163 * allocated bandwidth on the i-th CPU.
164 * Moreover, groups consume bandwidth on each CPU, while tasks only
165 * consume bandwidth on the CPU they're running on.
166 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
167 * that will be shown the next time the proc or cgroup controls are
168 * read. In turn, it can be changed by writing to its own
169 * control.
170 */
171struct dl_bandwidth {
172 raw_spinlock_t dl_runtime_lock;
173 u64 dl_runtime;
174 u64 dl_period;
175};
176
177static inline int dl_bandwidth_enabled(void)
178{
179 return sysctl_sched_rt_runtime >= 0;
180}
181
182extern struct dl_bw *dl_bw_of(int i);
183
184struct dl_bw {
185 raw_spinlock_t lock;
186 u64 bw, total_bw;
187};
111 188
112extern struct mutex sched_domains_mutex; 189extern struct mutex sched_domains_mutex;
113 190
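
A stand-alone sketch of the admission arithmetic this dl_bw bookkeeping backs: each task contributes runtime/period worth of a CPU, and a new task is accepted only while the allocated total stays below the cap replicated over the CPUs of the root domain. The 2^20 fixed-point scaling and the cpus multiplier mirror to_ratio() and __dl_overflow() in core.c, which are outside this hunk, so treat them here as assumptions:

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT        20

/* runtime/period as a 2^20 fixed-point fraction (only the ratio matters) */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
        return (runtime << BW_SHIFT) / period;
}

/* Accept a task iff the allocated bandwidth stays within cpus * cap_bw. */
static int dl_admit(uint64_t *total_bw, uint64_t cap_bw, int cpus,
                    uint64_t runtime, uint64_t period)
{
        uint64_t new_bw = to_ratio(period, runtime);

        if (*total_bw + new_bw > (uint64_t)cpus * cap_bw)
                return 0;
        *total_bw += new_bw;
        return 1;
}

int main(void)
{
        uint64_t total = 0;
        /* default cap: 95% of each CPU (sched_rt_runtime_us / sched_rt_period_us) */
        uint64_t cap = to_ratio(1000000, 950000);
        int i;

        /* 10 ms every 30 ms is ~33% of a CPU: two such tasks fit under the
         * 95% cap on a single CPU, the third is rejected. */
        for (i = 0; i < 3; i++)
                printf("task %d: %s\n", i,
                       dl_admit(&total, cap, 1, 10000000, 30000000) ?
                       "admitted" : "rejected");
        return 0;
}
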
@@ -364,6 +441,42 @@ struct rt_rq {
364#endif 441#endif
365}; 442};
366 443
444/* Deadline class' related fields in a runqueue */
445struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */
447 struct rb_root rb_root;
448 struct rb_node *rb_leftmost;
449
450 unsigned long dl_nr_running;
451
452#ifdef CONFIG_SMP
453 /*
454 * Deadline values of the currently executing and the
455 * earliest ready task on this rq. Caching these facilitates
456	 * the decision whether or not a ready but not running task
457 * should migrate somewhere else.
458 */
459 struct {
460 u64 curr;
461 u64 next;
462 } earliest_dl;
463
464 unsigned long dl_nr_migratory;
465 unsigned long dl_nr_total;
466 int overloaded;
467
468 /*
469 * Tasks on this rq that can be pushed away. They are kept in
470 * an rb-tree, ordered by tasks' deadlines, with caching
471 * of the leftmost (earliest deadline) element.
472 */
473 struct rb_root pushable_dl_tasks_root;
474 struct rb_node *pushable_dl_tasks_leftmost;
475#else
476 struct dl_bw dl_bw;
477#endif
478};
479
367#ifdef CONFIG_SMP 480#ifdef CONFIG_SMP
368 481
369/* 482/*
@@ -382,6 +495,15 @@ struct root_domain {
382 cpumask_var_t online; 495 cpumask_var_t online;
383 496
384 /* 497 /*
498 * The bit corresponding to a CPU gets set here if such CPU has more
499 * than one runnable -deadline task (as it is below for RT tasks).
500 */
501 cpumask_var_t dlo_mask;
502 atomic_t dlo_count;
503 struct dl_bw dl_bw;
504 struct cpudl cpudl;
505
506 /*
385 * The "RT overload" flag: it gets set if a CPU has more than 507 * The "RT overload" flag: it gets set if a CPU has more than
386 * one runnable RT task. 508 * one runnable RT task.
387 */ 509 */
@@ -432,6 +554,7 @@ struct rq {
432 554
433 struct cfs_rq cfs; 555 struct cfs_rq cfs;
434 struct rt_rq rt; 556 struct rt_rq rt;
557 struct dl_rq dl;
435 558
436#ifdef CONFIG_FAIR_GROUP_SCHED 559#ifdef CONFIG_FAIR_GROUP_SCHED
437 /* list of leaf cfs_rq on this cpu: */ 560 /* list of leaf cfs_rq on this cpu: */
@@ -827,8 +950,6 @@ static inline u64 global_rt_runtime(void)
827 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 950 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
828} 951}
829 952
830
831
832static inline int task_current(struct rq *rq, struct task_struct *p) 953static inline int task_current(struct rq *rq, struct task_struct *p)
833{ 954{
834 return rq->curr == p; 955 return rq->curr == p;
@@ -988,6 +1109,7 @@ static const u32 prio_to_wmult[40] = {
988#else 1109#else
989#define ENQUEUE_WAKING 0 1110#define ENQUEUE_WAKING 0
990#endif 1111#endif
1112#define ENQUEUE_REPLENISH 8
991 1113
992#define DEQUEUE_SLEEP 1 1114#define DEQUEUE_SLEEP 1
993 1115
@@ -1023,6 +1145,7 @@ struct sched_class {
1023 void (*set_curr_task) (struct rq *rq); 1145 void (*set_curr_task) (struct rq *rq);
1024 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1146 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1025 void (*task_fork) (struct task_struct *p); 1147 void (*task_fork) (struct task_struct *p);
1148 void (*task_dead) (struct task_struct *p);
1026 1149
1027 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1150 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1028 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1151 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
@@ -1042,6 +1165,7 @@ struct sched_class {
1042 for (class = sched_class_highest; class; class = class->next) 1165 for (class = sched_class_highest; class; class = class->next)
1043 1166
1044extern const struct sched_class stop_sched_class; 1167extern const struct sched_class stop_sched_class;
1168extern const struct sched_class dl_sched_class;
1045extern const struct sched_class rt_sched_class; 1169extern const struct sched_class rt_sched_class;
1046extern const struct sched_class fair_sched_class; 1170extern const struct sched_class fair_sched_class;
1047extern const struct sched_class idle_sched_class; 1171extern const struct sched_class idle_sched_class;
@@ -1051,7 +1175,7 @@ extern const struct sched_class idle_sched_class;
1051 1175
1052extern void update_group_power(struct sched_domain *sd, int cpu); 1176extern void update_group_power(struct sched_domain *sd, int cpu);
1053 1177
1054extern void trigger_load_balance(struct rq *rq, int cpu); 1178extern void trigger_load_balance(struct rq *rq);
1055extern void idle_balance(int this_cpu, struct rq *this_rq); 1179extern void idle_balance(int this_cpu, struct rq *this_rq);
1056 1180
1057extern void idle_enter_fair(struct rq *this_rq); 1181extern void idle_enter_fair(struct rq *this_rq);
@@ -1068,8 +1192,11 @@ static inline void idle_balance(int cpu, struct rq *rq)
1068extern void sysrq_sched_debug_show(void); 1192extern void sysrq_sched_debug_show(void);
1069extern void sched_init_granularity(void); 1193extern void sched_init_granularity(void);
1070extern void update_max_interval(void); 1194extern void update_max_interval(void);
1195
1196extern void init_sched_dl_class(void);
1071extern void init_sched_rt_class(void); 1197extern void init_sched_rt_class(void);
1072extern void init_sched_fair_class(void); 1198extern void init_sched_fair_class(void);
1199extern void init_sched_dl_class(void);
1073 1200
1074extern void resched_task(struct task_struct *p); 1201extern void resched_task(struct task_struct *p);
1075extern void resched_cpu(int cpu); 1202extern void resched_cpu(int cpu);
@@ -1077,6 +1204,12 @@ extern void resched_cpu(int cpu);
1077extern struct rt_bandwidth def_rt_bandwidth; 1204extern struct rt_bandwidth def_rt_bandwidth;
1078extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 1205extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1079 1206
1207extern struct dl_bandwidth def_dl_bandwidth;
1208extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1209extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1210
1211unsigned long to_ratio(u64 period, u64 runtime);
1212
1080extern void update_idle_cpu_load(struct rq *this_rq); 1213extern void update_idle_cpu_load(struct rq *this_rq);
1081 1214
1082extern void init_task_runnable_average(struct task_struct *p); 1215extern void init_task_runnable_average(struct task_struct *p);
@@ -1353,6 +1486,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1353 1486
1354extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1487extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1488extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1489extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
1356 1490
1357extern void cfs_bandwidth_usage_inc(void); 1491extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void); 1492extern void cfs_bandwidth_usage_dec(void);
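Among the declarations added above, to_ratio(period, runtime) is the helper the deadline admission-control code uses to turn a runtime/period pair into a fixed-point utilization that can be summed and compared cheaply. Its body is not part of this hunk; the following is only a sketch of one plausible implementation, and the 20-bit shift and the RUNTIME_INF special case are assumptions rather than something this patch shows.

/* Hedged sketch: runtime/period as a fixed-point ratio (20 fractional bits). */
static unsigned long to_ratio_sketch(u64 period, u64 runtime)
{
	if (runtime == RUNTIME_INF)		/* unlimited runtime: full ratio */
		return 1ULL << 20;

	if (!period)				/* guard against division by zero */
		return 0;

	return div64_u64(runtime << 20, period);	/* from <linux/math64.h> */
}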
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 47197de8abd9..fdb6bb0b3356 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
103 * Simple, special scheduling class for the per-CPU stop tasks: 103 * Simple, special scheduling class for the per-CPU stop tasks:
104 */ 104 */
105const struct sched_class stop_sched_class = { 105const struct sched_class stop_sched_class = {
106 .next = &rt_sched_class, 106 .next = &dl_sched_class,
107 107
108 .enqueue_task = enqueue_task_stop, 108 .enqueue_task = enqueue_task_stop,
109 .dequeue_task = dequeue_task_stop, 109 .dequeue_task = dequeue_task_stop,
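Chaining dl_sched_class between the stop and rt classes gives the scheduler's class walk the priority order stop > deadline > rt > fair > idle. For orientation, here is a hedged sketch of how that singly linked list is typically walked when picking the next task; it mirrors the for_each_class() loop visible in the sched.h hunk above and assumes the 3.13-era pick_next_task(rq) signature, so treat it as illustrative rather than a quote of core.c.

/* Illustrative walk over the class list, highest priority first. */
static struct task_struct *sketch_pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	for_each_class(class) {		/* stop -> dl -> rt -> fair -> idle */
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	/* Not reached: the idle class always returns a task. */
	return NULL;
}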
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 11025ccc06dd..8a1e6e104892 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -89,7 +89,7 @@ static void wakeup_softirqd(void)
89 * where hardirqs are disabled legitimately: 89 * where hardirqs are disabled legitimately:
90 */ 90 */
91#ifdef CONFIG_TRACE_IRQFLAGS 91#ifdef CONFIG_TRACE_IRQFLAGS
92static void __local_bh_disable(unsigned long ip, unsigned int cnt) 92void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
93{ 93{
94 unsigned long flags; 94 unsigned long flags;
95 95
@@ -107,33 +107,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
107 /* 107 /*
108 * Were softirqs turned off above: 108 * Were softirqs turned off above:
109 */ 109 */
110 if (softirq_count() == cnt) 110 if (softirq_count() == (cnt & SOFTIRQ_MASK))
111 trace_softirqs_off(ip); 111 trace_softirqs_off(ip);
112 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
113 113
114 if (preempt_count() == cnt) 114 if (preempt_count() == cnt)
115 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 115 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
116} 116}
117#else /* !CONFIG_TRACE_IRQFLAGS */ 117EXPORT_SYMBOL(__local_bh_disable_ip);
118static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
119{
120 preempt_count_add(cnt);
121 barrier();
122}
123#endif /* CONFIG_TRACE_IRQFLAGS */ 118#endif /* CONFIG_TRACE_IRQFLAGS */
124 119
125void local_bh_disable(void)
126{
127 __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
128}
129
130EXPORT_SYMBOL(local_bh_disable);
131
132static void __local_bh_enable(unsigned int cnt) 120static void __local_bh_enable(unsigned int cnt)
133{ 121{
134 WARN_ON_ONCE(!irqs_disabled()); 122 WARN_ON_ONCE(!irqs_disabled());
135 123
136 if (softirq_count() == cnt) 124 if (softirq_count() == (cnt & SOFTIRQ_MASK))
137 trace_softirqs_on(_RET_IP_); 125 trace_softirqs_on(_RET_IP_);
138 preempt_count_sub(cnt); 126 preempt_count_sub(cnt);
139} 127}
@@ -151,7 +139,7 @@ void _local_bh_enable(void)
151 139
152EXPORT_SYMBOL(_local_bh_enable); 140EXPORT_SYMBOL(_local_bh_enable);
153 141
154static inline void _local_bh_enable_ip(unsigned long ip) 142void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
155{ 143{
156 WARN_ON_ONCE(in_irq() || irqs_disabled()); 144 WARN_ON_ONCE(in_irq() || irqs_disabled());
157#ifdef CONFIG_TRACE_IRQFLAGS 145#ifdef CONFIG_TRACE_IRQFLAGS
@@ -166,7 +154,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
166 * Keep preemption disabled until we are done with 154 * Keep preemption disabled until we are done with
167 * softirq processing: 155 * softirq processing:
168 */ 156 */
169 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); 157 preempt_count_sub(cnt - 1);
170 158
171 if (unlikely(!in_interrupt() && local_softirq_pending())) { 159 if (unlikely(!in_interrupt() && local_softirq_pending())) {
172 /* 160 /*
@@ -182,18 +170,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
182#endif 170#endif
183 preempt_check_resched(); 171 preempt_check_resched();
184} 172}
185 173EXPORT_SYMBOL(__local_bh_enable_ip);
186void local_bh_enable(void)
187{
188 _local_bh_enable_ip(_RET_IP_);
189}
190EXPORT_SYMBOL(local_bh_enable);
191
192void local_bh_enable_ip(unsigned long ip)
193{
194 _local_bh_enable_ip(ip);
195}
196EXPORT_SYMBOL(local_bh_enable_ip);
197 174
198/* 175/*
199 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, 176 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
@@ -211,14 +188,48 @@ EXPORT_SYMBOL(local_bh_enable_ip);
211#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) 188#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
212#define MAX_SOFTIRQ_RESTART 10 189#define MAX_SOFTIRQ_RESTART 10
213 190
191#ifdef CONFIG_TRACE_IRQFLAGS
192/*
193 * When we run softirqs from irq_exit() and thus on the hardirq stack we need
194 * to keep the lockdep irq context tracking as tight as possible in order to
195 * not mis-qualify lock contexts and miss possible deadlocks.
196 */
197
198static inline bool lockdep_softirq_start(void)
199{
200 bool in_hardirq = false;
201
202 if (trace_hardirq_context(current)) {
203 in_hardirq = true;
204 trace_hardirq_exit();
205 }
206
207 lockdep_softirq_enter();
208
209 return in_hardirq;
210}
211
212static inline void lockdep_softirq_end(bool in_hardirq)
213{
214 lockdep_softirq_exit();
215
216 if (in_hardirq)
217 trace_hardirq_enter();
218}
219#else
220static inline bool lockdep_softirq_start(void) { return false; }
221static inline void lockdep_softirq_end(bool in_hardirq) { }
222#endif
223
214asmlinkage void __do_softirq(void) 224asmlinkage void __do_softirq(void)
215{ 225{
216 struct softirq_action *h;
217 __u32 pending;
218 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 226 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
219 int cpu;
220 unsigned long old_flags = current->flags; 227 unsigned long old_flags = current->flags;
221 int max_restart = MAX_SOFTIRQ_RESTART; 228 int max_restart = MAX_SOFTIRQ_RESTART;
229 struct softirq_action *h;
230 bool in_hardirq;
231 __u32 pending;
232 int cpu;
222 233
223 /* 234 /*
224 * Mask out PF_MEMALLOC as current task context is borrowed for the 235 * Mask out PF_MEMALLOC as current task context is borrowed for the
@@ -230,8 +241,8 @@ asmlinkage void __do_softirq(void)
230 pending = local_softirq_pending(); 241 pending = local_softirq_pending();
231 account_irq_enter_time(current); 242 account_irq_enter_time(current);
232 243
233 __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); 244 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
234 lockdep_softirq_enter(); 245 in_hardirq = lockdep_softirq_start();
235 246
236 cpu = smp_processor_id(); 247 cpu = smp_processor_id();
237restart: 248restart:
@@ -278,16 +289,13 @@ restart:
278 wakeup_softirqd(); 289 wakeup_softirqd();
279 } 290 }
280 291
281 lockdep_softirq_exit(); 292 lockdep_softirq_end(in_hardirq);
282
283 account_irq_exit_time(current); 293 account_irq_exit_time(current);
284 __local_bh_enable(SOFTIRQ_OFFSET); 294 __local_bh_enable(SOFTIRQ_OFFSET);
285 WARN_ON_ONCE(in_interrupt()); 295 WARN_ON_ONCE(in_interrupt());
286 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 296 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
287} 297}
288 298
289
290
291asmlinkage void do_softirq(void) 299asmlinkage void do_softirq(void)
292{ 300{
293 __u32 pending; 301 __u32 pending;
@@ -311,8 +319,6 @@ asmlinkage void do_softirq(void)
311 */ 319 */
312void irq_enter(void) 320void irq_enter(void)
313{ 321{
314 int cpu = smp_processor_id();
315
316 rcu_irq_enter(); 322 rcu_irq_enter();
317 if (is_idle_task(current) && !in_interrupt()) { 323 if (is_idle_task(current) && !in_interrupt()) {
318 /* 324 /*
@@ -320,7 +326,7 @@ void irq_enter(void)
320 * here, as softirq will be serviced on return from interrupt. 326 * here, as softirq will be serviced on return from interrupt.
321 */ 327 */
322 local_bh_disable(); 328 local_bh_disable();
323 tick_check_idle(cpu); 329 tick_check_idle();
324 _local_bh_enable(); 330 _local_bh_enable();
325 } 331 }
326 332
@@ -375,13 +381,13 @@ void irq_exit(void)
375#endif 381#endif
376 382
377 account_irq_exit_time(current); 383 account_irq_exit_time(current);
378 trace_hardirq_exit();
379 preempt_count_sub(HARDIRQ_OFFSET); 384 preempt_count_sub(HARDIRQ_OFFSET);
380 if (!in_interrupt() && local_softirq_pending()) 385 if (!in_interrupt() && local_softirq_pending())
381 invoke_softirq(); 386 invoke_softirq();
382 387
383 tick_irq_exit(); 388 tick_irq_exit();
384 rcu_irq_exit(); 389 rcu_irq_exit();
390 trace_hardirq_exit(); /* must be last! */
385} 391}
386 392
387/* 393/*
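With the out-of-line local_bh_disable()/local_bh_enable() wrappers removed above and __local_bh_disable_ip()/__local_bh_enable_ip() exported instead, callers presumably reach these entry points through small inline helpers in a header. The following is a hedged sketch of what such wrappers could look like; the header placement and the use of _THIS_IP_ are assumptions, not something shown in this hunk.

/* Sketch of inline wrappers around the exported *_ip() entry points. */
static inline void local_bh_disable(void)
{
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable_ip(unsigned long ip)
{
	__local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable(void)
{
	__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}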
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 34a604726d0b..332cefcdb04b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -95,8 +95,6 @@
95#if defined(CONFIG_SYSCTL) 95#if defined(CONFIG_SYSCTL)
96 96
97/* External variables not in a header file. */ 97/* External variables not in a header file. */
98extern int sysctl_overcommit_memory;
99extern int sysctl_overcommit_ratio;
100extern int max_threads; 98extern int max_threads;
101extern int suid_dumpable; 99extern int suid_dumpable;
102#ifdef CONFIG_COREDUMP 100#ifdef CONFIG_COREDUMP
@@ -385,13 +383,6 @@ static struct ctl_table kern_table[] = {
385 .proc_handler = proc_dointvec, 383 .proc_handler = proc_dointvec,
386 }, 384 },
387 { 385 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred", 386 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred, 387 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int), 388 .maxlen = sizeof(unsigned int),
@@ -1128,7 +1119,14 @@ static struct ctl_table vm_table[] = {
1128 .data = &sysctl_overcommit_ratio, 1119 .data = &sysctl_overcommit_ratio,
1129 .maxlen = sizeof(sysctl_overcommit_ratio), 1120 .maxlen = sizeof(sysctl_overcommit_ratio),
1130 .mode = 0644, 1121 .mode = 0644,
1131 .proc_handler = proc_dointvec, 1122 .proc_handler = overcommit_ratio_handler,
1123 },
1124 {
1125 .procname = "overcommit_kbytes",
1126 .data = &sysctl_overcommit_kbytes,
1127 .maxlen = sizeof(sysctl_overcommit_kbytes),
1128 .mode = 0644,
1129 .proc_handler = overcommit_kbytes_handler,
1132 }, 1130 },
1133 { 1131 {
1134 .procname = "page-cluster", 1132 .procname = "page-cluster",
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
index 4aef390671cb..3e9868d47535 100644
--- a/kernel/system_certificates.S
+++ b/kernel/system_certificates.S
@@ -3,8 +3,18 @@
3 3
4 __INITRODATA 4 __INITRODATA
5 5
6 .align 8
6 .globl VMLINUX_SYMBOL(system_certificate_list) 7 .globl VMLINUX_SYMBOL(system_certificate_list)
7VMLINUX_SYMBOL(system_certificate_list): 8VMLINUX_SYMBOL(system_certificate_list):
9__cert_list_start:
8 .incbin "kernel/x509_certificate_list" 10 .incbin "kernel/x509_certificate_list"
9 .globl VMLINUX_SYMBOL(system_certificate_list_end) 11__cert_list_end:
10VMLINUX_SYMBOL(system_certificate_list_end): 12
13 .align 8
14 .globl VMLINUX_SYMBOL(system_certificate_list_size)
15VMLINUX_SYMBOL(system_certificate_list_size):
16#ifdef CONFIG_64BIT
17 .quad __cert_list_end - __cert_list_start
18#else
19 .long __cert_list_end - __cert_list_start
20#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 564dd93430a2..52ebc70263f4 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -22,7 +22,7 @@ struct key *system_trusted_keyring;
22EXPORT_SYMBOL_GPL(system_trusted_keyring); 22EXPORT_SYMBOL_GPL(system_trusted_keyring);
23 23
24extern __initconst const u8 system_certificate_list[]; 24extern __initconst const u8 system_certificate_list[];
25extern __initconst const u8 system_certificate_list_end[]; 25extern __initconst const unsigned long system_certificate_list_size;
26 26
27/* 27/*
28 * Load the compiled-in keys 28 * Load the compiled-in keys
@@ -60,8 +60,8 @@ static __init int load_system_certificate_list(void)
60 60
61 pr_notice("Loading compiled-in X.509 certificates\n"); 61 pr_notice("Loading compiled-in X.509 certificates\n");
62 62
63 end = system_certificate_list_end;
64 p = system_certificate_list; 63 p = system_certificate_list;
64 end = p + system_certificate_list_size;
65 while (p < end) { 65 while (p < end) {
66 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more 66 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
67 * than 256 bytes in size. 67 * than 256 bytes in size.
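The loader now walks the compiled-in blob from system_certificate_list for system_certificate_list_size bytes instead of stopping at an end label. As a hedged illustration of the step that walk depends on, the sketch below computes the length of one DER certificate; the two-byte length form (0x30 0x82 <hi> <lo>) is inferred from the ">256 bytes" comment above and is an assumption, not a quote of the loader.

/* Illustrative only: length of the DER certificate starting at @p, 0 on error. */
static size_t cert_blob_len(const u8 *p, const u8 *end)
{
	if (end - p < 4 || p[0] != 0x30 || p[1] != 0x82)
		return 0;				/* truncated or not SEQUENCE */

	return (((size_t)p[2] << 8) | p[3]) + 4;	/* 4-byte header + payload */
}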
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68b799375981..0abb36464281 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -74,7 +74,7 @@ unsigned long long notrace sched_clock(void)
74 return cd.epoch_ns; 74 return cd.epoch_ns;
75 75
76 do { 76 do {
77 seq = read_seqcount_begin(&cd.seq); 77 seq = raw_read_seqcount_begin(&cd.seq);
78 epoch_cyc = cd.epoch_cyc; 78 epoch_cyc = cd.epoch_cyc;
79 epoch_ns = cd.epoch_ns; 79 epoch_ns = cd.epoch_ns;
80 } while (read_seqcount_retry(&cd.seq, seq)); 80 } while (read_seqcount_retry(&cd.seq, seq));
@@ -99,10 +99,10 @@ static void notrace update_sched_clock(void)
99 cd.mult, cd.shift); 99 cd.mult, cd.shift);
100 100
101 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
102 write_seqcount_begin(&cd.seq); 102 raw_write_seqcount_begin(&cd.seq);
103 cd.epoch_ns = ns; 103 cd.epoch_ns = ns;
104 cd.epoch_cyc = cyc; 104 cd.epoch_cyc = cyc;
105 write_seqcount_end(&cd.seq); 105 raw_write_seqcount_end(&cd.seq);
106 raw_local_irq_restore(flags); 106 raw_local_irq_restore(flags);
107} 107}
108 108
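The switch to raw_read_seqcount_begin()/raw_write_seqcount_begin() keeps lockdep out of a path that may be entered from tracing and other contexts where lockdep must not run, while the reader keeps the usual retry loop. A minimal, hedged sketch of that read side, with the struct and field names mirroring this file but the helper itself made up:

/* Illustrative reader: spin until a consistent epoch snapshot is observed. */
static u64 read_epoch_ns(const struct clock_data *cd)
{
	unsigned seq;
	u64 ns;

	do {
		seq = raw_read_seqcount_begin(&cd->seq);	/* no lockdep hooks */
		ns = cd->epoch_ns;
	} while (read_seqcount_retry(&cd->seq, seq));

	return ns;
}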
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9532690daaa9..43780ab5e279 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -538,10 +538,10 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
538 * Called from irq_enter() when idle was interrupted to reenable the 538 * Called from irq_enter() when idle was interrupted to reenable the
539 * per cpu device. 539 * per cpu device.
540 */ 540 */
541void tick_check_oneshot_broadcast(int cpu) 541void tick_check_oneshot_broadcast_this_cpu(void)
542{ 542{
543 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { 543 if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
544 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 544 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
545 545
546 /* 546 /*
547 * We might be in the middle of switching over from 547 * We might be in the middle of switching over from
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 64522ecdfe0e..20b2fe37d105 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36
37/*
38 * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
39 * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
40 * variable has two functions:
41 *
42 * 1) Prevent a thundering herd issue of a gazillion CPUs trying to grab the
43 * timekeeping lock all at once. Only the CPU which is assigned to do the
44 * update is handling it.
45 *
46 * 2) Hand off the duty in the NOHZ idle case by setting the value to
47 * TICK_DO_TIMER_NONE, i.e. a non-existing CPU. So the next CPU which looks
48 * at it will take over and keep the timekeeping alive. The handover
49 * procedure also covers CPU hotplug.
50 */
36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 51int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37 52
38/* 53/*
@@ -70,6 +85,7 @@ static void tick_periodic(int cpu)
70 85
71 do_timer(1); 86 do_timer(1);
72 write_sequnlock(&jiffies_lock); 87 write_sequnlock(&jiffies_lock);
88 update_wall_time();
73 } 89 }
74 90
75 update_process_times(user_mode(get_irq_regs())); 91 update_process_times(user_mode(get_irq_regs()));
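The new comment documents the tick_do_timer_cpu hand-off, but the hand-off itself is not in this hunk. As a hedged sketch only, a CPU noticing the duty is unassigned might reclaim it roughly like this; the function name and call site are illustrative, not taken from the patch.

/* Illustrative sketch: reclaim the do_timer() duty if nobody owns it. */
static void sketch_take_do_timer_duty(int cpu, ktime_t now)
{
	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
		tick_do_timer_cpu = cpu;	/* previous owner went NOHZ idle */

	if (tick_do_timer_cpu == cpu)
		tick_do_update_jiffies64(now);	/* keep jiffies and wall time moving */
}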
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 18e71f7fbc2a..8329669b51ec 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -51,7 +51,7 @@ extern void tick_broadcast_switch_to_oneshot(void);
51extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 51extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
52extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 52extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
53extern int tick_broadcast_oneshot_active(void); 53extern int tick_broadcast_oneshot_active(void);
54extern void tick_check_oneshot_broadcast(int cpu); 54extern void tick_check_oneshot_broadcast_this_cpu(void);
55bool tick_broadcast_oneshot_available(void); 55bool tick_broadcast_oneshot_available(void);
56# else /* BROADCAST */ 56# else /* BROADCAST */
57static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 57static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
@@ -62,7 +62,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
62static inline void tick_broadcast_switch_to_oneshot(void) { } 62static inline void tick_broadcast_switch_to_oneshot(void) { }
63static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 63static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
64static inline int tick_broadcast_oneshot_active(void) { return 0; } 64static inline int tick_broadcast_oneshot_active(void) { return 0; }
65static inline void tick_check_oneshot_broadcast(int cpu) { } 65static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
66static inline bool tick_broadcast_oneshot_available(void) { return true; } 66static inline bool tick_broadcast_oneshot_available(void) { return true; }
67# endif /* !BROADCAST */ 67# endif /* !BROADCAST */
68 68
@@ -155,3 +155,4 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
155#endif 155#endif
156 156
157extern void do_timer(unsigned long ticks); 157extern void do_timer(unsigned long ticks);
158extern void update_wall_time(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3612fc77f834..08cb0c3b8ccb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -86,6 +86,7 @@ static void tick_do_update_jiffies64(ktime_t now)
86 tick_next_period = ktime_add(last_jiffies_update, tick_period); 86 tick_next_period = ktime_add(last_jiffies_update, tick_period);
87 } 87 }
88 write_sequnlock(&jiffies_lock); 88 write_sequnlock(&jiffies_lock);
89 update_wall_time();
89} 90}
90 91
91/* 92/*
@@ -177,7 +178,7 @@ static bool can_stop_full_tick(void)
177 * TODO: kick full dynticks CPUs when 178 * TODO: kick full dynticks CPUs when
178 * sched_clock_stable is set. 179 * sched_clock_stable is set.
179 */ 180 */
180 if (!sched_clock_stable) { 181 if (!sched_clock_stable()) {
181 trace_tick_stop(0, "unstable sched clock\n"); 182 trace_tick_stop(0, "unstable sched clock\n");
182 /* 183 /*
183 * Don't allow the user to think they can get 184 * Don't allow the user to think they can get
@@ -361,8 +362,8 @@ void __init tick_nohz_init(void)
361/* 362/*
362 * NO HZ enabled ? 363 * NO HZ enabled ?
363 */ 364 */
364int tick_nohz_enabled __read_mostly = 1; 365static int tick_nohz_enabled __read_mostly = 1;
365 366int tick_nohz_active __read_mostly;
366/* 367/*
367 * Enable / Disable tickless mode 368 * Enable / Disable tickless mode
368 */ 369 */
@@ -391,11 +392,9 @@ __setup("nohz=", setup_tick_nohz);
391 */ 392 */
392static void tick_nohz_update_jiffies(ktime_t now) 393static void tick_nohz_update_jiffies(ktime_t now)
393{ 394{
394 int cpu = smp_processor_id();
395 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
396 unsigned long flags; 395 unsigned long flags;
397 396
398 ts->idle_waketime = now; 397 __this_cpu_write(tick_cpu_sched.idle_waketime, now);
399 398
400 local_irq_save(flags); 399 local_irq_save(flags);
401 tick_do_update_jiffies64(now); 400 tick_do_update_jiffies64(now);
@@ -426,17 +425,15 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
426 425
427} 426}
428 427
429static void tick_nohz_stop_idle(int cpu, ktime_t now) 428static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
430{ 429{
431 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 430 update_ts_time_stats(smp_processor_id(), ts, now, NULL);
432
433 update_ts_time_stats(cpu, ts, now, NULL);
434 ts->idle_active = 0; 431 ts->idle_active = 0;
435 432
436 sched_clock_idle_wakeup_event(0); 433 sched_clock_idle_wakeup_event(0);
437} 434}
438 435
439static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 436static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
440{ 437{
441 ktime_t now = ktime_get(); 438 ktime_t now = ktime_get();
442 439
@@ -465,7 +462,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
465 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 462 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
466 ktime_t now, idle; 463 ktime_t now, idle;
467 464
468 if (!tick_nohz_enabled) 465 if (!tick_nohz_active)
469 return -1; 466 return -1;
470 467
471 now = ktime_get(); 468 now = ktime_get();
@@ -506,7 +503,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
506 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 503 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
507 ktime_t now, iowait; 504 ktime_t now, iowait;
508 505
509 if (!tick_nohz_enabled) 506 if (!tick_nohz_active)
510 return -1; 507 return -1;
511 508
512 now = ktime_get(); 509 now = ktime_get();
@@ -711,8 +708,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
711 return false; 708 return false;
712 } 709 }
713 710
714 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 711 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
712 ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
715 return false; 713 return false;
714 }
716 715
717 if (need_resched()) 716 if (need_resched())
718 return false; 717 return false;
@@ -752,7 +751,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
752 ktime_t now, expires; 751 ktime_t now, expires;
753 int cpu = smp_processor_id(); 752 int cpu = smp_processor_id();
754 753
755 now = tick_nohz_start_idle(cpu, ts); 754 now = tick_nohz_start_idle(ts);
756 755
757 if (can_stop_idle_tick(cpu, ts)) { 756 if (can_stop_idle_tick(cpu, ts)) {
758 int was_stopped = ts->tick_stopped; 757 int was_stopped = ts->tick_stopped;
@@ -799,11 +798,6 @@ void tick_nohz_idle_enter(void)
799 local_irq_disable(); 798 local_irq_disable();
800 799
801 ts = &__get_cpu_var(tick_cpu_sched); 800 ts = &__get_cpu_var(tick_cpu_sched);
802 /*
803 * set ts->inidle unconditionally. even if the system did not
804 * switch to nohz mode the cpu frequency governers rely on the
805 * update of the idle time accounting in tick_nohz_start_idle().
806 */
807 ts->inidle = 1; 801 ts->inidle = 1;
808 __tick_nohz_idle_enter(ts); 802 __tick_nohz_idle_enter(ts);
809 803
@@ -914,8 +908,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
914 */ 908 */
915void tick_nohz_idle_exit(void) 909void tick_nohz_idle_exit(void)
916{ 910{
917 int cpu = smp_processor_id(); 911 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
918 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
919 ktime_t now; 912 ktime_t now;
920 913
921 local_irq_disable(); 914 local_irq_disable();
@@ -928,7 +921,7 @@ void tick_nohz_idle_exit(void)
928 now = ktime_get(); 921 now = ktime_get();
929 922
930 if (ts->idle_active) 923 if (ts->idle_active)
931 tick_nohz_stop_idle(cpu, now); 924 tick_nohz_stop_idle(ts, now);
932 925
933 if (ts->tick_stopped) { 926 if (ts->tick_stopped) {
934 tick_nohz_restart_sched_tick(ts, now); 927 tick_nohz_restart_sched_tick(ts, now);
@@ -973,7 +966,7 @@ static void tick_nohz_switch_to_nohz(void)
973 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 966 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
974 ktime_t next; 967 ktime_t next;
975 968
976 if (!tick_nohz_enabled) 969 if (!tick_nohz_active)
977 return; 970 return;
978 971
979 local_irq_disable(); 972 local_irq_disable();
@@ -981,7 +974,7 @@ static void tick_nohz_switch_to_nohz(void)
981 local_irq_enable(); 974 local_irq_enable();
982 return; 975 return;
983 } 976 }
984 977 tick_nohz_active = 1;
985 ts->nohz_mode = NOHZ_MODE_LOWRES; 978 ts->nohz_mode = NOHZ_MODE_LOWRES;
986 979
987 /* 980 /*
@@ -1012,12 +1005,10 @@ static void tick_nohz_switch_to_nohz(void)
1012 * timer and do not touch the other magic bits which need to be done 1005 * timer and do not touch the other magic bits which need to be done
1013 * when idle is left. 1006 * when idle is left.
1014 */ 1007 */
1015static void tick_nohz_kick_tick(int cpu, ktime_t now) 1008static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
1016{ 1009{
1017#if 0 1010#if 0
1018 /* Switch back to 2.6.27 behaviour */ 1011 /* Switch back to 2.6.27 behaviour */
1019
1020 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1021 ktime_t delta; 1012 ktime_t delta;
1022 1013
1023 /* 1014 /*
@@ -1032,36 +1023,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now)
1032#endif 1023#endif
1033} 1024}
1034 1025
1035static inline void tick_check_nohz(int cpu) 1026static inline void tick_check_nohz_this_cpu(void)
1036{ 1027{
1037 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1028 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
1038 ktime_t now; 1029 ktime_t now;
1039 1030
1040 if (!ts->idle_active && !ts->tick_stopped) 1031 if (!ts->idle_active && !ts->tick_stopped)
1041 return; 1032 return;
1042 now = ktime_get(); 1033 now = ktime_get();
1043 if (ts->idle_active) 1034 if (ts->idle_active)
1044 tick_nohz_stop_idle(cpu, now); 1035 tick_nohz_stop_idle(ts, now);
1045 if (ts->tick_stopped) { 1036 if (ts->tick_stopped) {
1046 tick_nohz_update_jiffies(now); 1037 tick_nohz_update_jiffies(now);
1047 tick_nohz_kick_tick(cpu, now); 1038 tick_nohz_kick_tick(ts, now);
1048 } 1039 }
1049} 1040}
1050 1041
1051#else 1042#else
1052 1043
1053static inline void tick_nohz_switch_to_nohz(void) { } 1044static inline void tick_nohz_switch_to_nohz(void) { }
1054static inline void tick_check_nohz(int cpu) { } 1045static inline void tick_check_nohz_this_cpu(void) { }
1055 1046
1056#endif /* CONFIG_NO_HZ_COMMON */ 1047#endif /* CONFIG_NO_HZ_COMMON */
1057 1048
1058/* 1049/*
1059 * Called from irq_enter to notify about the possible interruption of idle() 1050 * Called from irq_enter to notify about the possible interruption of idle()
1060 */ 1051 */
1061void tick_check_idle(int cpu) 1052void tick_check_idle(void)
1062{ 1053{
1063 tick_check_oneshot_broadcast(cpu); 1054 tick_check_oneshot_broadcast_this_cpu();
1064 tick_check_nohz(cpu); 1055 tick_check_nohz_this_cpu();
1065} 1056}
1066 1057
1067/* 1058/*
@@ -1139,8 +1130,10 @@ void tick_setup_sched_timer(void)
1139 } 1130 }
1140 1131
1141#ifdef CONFIG_NO_HZ_COMMON 1132#ifdef CONFIG_NO_HZ_COMMON
1142 if (tick_nohz_enabled) 1133 if (tick_nohz_enabled) {
1143 ts->nohz_mode = NOHZ_MODE_HIGHRES; 1134 ts->nohz_mode = NOHZ_MODE_HIGHRES;
1135 tick_nohz_active = 1;
1136 }
1144#endif 1137#endif
1145} 1138}
1146#endif /* HIGH_RES_TIMERS */ 1139#endif /* HIGH_RES_TIMERS */
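Several hunks above replace per_cpu(tick_cpu_sched, smp_processor_id()) lookups with __get_cpu_var()/__this_cpu_write(), reaching the current CPU's state without recomputing the CPU id. A small, hedged illustration of the two equivalent forms; the per-cpu variable here is a made-up copy, not the real tick_cpu_sched.

DEFINE_PER_CPU(struct tick_sched, demo_tick_cpu_sched);	/* illustrative only */

/* Caller must have preemption disabled, as in the interrupt paths above. */
static void demo_mark_waketime(ktime_t now)
{
	/* Old form: compute the CPU id, then index the per-cpu area. */
	per_cpu(demo_tick_cpu_sched, smp_processor_id()).idle_waketime = now;

	/* New form: address this CPU's instance directly. */
	__this_cpu_write(demo_tick_cpu_sched.idle_waketime, now);
}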
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3abf53418b67..0aa4ce81bc16 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -77,7 +77,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
77 tk->wall_to_monotonic = wtm; 77 tk->wall_to_monotonic = wtm;
78 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 78 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
79 tk->offs_real = timespec_to_ktime(tmp); 79 tk->offs_real = timespec_to_ktime(tmp);
80 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); 80 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
81} 81}
82 82
83static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 83static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
@@ -90,8 +90,9 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
90} 90}
91 91
92/** 92/**
93 * timekeeper_setup_internals - Set up internals to use clocksource clock. 93 * tk_setup_internals - Set up internals to use clocksource clock.
94 * 94 *
95 * @tk: The target timekeeper to setup.
95 * @clock: Pointer to clocksource. 96 * @clock: Pointer to clocksource.
96 * 97 *
97 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment 98 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
@@ -595,7 +596,7 @@ s32 timekeeping_get_tai_offset(void)
595static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) 596static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
596{ 597{
597 tk->tai_offset = tai_offset; 598 tk->tai_offset = tai_offset;
598 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); 599 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
599} 600}
600 601
601/** 602/**
@@ -610,6 +611,7 @@ void timekeeping_set_tai_offset(s32 tai_offset)
610 raw_spin_lock_irqsave(&timekeeper_lock, flags); 611 raw_spin_lock_irqsave(&timekeeper_lock, flags);
611 write_seqcount_begin(&timekeeper_seq); 612 write_seqcount_begin(&timekeeper_seq);
612 __timekeeping_set_tai_offset(tk, tai_offset); 613 __timekeeping_set_tai_offset(tk, tai_offset);
614 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
613 write_seqcount_end(&timekeeper_seq); 615 write_seqcount_end(&timekeeper_seq);
614 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 616 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
615 clock_was_set(); 617 clock_was_set();
@@ -1023,6 +1025,8 @@ static int timekeeping_suspend(void)
1023 timekeeping_suspend_time = 1025 timekeeping_suspend_time =
1024 timespec_add(timekeeping_suspend_time, delta_delta); 1026 timespec_add(timekeeping_suspend_time, delta_delta);
1025 } 1027 }
1028
1029 timekeeping_update(tk, TK_MIRROR);
1026 write_seqcount_end(&timekeeper_seq); 1030 write_seqcount_end(&timekeeper_seq);
1027 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1031 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1028 1032
@@ -1130,16 +1134,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1130 * we can adjust by 1. 1134 * we can adjust by 1.
1131 */ 1135 */
1132 error >>= 2; 1136 error >>= 2;
1133 /*
1134 * XXX - In update_wall_time, we round up to the next
1135 * nanosecond, and store the amount rounded up into
1136 * the error. This causes the likely below to be unlikely.
1137 *
1138 * The proper fix is to avoid rounding up by using
1139 * the high precision tk->xtime_nsec instead of
1140 * xtime.tv_nsec everywhere. Fixing this will take some
1141 * time.
1142 */
1143 if (likely(error <= interval)) 1137 if (likely(error <= interval))
1144 adj = 1; 1138 adj = 1;
1145 else 1139 else
@@ -1255,7 +1249,7 @@ out_adjust:
1255static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) 1249static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1256{ 1250{
1257 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1251 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1258 unsigned int action = 0; 1252 unsigned int clock_set = 0;
1259 1253
1260 while (tk->xtime_nsec >= nsecps) { 1254 while (tk->xtime_nsec >= nsecps) {
1261 int leap; 1255 int leap;
@@ -1277,11 +1271,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1277 1271
1278 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1272 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1279 1273
1280 clock_was_set_delayed(); 1274 clock_set = TK_CLOCK_WAS_SET;
1281 action = TK_CLOCK_WAS_SET;
1282 } 1275 }
1283 } 1276 }
1284 return action; 1277 return clock_set;
1285} 1278}
1286 1279
1287/** 1280/**
@@ -1294,7 +1287,8 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1294 * Returns the unconsumed cycles. 1287 * Returns the unconsumed cycles.
1295 */ 1288 */
1296static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, 1289static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1297 u32 shift) 1290 u32 shift,
1291 unsigned int *clock_set)
1298{ 1292{
1299 cycle_t interval = tk->cycle_interval << shift; 1293 cycle_t interval = tk->cycle_interval << shift;
1300 u64 raw_nsecs; 1294 u64 raw_nsecs;
@@ -1308,7 +1302,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1308 tk->cycle_last += interval; 1302 tk->cycle_last += interval;
1309 1303
1310 tk->xtime_nsec += tk->xtime_interval << shift; 1304 tk->xtime_nsec += tk->xtime_interval << shift;
1311 accumulate_nsecs_to_secs(tk); 1305 *clock_set |= accumulate_nsecs_to_secs(tk);
1312 1306
1313 /* Accumulate raw time */ 1307 /* Accumulate raw time */
1314 raw_nsecs = (u64)tk->raw_interval << shift; 1308 raw_nsecs = (u64)tk->raw_interval << shift;
@@ -1347,7 +1341,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1347 tk->xtime_nsec -= remainder; 1341 tk->xtime_nsec -= remainder;
1348 tk->xtime_nsec += 1ULL << tk->shift; 1342 tk->xtime_nsec += 1ULL << tk->shift;
1349 tk->ntp_error += remainder << tk->ntp_error_shift; 1343 tk->ntp_error += remainder << tk->ntp_error_shift;
1350 1344 tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
1351} 1345}
1352#else 1346#else
1353#define old_vsyscall_fixup(tk) 1347#define old_vsyscall_fixup(tk)
@@ -1359,14 +1353,14 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1359 * update_wall_time - Uses the current clocksource to increment the wall time 1353 * update_wall_time - Uses the current clocksource to increment the wall time
1360 * 1354 *
1361 */ 1355 */
1362static void update_wall_time(void) 1356void update_wall_time(void)
1363{ 1357{
1364 struct clocksource *clock; 1358 struct clocksource *clock;
1365 struct timekeeper *real_tk = &timekeeper; 1359 struct timekeeper *real_tk = &timekeeper;
1366 struct timekeeper *tk = &shadow_timekeeper; 1360 struct timekeeper *tk = &shadow_timekeeper;
1367 cycle_t offset; 1361 cycle_t offset;
1368 int shift = 0, maxshift; 1362 int shift = 0, maxshift;
1369 unsigned int action; 1363 unsigned int clock_set = 0;
1370 unsigned long flags; 1364 unsigned long flags;
1371 1365
1372 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1366 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1401,7 +1395,8 @@ static void update_wall_time(void)
1401 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1395 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1402 shift = min(shift, maxshift); 1396 shift = min(shift, maxshift);
1403 while (offset >= tk->cycle_interval) { 1397 while (offset >= tk->cycle_interval) {
1404 offset = logarithmic_accumulation(tk, offset, shift); 1398 offset = logarithmic_accumulation(tk, offset, shift,
1399 &clock_set);
1405 if (offset < tk->cycle_interval<<shift) 1400 if (offset < tk->cycle_interval<<shift)
1406 shift--; 1401 shift--;
1407 } 1402 }
@@ -1419,7 +1414,7 @@ static void update_wall_time(void)
1419 * Finally, make sure that after the rounding 1414 * Finally, make sure that after the rounding
1420 * xtime_nsec isn't larger than NSEC_PER_SEC 1415 * xtime_nsec isn't larger than NSEC_PER_SEC
1421 */ 1416 */
1422 action = accumulate_nsecs_to_secs(tk); 1417 clock_set |= accumulate_nsecs_to_secs(tk);
1423 1418
1424 write_seqcount_begin(&timekeeper_seq); 1419 write_seqcount_begin(&timekeeper_seq);
1425 /* Update clock->cycle_last with the new value */ 1420 /* Update clock->cycle_last with the new value */
@@ -1435,10 +1430,12 @@ static void update_wall_time(void)
1435 * updating. 1430 * updating.
1436 */ 1431 */
1437 memcpy(real_tk, tk, sizeof(*tk)); 1432 memcpy(real_tk, tk, sizeof(*tk));
1438 timekeeping_update(real_tk, action); 1433 timekeeping_update(real_tk, clock_set);
1439 write_seqcount_end(&timekeeper_seq); 1434 write_seqcount_end(&timekeeper_seq);
1440out: 1435out:
1441 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1436 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1437 if (clock_set)
1438 clock_was_set();
1442} 1439}
1443 1440
1444/** 1441/**
@@ -1583,7 +1580,6 @@ struct timespec get_monotonic_coarse(void)
1583void do_timer(unsigned long ticks) 1580void do_timer(unsigned long ticks)
1584{ 1581{
1585 jiffies_64 += ticks; 1582 jiffies_64 += ticks;
1586 update_wall_time();
1587 calc_global_load(ticks); 1583 calc_global_load(ticks);
1588} 1584}
1589 1585
@@ -1698,12 +1694,14 @@ int do_adjtimex(struct timex *txc)
1698 1694
1699 if (tai != orig_tai) { 1695 if (tai != orig_tai) {
1700 __timekeeping_set_tai_offset(tk, tai); 1696 __timekeeping_set_tai_offset(tk, tai);
1701 update_pvclock_gtod(tk, true); 1697 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1702 clock_was_set_delayed();
1703 } 1698 }
1704 write_seqcount_end(&timekeeper_seq); 1699 write_seqcount_end(&timekeeper_seq);
1705 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1700 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1706 1701
1702 if (tai != orig_tai)
1703 clock_was_set();
1704
1707 ntp_notify_cmos_timer(); 1705 ntp_notify_cmos_timer();
1708 1706
1709 return ret; 1707 return ret;
@@ -1739,4 +1737,5 @@ void xtime_update(unsigned long ticks)
1739 write_seqlock(&jiffies_lock); 1737 write_seqlock(&jiffies_lock);
1740 do_timer(ticks); 1738 do_timer(ticks);
1741 write_sequnlock(&jiffies_lock); 1739 write_sequnlock(&jiffies_lock);
1740 update_wall_time();
1742} 1741}
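Taken together, the timekeeping hunks converge on one pattern: accumulate TK_CLOCK_WAS_SET indications into a local clock_set flag while timekeeper_lock and the write seqcount are held, and only call clock_was_set() once both are dropped. A condensed, hedged sketch of that shape, abbreviated and not a drop-in replacement for update_wall_time():

/* Sketch: collect the flag under the locks, notify after releasing them. */
static void sketch_update_and_notify(void)
{
	unsigned int clock_set = 0;
	unsigned long flags;

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&timekeeper_seq);

	clock_set |= accumulate_nsecs_to_secs(&shadow_timekeeper);

	write_seqcount_end(&timekeeper_seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

	if (clock_set)
		clock_was_set();	/* must run outside the write seqcount */
}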
diff --git a/kernel/timer.c b/kernel/timer.c
index 6582b82fa966..accfd241b9e5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu)
1518 /* 1518 /*
1519 * The APs use this path later in boot 1519 * The APs use this path later in boot
1520 */ 1520 */
1521 base = kmalloc_node(sizeof(*base), 1521 base = kzalloc_node(sizeof(*base), GFP_KERNEL,
1522 GFP_KERNEL | __GFP_ZERO, 1522 cpu_to_node(cpu));
1523 cpu_to_node(cpu));
1524 if (!base) 1523 if (!base)
1525 return -ENOMEM; 1524 return -ENOMEM;
1526 1525
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d7e2068e4b71..1378e84fbe39 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -50,6 +50,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
50obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o 50obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
51endif 51endif
52obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 52obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
53obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
53obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
54obj-$(CONFIG_TRACEPOINTS) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
55ifeq ($(CONFIG_PM_RUNTIME),y) 56ifeq ($(CONFIG_PM_RUNTIME),y)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 22fa55696760..cd7f76d1eb86 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -85,6 +85,8 @@ int function_trace_stop __read_mostly;
85 85
86/* Current function tracing op */ 86/* Current function tracing op */
87struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; 87struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
88/* What to set function_trace_op to */
89static struct ftrace_ops *set_function_trace_op;
88 90
89/* List for set_ftrace_pid's pids. */ 91/* List for set_ftrace_pid's pids. */
90LIST_HEAD(ftrace_pids); 92LIST_HEAD(ftrace_pids);
@@ -278,6 +280,29 @@ static void update_global_ops(void)
278 global_ops.func = func; 280 global_ops.func = func;
279} 281}
280 282
283static void ftrace_sync(struct work_struct *work)
284{
285 /*
286 * This function is just a stub to implement a hard force
287 * of synchronize_sched(). This requires synchronizing
288 * tasks even in userspace and idle.
289 *
290 * Yes, function tracing is rude.
291 */
292}
293
294static void ftrace_sync_ipi(void *data)
295{
296 /* Probably not needed, but do it anyway */
297 smp_rmb();
298}
299
300#ifdef CONFIG_FUNCTION_GRAPH_TRACER
301static void update_function_graph_func(void);
302#else
303static inline void update_function_graph_func(void) { }
304#endif
305
281static void update_ftrace_function(void) 306static void update_ftrace_function(void)
282{ 307{
283 ftrace_func_t func; 308 ftrace_func_t func;
@@ -296,16 +321,61 @@ static void update_ftrace_function(void)
296 !FTRACE_FORCE_LIST_FUNC)) { 321 !FTRACE_FORCE_LIST_FUNC)) {
297 /* Set the ftrace_ops that the arch callback uses */ 322 /* Set the ftrace_ops that the arch callback uses */
298 if (ftrace_ops_list == &global_ops) 323 if (ftrace_ops_list == &global_ops)
299 function_trace_op = ftrace_global_list; 324 set_function_trace_op = ftrace_global_list;
300 else 325 else
301 function_trace_op = ftrace_ops_list; 326 set_function_trace_op = ftrace_ops_list;
302 func = ftrace_ops_list->func; 327 func = ftrace_ops_list->func;
303 } else { 328 } else {
304 /* Just use the default ftrace_ops */ 329 /* Just use the default ftrace_ops */
305 function_trace_op = &ftrace_list_end; 330 set_function_trace_op = &ftrace_list_end;
306 func = ftrace_ops_list_func; 331 func = ftrace_ops_list_func;
307 } 332 }
308 333
334 /* If there's no change, then do nothing more here */
335 if (ftrace_trace_function == func)
336 return;
337
338 update_function_graph_func();
339
340 /*
341 * If we are using the list function, it doesn't care
342 * about the function_trace_ops.
343 */
344 if (func == ftrace_ops_list_func) {
345 ftrace_trace_function = func;
346 /*
347 * Don't even bother setting function_trace_ops,
348 * it would be racy to do so anyway.
349 */
350 return;
351 }
352
353#ifndef CONFIG_DYNAMIC_FTRACE
354 /*
355 * For static tracing, we need to be a bit more careful.
356 * The function change takes effect immediately. Thus,
357 * we need to coordinate the setting of the function_trace_ops
358 * with the setting of the ftrace_trace_function.
359 *
360 * Set the function to the list ops, which will call the
361 * function we want, albeit indirectly, but it handles the
362 * ftrace_ops and doesn't depend on function_trace_op.
363 */
364 ftrace_trace_function = ftrace_ops_list_func;
365 /*
366 * Make sure all CPUs see this. Yes this is slow, but static
367 * tracing is slow and nasty to have enabled.
368 */
369 schedule_on_each_cpu(ftrace_sync);
370 /* Now all cpus are using the list ops. */
371 function_trace_op = set_function_trace_op;
372 /* Make sure the function_trace_op is visible on all CPUs */
373 smp_wmb();
374 /* Nasty way to force a rmb on all cpus */
375 smp_call_function(ftrace_sync_ipi, NULL, 1);
376 /* OK, we are all set to update the ftrace_trace_function now! */
377#endif /* !CONFIG_DYNAMIC_FTRACE */
378
309 ftrace_trace_function = func; 379 ftrace_trace_function = func;
310} 380}
311 381
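The #ifndef CONFIG_DYNAMIC_FTRACE branch above publishes the new function_trace_op with an smp_wmb() and then forces every CPU through an IPI so the store is globally visible before the final function pointer flip. Stripped of the ftrace specifics, the hedged generic shape of that publish-then-kick step looks like this; the helper name and the void-pointer slot are illustrative.

/* Generic sketch of the publish-then-kick ordering used above. */
static void publish_pointer(void **slot, void *new_val)
{
	*slot = new_val;	/* 1. store the new pointer              */
	smp_wmb();		/* 2. order the store before the kick    */

	/* 3. IPI every other CPU; the interrupt acts as a barrier there.
	 *    Must not be called with interrupts disabled. */
	smp_call_function(ftrace_sync_ipi, NULL, 1);
}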
@@ -367,9 +437,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
367 437
368static int __register_ftrace_function(struct ftrace_ops *ops) 438static int __register_ftrace_function(struct ftrace_ops *ops)
369{ 439{
370 if (unlikely(ftrace_disabled))
371 return -ENODEV;
372
373 if (FTRACE_WARN_ON(ops == &global_ops)) 440 if (FTRACE_WARN_ON(ops == &global_ops))
374 return -EINVAL; 441 return -EINVAL;
375 442
@@ -413,24 +480,10 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
413 return 0; 480 return 0;
414} 481}
415 482
416static void ftrace_sync(struct work_struct *work)
417{
418 /*
419 * This function is just a stub to implement a hard force
420 * of synchronize_sched(). This requires synchronizing
421 * tasks even in userspace and idle.
422 *
423 * Yes, function tracing is rude.
424 */
425}
426
427static int __unregister_ftrace_function(struct ftrace_ops *ops) 483static int __unregister_ftrace_function(struct ftrace_ops *ops)
428{ 484{
429 int ret; 485 int ret;
430 486
431 if (ftrace_disabled)
432 return -ENODEV;
433
434 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 487 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
435 return -EBUSY; 488 return -EBUSY;
436 489
@@ -445,20 +498,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
445 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { 498 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
446 ret = remove_ftrace_list_ops(&ftrace_control_list, 499 ret = remove_ftrace_list_ops(&ftrace_control_list,
447 &control_ops, ops); 500 &control_ops, ops);
448 if (!ret) {
449 /*
450 * The ftrace_ops is now removed from the list,
451 * so there'll be no new users. We must ensure
452 * all current users are done before we free
453 * the control data.
454 * Note synchronize_sched() is not enough, as we
455 * use preempt_disable() to do RCU, but the function
456 * tracer can be called where RCU is not active
457 * (before user_exit()).
458 */
459 schedule_on_each_cpu(ftrace_sync);
460 control_ops_free(ops);
461 }
462 } else 501 } else
463 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 502 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
464 503
@@ -468,17 +507,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
468 if (ftrace_enabled) 507 if (ftrace_enabled)
469 update_ftrace_function(); 508 update_ftrace_function();
470 509
471 /*
472 * Dynamic ops may be freed, we must make sure that all
473 * callers are done before leaving this function.
474 *
475 * Again, normal synchronize_sched() is not good enough.
476 * We need to do a hard force of sched synchronization.
477 */
478 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
479 schedule_on_each_cpu(ftrace_sync);
480
481
482 return 0; 510 return 0;
483} 511}
484 512
@@ -781,7 +809,7 @@ static int ftrace_profile_init(void)
781 int cpu; 809 int cpu;
782 int ret = 0; 810 int ret = 0;
783 811
784 for_each_online_cpu(cpu) { 812 for_each_possible_cpu(cpu) {
785 ret = ftrace_profile_init_cpu(cpu); 813 ret = ftrace_profile_init_cpu(cpu);
786 if (ret) 814 if (ret)
787 break; 815 break;
@@ -1088,19 +1116,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1088 1116
1089static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1117static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1090 1118
1091loff_t
1092ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
1093{
1094 loff_t ret;
1095
1096 if (file->f_mode & FMODE_READ)
1097 ret = seq_lseek(file, offset, whence);
1098 else
1099 file->f_pos = ret = 1;
1100
1101 return ret;
1102}
1103
1104#ifdef CONFIG_DYNAMIC_FTRACE 1119#ifdef CONFIG_DYNAMIC_FTRACE
1105 1120
1106#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1121#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1998,8 +2013,14 @@ void ftrace_modify_all_code(int command)
1998 else if (command & FTRACE_DISABLE_CALLS) 2013 else if (command & FTRACE_DISABLE_CALLS)
1999 ftrace_replace_code(0); 2014 ftrace_replace_code(0);
2000 2015
2001 if (update && ftrace_trace_function != ftrace_ops_list_func) 2016 if (update && ftrace_trace_function != ftrace_ops_list_func) {
2017 function_trace_op = set_function_trace_op;
2018 smp_wmb();
2019 /* If irqs are disabled, we are in stop machine */
2020 if (!irqs_disabled())
2021 smp_call_function(ftrace_sync_ipi, NULL, 1);
2002 ftrace_update_ftrace_func(ftrace_trace_function); 2022 ftrace_update_ftrace_func(ftrace_trace_function);
2023 }
2003 2024
2004 if (command & FTRACE_START_FUNC_RET) 2025 if (command & FTRACE_START_FUNC_RET)
2005 ftrace_enable_ftrace_graph_caller(); 2026 ftrace_enable_ftrace_graph_caller();
@@ -2088,10 +2109,15 @@ static void ftrace_startup_enable(int command)
2088static int ftrace_startup(struct ftrace_ops *ops, int command) 2109static int ftrace_startup(struct ftrace_ops *ops, int command)
2089{ 2110{
2090 bool hash_enable = true; 2111 bool hash_enable = true;
2112 int ret;
2091 2113
2092 if (unlikely(ftrace_disabled)) 2114 if (unlikely(ftrace_disabled))
2093 return -ENODEV; 2115 return -ENODEV;
2094 2116
2117 ret = __register_ftrace_function(ops);
2118 if (ret)
2119 return ret;
2120
2095 ftrace_start_up++; 2121 ftrace_start_up++;
2096 command |= FTRACE_UPDATE_CALLS; 2122 command |= FTRACE_UPDATE_CALLS;
2097 2123
@@ -2113,12 +2139,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2113 return 0; 2139 return 0;
2114} 2140}
2115 2141
2116static void ftrace_shutdown(struct ftrace_ops *ops, int command) 2142static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2117{ 2143{
2118 bool hash_disable = true; 2144 bool hash_disable = true;
2145 int ret;
2119 2146
2120 if (unlikely(ftrace_disabled)) 2147 if (unlikely(ftrace_disabled))
2121 return; 2148 return -ENODEV;
2149
2150 ret = __unregister_ftrace_function(ops);
2151 if (ret)
2152 return ret;
2122 2153
2123 ftrace_start_up--; 2154 ftrace_start_up--;
2124 /* 2155 /*
@@ -2152,10 +2183,42 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
2152 command |= FTRACE_UPDATE_TRACE_FUNC; 2183 command |= FTRACE_UPDATE_TRACE_FUNC;
2153 } 2184 }
2154 2185
2155 if (!command || !ftrace_enabled) 2186 if (!command || !ftrace_enabled) {
2156 return; 2187 /*
2188 * If these are control ops, they still need their
2189 * per_cpu field freed. Since, function tracing is
2190 * not currently active, we can just free them
2191 * without synchronizing all CPUs.
2192 */
2193 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2194 control_ops_free(ops);
2195 return 0;
2196 }
2157 2197
2158 ftrace_run_update_code(command); 2198 ftrace_run_update_code(command);
2199
2200 /*
2201 * Dynamic ops may be freed, we must make sure that all
2202 * callers are done before leaving this function.
2203 * The same goes for freeing the per_cpu data of the control
2204 * ops.
2205 *
2206 * Again, normal synchronize_sched() is not good enough.
2207 * We need to do a hard force of sched synchronization.
2208 * This is because we use preempt_disable() to do RCU, but
2209 * the function tracers can be called where RCU is not watching
2210 * (like before user_exit()). We cannot rely on the RCU
2211 * infrastructure to do the synchronization, thus we must do it
2212 * ourselves.
2213 */
2214 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
2215 schedule_on_each_cpu(ftrace_sync);
2216
2217 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2218 control_ops_free(ops);
2219 }
2220
2221 return 0;
2159} 2222}
2160 2223
2161static void ftrace_startup_sysctl(void) 2224static void ftrace_startup_sysctl(void)
@@ -2734,7 +2797,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2734 * routine, you can use ftrace_filter_write() for the write 2797 * routine, you can use ftrace_filter_write() for the write
2735 * routine if @flag has FTRACE_ITER_FILTER set, or 2798 * routine if @flag has FTRACE_ITER_FILTER set, or
2736 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2799 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2737 * ftrace_filter_lseek() should be used as the lseek routine, and 2800 * tracing_lseek() should be used as the lseek routine, and
2738 * release must call ftrace_regex_release(). 2801 * release must call ftrace_regex_release().
2739 */ 2802 */
2740int 2803int
@@ -3060,16 +3123,13 @@ static void __enable_ftrace_function_probe(void)
3060 if (i == FTRACE_FUNC_HASHSIZE) 3123 if (i == FTRACE_FUNC_HASHSIZE)
3061 return; 3124 return;
3062 3125
3063 ret = __register_ftrace_function(&trace_probe_ops); 3126 ret = ftrace_startup(&trace_probe_ops, 0);
3064 if (!ret)
3065 ret = ftrace_startup(&trace_probe_ops, 0);
3066 3127
3067 ftrace_probe_registered = 1; 3128 ftrace_probe_registered = 1;
3068} 3129}
3069 3130
3070static void __disable_ftrace_function_probe(void) 3131static void __disable_ftrace_function_probe(void)
3071{ 3132{
3072 int ret;
3073 int i; 3133 int i;
3074 3134
3075 if (!ftrace_probe_registered) 3135 if (!ftrace_probe_registered)
@@ -3082,9 +3142,7 @@ static void __disable_ftrace_function_probe(void)
3082 } 3142 }
3083 3143
3084 /* no more funcs left */ 3144 /* no more funcs left */
3085 ret = __unregister_ftrace_function(&trace_probe_ops); 3145 ftrace_shutdown(&trace_probe_ops, 0);
3086 if (!ret)
3087 ftrace_shutdown(&trace_probe_ops, 0);
3088 3146
3089 ftrace_probe_registered = 0; 3147 ftrace_probe_registered = 0;
3090} 3148}
@@ -3767,7 +3825,7 @@ static const struct file_operations ftrace_filter_fops = {
3767 .open = ftrace_filter_open, 3825 .open = ftrace_filter_open,
3768 .read = seq_read, 3826 .read = seq_read,
3769 .write = ftrace_filter_write, 3827 .write = ftrace_filter_write,
3770 .llseek = ftrace_filter_lseek, 3828 .llseek = tracing_lseek,
3771 .release = ftrace_regex_release, 3829 .release = ftrace_regex_release,
3772}; 3830};
3773 3831
@@ -3775,7 +3833,7 @@ static const struct file_operations ftrace_notrace_fops = {
3775 .open = ftrace_notrace_open, 3833 .open = ftrace_notrace_open,
3776 .read = seq_read, 3834 .read = seq_read,
3777 .write = ftrace_notrace_write, 3835 .write = ftrace_notrace_write,
3778 .llseek = ftrace_filter_lseek, 3836 .llseek = tracing_lseek,
3779 .release = ftrace_regex_release, 3837 .release = ftrace_regex_release,
3780}; 3838};
3781 3839
@@ -4038,7 +4096,7 @@ static const struct file_operations ftrace_graph_fops = {
4038 .open = ftrace_graph_open, 4096 .open = ftrace_graph_open,
4039 .read = seq_read, 4097 .read = seq_read,
4040 .write = ftrace_graph_write, 4098 .write = ftrace_graph_write,
4041 .llseek = ftrace_filter_lseek, 4099 .llseek = tracing_lseek,
4042 .release = ftrace_graph_release, 4100 .release = ftrace_graph_release,
4043}; 4101};
4044 4102
@@ -4046,7 +4104,7 @@ static const struct file_operations ftrace_graph_notrace_fops = {
4046 .open = ftrace_graph_notrace_open, 4104 .open = ftrace_graph_notrace_open,
4047 .read = seq_read, 4105 .read = seq_read,
4048 .write = ftrace_graph_write, 4106 .write = ftrace_graph_write,
4049 .llseek = ftrace_filter_lseek, 4107 .llseek = tracing_lseek,
4050 .release = ftrace_graph_release, 4108 .release = ftrace_graph_release,
4051}; 4109};
4052#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4110#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -4366,12 +4424,15 @@ core_initcall(ftrace_nodyn_init);
4366static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4424static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4367static inline void ftrace_startup_enable(int command) { } 4425static inline void ftrace_startup_enable(int command) { }
4368/* Keep as macros so we do not need to define the commands */ 4426/* Keep as macros so we do not need to define the commands */
4369# define ftrace_startup(ops, command) \ 4427# define ftrace_startup(ops, command) \
4370 ({ \ 4428 ({ \
4371 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ 4429 int ___ret = __register_ftrace_function(ops); \
4372 0; \ 4430 if (!___ret) \
4431 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
4432 ___ret; \
4373 }) 4433 })
4374# define ftrace_shutdown(ops, command) do { } while (0) 4434# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
4435
4375# define ftrace_startup_sysctl() do { } while (0) 4436# define ftrace_startup_sysctl() do { } while (0)
4376# define ftrace_shutdown_sysctl() do { } while (0) 4437# define ftrace_shutdown_sysctl() do { } while (0)
4377 4438
@@ -4716,7 +4777,7 @@ static const struct file_operations ftrace_pid_fops = {
4716 .open = ftrace_pid_open, 4777 .open = ftrace_pid_open,
4717 .write = ftrace_pid_write, 4778 .write = ftrace_pid_write,
4718 .read = seq_read, 4779 .read = seq_read,
4719 .llseek = ftrace_filter_lseek, 4780 .llseek = tracing_lseek,
4720 .release = ftrace_pid_release, 4781 .release = ftrace_pid_release,
4721}; 4782};
4722 4783
@@ -4780,9 +4841,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
4780 4841
4781 mutex_lock(&ftrace_lock); 4842 mutex_lock(&ftrace_lock);
4782 4843
4783 ret = __register_ftrace_function(ops); 4844 ret = ftrace_startup(ops, 0);
4784 if (!ret)
4785 ret = ftrace_startup(ops, 0);
4786 4845
4787 mutex_unlock(&ftrace_lock); 4846 mutex_unlock(&ftrace_lock);
4788 4847
@@ -4801,9 +4860,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
4801 int ret; 4860 int ret;
4802 4861
4803 mutex_lock(&ftrace_lock); 4862 mutex_lock(&ftrace_lock);
4804 ret = __unregister_ftrace_function(ops); 4863 ret = ftrace_shutdown(ops, 0);
4805 if (!ret)
4806 ftrace_shutdown(ops, 0);
4807 mutex_unlock(&ftrace_lock); 4864 mutex_unlock(&ftrace_lock);
4808 4865
4809 return ret; 4866 return ret;
@@ -4863,6 +4920,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
4863trace_func_graph_ret_t ftrace_graph_return = 4920trace_func_graph_ret_t ftrace_graph_return =
4864 (trace_func_graph_ret_t)ftrace_stub; 4921 (trace_func_graph_ret_t)ftrace_stub;
4865trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; 4922trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
4923static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
4866 4924
4867/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ 4925/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
4868static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) 4926static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -4997,6 +5055,37 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
4997 return NOTIFY_DONE; 5055 return NOTIFY_DONE;
4998} 5056}
4999 5057
 5058/* Just a placeholder for function graph */
5059static struct ftrace_ops fgraph_ops __read_mostly = {
5060 .func = ftrace_stub,
5061 .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
5062 FTRACE_OPS_FL_RECURSION_SAFE,
5063};
5064
5065static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
5066{
5067 if (!ftrace_ops_test(&global_ops, trace->func, NULL))
5068 return 0;
5069 return __ftrace_graph_entry(trace);
5070}
5071
5072/*
5073 * The function graph tracer should only trace the functions defined
5074 * by set_ftrace_filter and set_ftrace_notrace. If another function
5075 * tracer ops is registered, the graph tracer requires testing the
 5076 * function against the global ops, rather than tracing any function
 5077 * that any ftrace_ops has registered.
5078 */
5079static void update_function_graph_func(void)
5080{
5081 if (ftrace_ops_list == &ftrace_list_end ||
5082 (ftrace_ops_list == &global_ops &&
5083 global_ops.next == &ftrace_list_end))
5084 ftrace_graph_entry = __ftrace_graph_entry;
5085 else
5086 ftrace_graph_entry = ftrace_graph_entry_test;
5087}
5088
5000int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5089int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5001 trace_func_graph_ent_t entryfunc) 5090 trace_func_graph_ent_t entryfunc)
5002{ 5091{
@@ -5021,9 +5110,18 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5021 } 5110 }
5022 5111
5023 ftrace_graph_return = retfunc; 5112 ftrace_graph_return = retfunc;
5024 ftrace_graph_entry = entryfunc;
5025 5113
5026 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 5114 /*
5115 * Update the indirect function to the entryfunc, and the
5116 * function that gets called to the entry_test first. Then
5117 * call the update fgraph entry function to determine if
5118 * the entryfunc should be called directly or not.
5119 */
5120 __ftrace_graph_entry = entryfunc;
5121 ftrace_graph_entry = ftrace_graph_entry_test;
5122 update_function_graph_func();
5123
5124 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
5027 5125
5028out: 5126out:
5029 mutex_unlock(&ftrace_lock); 5127 mutex_unlock(&ftrace_lock);
@@ -5040,7 +5138,8 @@ void unregister_ftrace_graph(void)
5040 ftrace_graph_active--; 5138 ftrace_graph_active--;
5041 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5139 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5042 ftrace_graph_entry = ftrace_graph_entry_stub; 5140 ftrace_graph_entry = ftrace_graph_entry_stub;
5043 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5141 __ftrace_graph_entry = ftrace_graph_entry_stub;
5142 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
5044 unregister_pm_notifier(&ftrace_suspend_notifier); 5143 unregister_pm_notifier(&ftrace_suspend_notifier);
5045 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5144 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5046 5145
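
The ftrace.c hunks above fold the old __register_ftrace_function()/__unregister_ftrace_function() calls into ftrace_startup()/ftrace_shutdown(), and move the function-graph tracer off global_ops onto a dedicated fgraph_ops stub, with ftrace_graph_entry_test() consulting the global filter only while other ftrace_ops are registered. As a reading aid, here is a minimal sketch of a caller of the consolidated API; it is not part of the patch, the my_* names are invented, and the callback signature is the one this kernel series uses (ip, parent_ip, ops, regs):

#include <linux/ftrace.h>
#include <linux/module.h>

/* Hypothetical callback: invoked for every traced function. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *ops, struct pt_regs *regs)
{
}

static struct ftrace_ops my_ops = {
	.func	= my_trace_func,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};

static int __init my_init(void)
{
	/*
	 * register_ftrace_function() now ends up in ftrace_startup(),
	 * which also performs the list insertion that
	 * __register_ftrace_function() used to do as a separate step.
	 */
	return register_ftrace_function(&my_ops);
}

static void __exit my_exit(void)
{
	/* ftrace_shutdown() likewise unregisters and disables in one step. */
	unregister_ftrace_function(&my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
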
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cc2f66f68dc5..294b8a271a04 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2558 if (unlikely(test_time_stamp(delta))) { 2558 if (unlikely(test_time_stamp(delta))) {
2559 int local_clock_stable = 1; 2559 int local_clock_stable = 1;
2560#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 2560#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2561 local_clock_stable = sched_clock_stable; 2561 local_clock_stable = sched_clock_stable();
2562#endif 2562#endif
2563 WARN_ONCE(delta > (1ULL << 59), 2563 WARN_ONCE(delta > (1ULL << 59),
2564 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", 2564 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d20cd9743ef..20c755e018ca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -595,6 +595,28 @@ void free_snapshot(struct trace_array *tr)
595} 595}
596 596
597/** 597/**
598 * tracing_alloc_snapshot - allocate snapshot buffer.
599 *
600 * This only allocates the snapshot buffer if it isn't already
601 * allocated - it doesn't also take a snapshot.
602 *
603 * This is meant to be used in cases where the snapshot buffer needs
604 * to be set up for events that can't sleep but need to be able to
605 * trigger a snapshot.
606 */
607int tracing_alloc_snapshot(void)
608{
609 struct trace_array *tr = &global_trace;
610 int ret;
611
612 ret = alloc_snapshot(tr);
613 WARN_ON(ret < 0);
614
615 return ret;
616}
617EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
618
619/**
598 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. 620 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
599 * 621 *
600 * This is similar to trace_snapshot(), but it will allocate the 622 * This is similar to trace_snapshot(), but it will allocate the
@@ -607,11 +629,10 @@ void free_snapshot(struct trace_array *tr)
607 */ 629 */
608void tracing_snapshot_alloc(void) 630void tracing_snapshot_alloc(void)
609{ 631{
610 struct trace_array *tr = &global_trace;
611 int ret; 632 int ret;
612 633
613 ret = alloc_snapshot(tr); 634 ret = tracing_alloc_snapshot();
614 if (WARN_ON(ret < 0)) 635 if (ret < 0)
615 return; 636 return;
616 637
617 tracing_snapshot(); 638 tracing_snapshot();
@@ -623,6 +644,12 @@ void tracing_snapshot(void)
623 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); 644 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
624} 645}
625EXPORT_SYMBOL_GPL(tracing_snapshot); 646EXPORT_SYMBOL_GPL(tracing_snapshot);
647int tracing_alloc_snapshot(void)
648{
649 WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
650 return -ENODEV;
651}
652EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
626void tracing_snapshot_alloc(void) 653void tracing_snapshot_alloc(void)
627{ 654{
628 /* Give warning */ 655 /* Give warning */
@@ -3156,19 +3183,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
3156 return count; 3183 return count;
3157} 3184}
3158 3185
3159static loff_t tracing_seek(struct file *file, loff_t offset, int origin) 3186loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
3160{ 3187{
3188 int ret;
3189
3161 if (file->f_mode & FMODE_READ) 3190 if (file->f_mode & FMODE_READ)
3162 return seq_lseek(file, offset, origin); 3191 ret = seq_lseek(file, offset, whence);
3163 else 3192 else
3164 return 0; 3193 file->f_pos = ret = 0;
3194
3195 return ret;
3165} 3196}
3166 3197
3167static const struct file_operations tracing_fops = { 3198static const struct file_operations tracing_fops = {
3168 .open = tracing_open, 3199 .open = tracing_open,
3169 .read = seq_read, 3200 .read = seq_read,
3170 .write = tracing_write_stub, 3201 .write = tracing_write_stub,
3171 .llseek = tracing_seek, 3202 .llseek = tracing_lseek,
3172 .release = tracing_release, 3203 .release = tracing_release,
3173}; 3204};
3174 3205
@@ -4212,12 +4243,6 @@ out:
4212 return sret; 4243 return sret;
4213} 4244}
4214 4245
4215static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
4216 struct pipe_buffer *buf)
4217{
4218 __free_page(buf->page);
4219}
4220
4221static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, 4246static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
4222 unsigned int idx) 4247 unsigned int idx)
4223{ 4248{
@@ -4229,7 +4254,7 @@ static const struct pipe_buf_operations tracing_pipe_buf_ops = {
4229 .map = generic_pipe_buf_map, 4254 .map = generic_pipe_buf_map,
4230 .unmap = generic_pipe_buf_unmap, 4255 .unmap = generic_pipe_buf_unmap,
4231 .confirm = generic_pipe_buf_confirm, 4256 .confirm = generic_pipe_buf_confirm,
4232 .release = tracing_pipe_buf_release, 4257 .release = generic_pipe_buf_release,
4233 .steal = generic_pipe_buf_steal, 4258 .steal = generic_pipe_buf_steal,
4234 .get = generic_pipe_buf_get, 4259 .get = generic_pipe_buf_get,
4235}; 4260};
@@ -4913,7 +4938,7 @@ static const struct file_operations snapshot_fops = {
4913 .open = tracing_snapshot_open, 4938 .open = tracing_snapshot_open,
4914 .read = seq_read, 4939 .read = seq_read,
4915 .write = tracing_snapshot_write, 4940 .write = tracing_snapshot_write,
4916 .llseek = tracing_seek, 4941 .llseek = tracing_lseek,
4917 .release = tracing_snapshot_release, 4942 .release = tracing_snapshot_release,
4918}; 4943};
4919 4944
@@ -5883,6 +5908,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
5883 5908
5884 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; 5909 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5885 5910
5911 buf->tr = tr;
5912
5886 buf->buffer = ring_buffer_alloc(size, rb_flags); 5913 buf->buffer = ring_buffer_alloc(size, rb_flags);
5887 if (!buf->buffer) 5914 if (!buf->buffer)
5888 return -ENOMEM; 5915 return -ENOMEM;
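
The trace.c changes above split snapshot allocation out of tracing_snapshot_alloc() into the new tracing_alloc_snapshot(), so code that cannot sleep can have the snapshot buffer set up ahead of time and only swap buffers later; they also promote the private tracing_seek() into the shared tracing_lseek() helper used by the filter, graph, pid, snapshot and trigger files. A short hedged sketch of the intended alloc-then-snapshot split (the my_* functions are invented; the prototypes come from the hunks above):

#include <linux/kernel.h>	/* tracing_snapshot(); tracing_alloc_snapshot() assumed visible here too */

/* Hypothetical setup path: process context, allowed to allocate. */
static int my_setup(void)
{
	return tracing_alloc_snapshot();
}

/* Hypothetical fast path: may run where sleeping is not allowed. */
static void my_hot_path(void)
{
	/* No allocation here - just swap in the already-allocated buffer. */
	tracing_snapshot();
}
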
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ea189e027b80..02b592f2d4b7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1,3 +1,4 @@
1
1#ifndef _LINUX_KERNEL_TRACE_H 2#ifndef _LINUX_KERNEL_TRACE_H
2#define _LINUX_KERNEL_TRACE_H 3#define _LINUX_KERNEL_TRACE_H
3 4
@@ -587,6 +588,8 @@ void tracing_start_sched_switch_record(void);
587int register_tracer(struct tracer *type); 588int register_tracer(struct tracer *type);
588int is_tracing_stopped(void); 589int is_tracing_stopped(void);
589 590
591loff_t tracing_lseek(struct file *file, loff_t offset, int whence);
592
590extern cpumask_var_t __read_mostly tracing_buffer_mask; 593extern cpumask_var_t __read_mostly tracing_buffer_mask;
591 594
592#define for_each_tracing_cpu(cpu) \ 595#define for_each_tracing_cpu(cpu) \
@@ -1020,6 +1023,10 @@ extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1020extern void print_subsystem_event_filter(struct event_subsystem *system, 1023extern void print_subsystem_event_filter(struct event_subsystem *system,
1021 struct trace_seq *s); 1024 struct trace_seq *s);
1022extern int filter_assign_type(const char *type); 1025extern int filter_assign_type(const char *type);
1026extern int create_event_filter(struct ftrace_event_call *call,
1027 char *filter_str, bool set_str,
1028 struct event_filter **filterp);
1029extern void free_event_filter(struct event_filter *filter);
1023 1030
1024struct ftrace_event_field * 1031struct ftrace_event_field *
1025trace_find_event_field(struct ftrace_event_call *call, char *name); 1032trace_find_event_field(struct ftrace_event_call *call, char *name);
@@ -1028,9 +1035,195 @@ extern void trace_event_enable_cmd_record(bool enable);
1028extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); 1035extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1029extern int event_trace_del_tracer(struct trace_array *tr); 1036extern int event_trace_del_tracer(struct trace_array *tr);
1030 1037
1038extern struct ftrace_event_file *find_event_file(struct trace_array *tr,
1039 const char *system,
1040 const char *event);
1041
1042static inline void *event_file_data(struct file *filp)
1043{
1044 return ACCESS_ONCE(file_inode(filp)->i_private);
1045}
1046
1031extern struct mutex event_mutex; 1047extern struct mutex event_mutex;
1032extern struct list_head ftrace_events; 1048extern struct list_head ftrace_events;
1033 1049
1050extern const struct file_operations event_trigger_fops;
1051
1052extern int register_trigger_cmds(void);
1053extern void clear_event_triggers(struct trace_array *tr);
1054
1055struct event_trigger_data {
1056 unsigned long count;
1057 int ref;
1058 struct event_trigger_ops *ops;
1059 struct event_command *cmd_ops;
1060 struct event_filter __rcu *filter;
1061 char *filter_str;
1062 void *private_data;
1063 struct list_head list;
1064};
1065
1066/**
1067 * struct event_trigger_ops - callbacks for trace event triggers
1068 *
1069 * The methods in this structure provide per-event trigger hooks for
1070 * various trigger operations.
1071 *
1072 * All the methods below, except for @init() and @free(), must be
1073 * implemented.
1074 *
1075 * @func: The trigger 'probe' function called when the triggering
1076 * event occurs. The data passed into this callback is the data
1077 * that was supplied to the event_command @reg() function that
1078 * registered the trigger (see struct event_command).
1079 *
1080 * @init: An optional initialization function called for the trigger
1081 * when the trigger is registered (via the event_command reg()
1082 * function). This can be used to perform per-trigger
1083 * initialization such as incrementing a per-trigger reference
1084 * count, for instance. This is usually implemented by the
1085 * generic utility function @event_trigger_init() (see
1086 * trace_event_triggers.c).
1087 *
1088 * @free: An optional de-initialization function called for the
1089 * trigger when the trigger is unregistered (via the
1090 * event_command @reg() function). This can be used to perform
1091 * per-trigger de-initialization such as decrementing a
1092 * per-trigger reference count and freeing corresponding trigger
1093 * data, for instance. This is usually implemented by the
1094 * generic utility function @event_trigger_free() (see
1095 * trace_event_triggers.c).
1096 *
1097 * @print: The callback function invoked to have the trigger print
1098 * itself. This is usually implemented by a wrapper function
1099 * that calls the generic utility function @event_trigger_print()
1100 * (see trace_event_triggers.c).
1101 */
1102struct event_trigger_ops {
1103 void (*func)(struct event_trigger_data *data);
1104 int (*init)(struct event_trigger_ops *ops,
1105 struct event_trigger_data *data);
1106 void (*free)(struct event_trigger_ops *ops,
1107 struct event_trigger_data *data);
1108 int (*print)(struct seq_file *m,
1109 struct event_trigger_ops *ops,
1110 struct event_trigger_data *data);
1111};
1112
1113/**
1114 * struct event_command - callbacks and data members for event commands
1115 *
1116 * Event commands are invoked by users by writing the command name
1117 * into the 'trigger' file associated with a trace event. The
1118 * parameters associated with a specific invocation of an event
1119 * command are used to create an event trigger instance, which is
1120 * added to the list of trigger instances associated with that trace
1121 * event. When the event is hit, the set of triggers associated with
1122 * that event is invoked.
1123 *
1124 * The data members in this structure provide per-event command data
1125 * for various event commands.
1126 *
1127 * All the data members below, except for @post_trigger, must be set
1128 * for each event command.
1129 *
1130 * @name: The unique name that identifies the event command. This is
1131 * the name used when setting triggers via trigger files.
1132 *
1133 * @trigger_type: A unique id that identifies the event command
1134 * 'type'. This value has two purposes, the first to ensure that
1135 * only one trigger of the same type can be set at a given time
1136 * for a particular event e.g. it doesn't make sense to have both
1137 * a traceon and traceoff trigger attached to a single event at
1138 * the same time, so traceon and traceoff have the same type
1139 * though they have different names. The @trigger_type value is
1140 * also used as a bit value for deferring the actual trigger
1141 * action until after the current event is finished. Some
1142 * commands need to do this if they themselves log to the trace
1143 * buffer (see the @post_trigger() member below). @trigger_type
1144 * values are defined by adding new values to the trigger_type
1145 * enum in include/linux/ftrace_event.h.
1146 *
1147 * @post_trigger: A flag that says whether or not this command needs
1148 * to have its action delayed until after the current event has
1149 * been closed. Some triggers need to avoid being invoked while
1150 * an event is currently in the process of being logged, since
1151 * the trigger may itself log data into the trace buffer. Thus
1152 * we make sure the current event is committed before invoking
1153 * those triggers. To do that, the trigger invocation is split
1154 * in two - the first part checks the filter using the current
1155 * trace record; if a command has the @post_trigger flag set, it
1156 * sets a bit for itself in the return value, otherwise it
1157 * directly invokes the trigger. Once all commands have been
1158 * either invoked or set their return flag, the current record is
1159 * either committed or discarded. At that point, if any commands
1160 * have deferred their triggers, those commands are finally
1161 * invoked following the close of the current event. In other
1162 * words, if the event_trigger_ops @func() probe implementation
1163 * itself logs to the trace buffer, this flag should be set,
1164 * otherwise it can be left unspecified.
1165 *
1166 * All the methods below, except for @set_filter(), must be
1167 * implemented.
1168 *
1169 * @func: The callback function responsible for parsing and
1170 * registering the trigger written to the 'trigger' file by the
1171 * user. It allocates the trigger instance and registers it with
1172 * the appropriate trace event. It makes use of the other
1173 * event_command callback functions to orchestrate this, and is
1174 * usually implemented by the generic utility function
1175 * @event_trigger_callback() (see trace_event_triggers.c).
1176 *
1177 * @reg: Adds the trigger to the list of triggers associated with the
1178 * event, and enables the event trigger itself, after
1179 * initializing it (via the event_trigger_ops @init() function).
1180 * This is also where commands can use the @trigger_type value to
1181 * make the decision as to whether or not multiple instances of
1182 * the trigger should be allowed. This is usually implemented by
1183 * the generic utility function @register_trigger() (see
1184 * trace_event_triggers.c).
1185 *
1186 * @unreg: Removes the trigger from the list of triggers associated
1187 * with the event, and disables the event trigger itself, after
 1188 * de-initializing it (via the event_trigger_ops @free() function).
1189 * This is usually implemented by the generic utility function
1190 * @unregister_trigger() (see trace_event_triggers.c).
1191 *
1192 * @set_filter: An optional function called to parse and set a filter
1193 * for the trigger. If no @set_filter() method is set for the
1194 * event command, filters set by the user for the command will be
1195 * ignored. This is usually implemented by the generic utility
1196 * function @set_trigger_filter() (see trace_event_triggers.c).
1197 *
1198 * @get_trigger_ops: The callback function invoked to retrieve the
1199 * event_trigger_ops implementation associated with the command.
1200 */
1201struct event_command {
1202 struct list_head list;
1203 char *name;
1204 enum event_trigger_type trigger_type;
1205 bool post_trigger;
1206 int (*func)(struct event_command *cmd_ops,
1207 struct ftrace_event_file *file,
1208 char *glob, char *cmd, char *params);
1209 int (*reg)(char *glob,
1210 struct event_trigger_ops *ops,
1211 struct event_trigger_data *data,
1212 struct ftrace_event_file *file);
1213 void (*unreg)(char *glob,
1214 struct event_trigger_ops *ops,
1215 struct event_trigger_data *data,
1216 struct ftrace_event_file *file);
1217 int (*set_filter)(char *filter_str,
1218 struct event_trigger_data *data,
1219 struct ftrace_event_file *file);
1220 struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
1221};
1222
1223extern int trace_event_enable_disable(struct ftrace_event_file *file,
1224 int enable, int soft_disable);
1225extern int tracing_alloc_snapshot(void);
1226
1034extern const char *__start___trace_bprintk_fmt[]; 1227extern const char *__start___trace_bprintk_fmt[];
1035extern const char *__stop___trace_bprintk_fmt[]; 1228extern const char *__stop___trace_bprintk_fmt[];
1036 1229
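
The trace.h additions above are the trigger framework's data model: event_trigger_data instances hang off each ftrace_event_file, event_trigger_ops supplies the per-trigger callbacks, and event_command describes a named command written to an event's 'trigger' file. As a reading aid, here is a hedged sketch of how an invented "myhook" command could be wired together with the generic helpers the comments refer to; it would have to live in trace_events_trigger.c, where those helpers are static, and the ETT_TRACE_ONOFF type bit is reused purely for illustration:

/* Editor's sketch only; "myhook" and the my_* names are invented. */
static void my_trigger(struct event_trigger_data *data)
{
	/* @func: runs when the triggering event fires. */
}

static int my_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
			    struct event_trigger_data *data)
{
	return event_trigger_print("myhook", m, (void *)data->count,
				   data->filter_str);
}

static struct event_trigger_ops my_trigger_ops = {
	.func	= my_trigger,
	.print	= my_trigger_print,
	.init	= event_trigger_init,	/* generic refcounting */
	.free	= event_trigger_free,
};

static struct event_trigger_ops *
my_get_trigger_ops(char *cmd, char *param)
{
	return &my_trigger_ops;
}

static struct event_command my_cmd = {
	.name		 = "myhook",
	.trigger_type	 = ETT_TRACE_ONOFF,		/* illustration only */
	.func		 = event_trigger_callback,	/* generic parse + register */
	.reg		 = register_trigger,
	.unreg		 = unregister_trigger,
	.get_trigger_ops = my_get_trigger_ops,
	.set_filter	 = set_trigger_filter,
};

Registration would then go through register_event_command(&my_cmd) from an __init function, mirroring the traceon/traceoff and snapshot commands later in this series.
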
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 78e27e3b52ac..e854f420e033 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,12 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 if (tp_event->perf_perm) {
28 int ret = tp_event->perf_perm(tp_event, p_event);
29 if (ret)
30 return ret;
31 }
32
27 /* The ftrace function trace is allowed only for root. */ 33 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) && 34 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) 35 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
@@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
173int perf_trace_init(struct perf_event *p_event) 179int perf_trace_init(struct perf_event *p_event)
174{ 180{
175 struct ftrace_event_call *tp_event; 181 struct ftrace_event_call *tp_event;
176 int event_id = p_event->attr.config; 182 u64 event_id = p_event->attr.config;
177 int ret = -EINVAL; 183 int ret = -EINVAL;
178 184
179 mutex_lock(&event_mutex); 185 mutex_lock(&event_mutex);
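
The trace_event_perf.c hunk adds an optional per-event perf_perm() hook, consulted before the existing root-only checks, and widens event_id to u64 so it matches perf_event_attr::config. A hedged sketch of a callback with that shape (the function is hypothetical; the signature is inferred from the call site above):

/* Hypothetical permission hook: restrict one event to CAP_SYS_ADMIN. */
static int my_event_perf_perm(struct ftrace_event_call *call,
			      struct perf_event *p_event)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}
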
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f919a2e21bf3..e71ffd4eccb5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -342,6 +342,12 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
342 return ret; 342 return ret;
343} 343}
344 344
345int trace_event_enable_disable(struct ftrace_event_file *file,
346 int enable, int soft_disable)
347{
348 return __ftrace_event_enable_disable(file, enable, soft_disable);
349}
350
345static int ftrace_event_enable_disable(struct ftrace_event_file *file, 351static int ftrace_event_enable_disable(struct ftrace_event_file *file,
346 int enable) 352 int enable)
347{ 353{
@@ -421,11 +427,6 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
421 } 427 }
422} 428}
423 429
424static void *event_file_data(struct file *filp)
425{
426 return ACCESS_ONCE(file_inode(filp)->i_private);
427}
428
429static void remove_event_file_dir(struct ftrace_event_file *file) 430static void remove_event_file_dir(struct ftrace_event_file *file)
430{ 431{
431 struct dentry *dir = file->dir; 432 struct dentry *dir = file->dir;
@@ -1549,6 +1550,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1549 trace_create_file("filter", 0644, file->dir, file, 1550 trace_create_file("filter", 0644, file->dir, file,
1550 &ftrace_event_filter_fops); 1551 &ftrace_event_filter_fops);
1551 1552
1553 trace_create_file("trigger", 0644, file->dir, file,
1554 &event_trigger_fops);
1555
1552 trace_create_file("format", 0444, file->dir, call, 1556 trace_create_file("format", 0444, file->dir, call,
1553 &ftrace_event_format_fops); 1557 &ftrace_event_format_fops);
1554 1558
@@ -1645,6 +1649,8 @@ trace_create_new_event(struct ftrace_event_call *call,
1645 file->event_call = call; 1649 file->event_call = call;
1646 file->tr = tr; 1650 file->tr = tr;
1647 atomic_set(&file->sm_ref, 0); 1651 atomic_set(&file->sm_ref, 0);
1652 atomic_set(&file->tm_ref, 0);
1653 INIT_LIST_HEAD(&file->triggers);
1648 list_add(&file->list, &tr->events); 1654 list_add(&file->list, &tr->events);
1649 1655
1650 return file; 1656 return file;
@@ -1849,20 +1855,7 @@ __trace_add_event_dirs(struct trace_array *tr)
1849 } 1855 }
1850} 1856}
1851 1857
1852#ifdef CONFIG_DYNAMIC_FTRACE 1858struct ftrace_event_file *
1853
1854/* Avoid typos */
1855#define ENABLE_EVENT_STR "enable_event"
1856#define DISABLE_EVENT_STR "disable_event"
1857
1858struct event_probe_data {
1859 struct ftrace_event_file *file;
1860 unsigned long count;
1861 int ref;
1862 bool enable;
1863};
1864
1865static struct ftrace_event_file *
1866find_event_file(struct trace_array *tr, const char *system, const char *event) 1859find_event_file(struct trace_array *tr, const char *system, const char *event)
1867{ 1860{
1868 struct ftrace_event_file *file; 1861 struct ftrace_event_file *file;
@@ -1885,6 +1878,19 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
1885 return NULL; 1878 return NULL;
1886} 1879}
1887 1880
1881#ifdef CONFIG_DYNAMIC_FTRACE
1882
1883/* Avoid typos */
1884#define ENABLE_EVENT_STR "enable_event"
1885#define DISABLE_EVENT_STR "disable_event"
1886
1887struct event_probe_data {
1888 struct ftrace_event_file *file;
1889 unsigned long count;
1890 int ref;
1891 bool enable;
1892};
1893
1888static void 1894static void
1889event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) 1895event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1890{ 1896{
@@ -2311,9 +2317,15 @@ int event_trace_del_tracer(struct trace_array *tr)
2311{ 2317{
2312 mutex_lock(&event_mutex); 2318 mutex_lock(&event_mutex);
2313 2319
2320 /* Disable any event triggers and associated soft-disabled events */
2321 clear_event_triggers(tr);
2322
2314 /* Disable any running events */ 2323 /* Disable any running events */
2315 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); 2324 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2316 2325
2326 /* Access to events are within rcu_read_lock_sched() */
2327 synchronize_sched();
2328
2317 down_write(&trace_event_sem); 2329 down_write(&trace_event_sem);
2318 __trace_remove_event_dirs(tr); 2330 __trace_remove_event_dirs(tr);
2319 debugfs_remove_recursive(tr->event_dir); 2331 debugfs_remove_recursive(tr->event_dir);
@@ -2374,6 +2386,8 @@ static __init int event_trace_enable(void)
2374 2386
2375 register_event_cmds(); 2387 register_event_cmds();
2376 2388
2389 register_trigger_cmds();
2390
2377 return 0; 2391 return 0;
2378} 2392}
2379 2393
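
The trace_events.c changes above export find_event_file() and move event_file_data() into the header for the trigger code, create a per-event 'trigger' file, initialize the new tm_ref/triggers members, and make tracer teardown clear triggers (and wait a sched RCU grace period) before the event directories are removed. A hedged sketch of the lookup-and-soft-enable pattern the trigger code relies on; the wrapper and the sched/sched_switch names are purely illustrative:

/* Editor's sketch; not part of the patch. */
static int my_soft_enable(struct trace_array *tr)
{
	struct ftrace_event_file *file;
	int ret = -ENODEV;

	mutex_lock(&event_mutex);
	file = find_event_file(tr, "sched", "sched_switch");
	if (file)
		/* enable=1, soft_disable=1: the same soft mode triggers use */
		ret = trace_event_enable_disable(file, 1, 1);
	mutex_unlock(&event_mutex);

	return ret;
}
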
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 2468f56dc5db..8a8631926a07 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -799,6 +799,11 @@ static void __free_filter(struct event_filter *filter)
799 kfree(filter); 799 kfree(filter);
800} 800}
801 801
802void free_event_filter(struct event_filter *filter)
803{
804 __free_filter(filter);
805}
806
802void destroy_call_preds(struct ftrace_event_call *call) 807void destroy_call_preds(struct ftrace_event_call *call)
803{ 808{
804 __free_filter(call->filter); 809 __free_filter(call->filter);
@@ -1938,6 +1943,13 @@ static int create_filter(struct ftrace_event_call *call,
1938 return err; 1943 return err;
1939} 1944}
1940 1945
1946int create_event_filter(struct ftrace_event_call *call,
1947 char *filter_str, bool set_str,
1948 struct event_filter **filterp)
1949{
1950 return create_filter(call, filter_str, set_str, filterp);
1951}
1952
1941/** 1953/**
1942 * create_system_filter - create a filter for an event_subsystem 1954 * create_system_filter - create a filter for an event_subsystem
1943 * @system: event_subsystem to create a filter for 1955 * @system: event_subsystem to create a filter for
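
The filter hunks add two thin wrappers, create_event_filter() and free_event_filter(), so the trigger code can parse an "if ..." clause against the triggering event without touching filter internals. A hedged sketch of the expected parse/use/free pattern (essentially what set_trigger_filter() in the new file below does; the wrapper name is invented):

/* Editor's sketch; not part of the patch. */
static int my_parse_if_clause(struct ftrace_event_call *call, char *filter_str)
{
	struct event_filter *filter = NULL;
	int ret;

	/* set_str=false: don't keep a copy of the string in the filter */
	ret = create_event_filter(call, filter_str, false, &filter);
	if (ret)
		return ret;

	/* ... match records with filter_match_preds(filter, rec) ... */

	free_event_filter(filter);
	return 0;
}
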
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
new file mode 100644
index 000000000000..8efbb69b04f0
--- /dev/null
+++ b/kernel/trace/trace_events_trigger.c
@@ -0,0 +1,1437 @@
1/*
2 * trace_events_trigger - trace event triggers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com>
19 */
20
21#include <linux/module.h>
22#include <linux/ctype.h>
23#include <linux/mutex.h>
24#include <linux/slab.h>
25
26#include "trace.h"
27
28static LIST_HEAD(trigger_commands);
29static DEFINE_MUTEX(trigger_cmd_mutex);
30
31static void
32trigger_data_free(struct event_trigger_data *data)
33{
34 if (data->cmd_ops->set_filter)
35 data->cmd_ops->set_filter(NULL, data, NULL);
36
37 synchronize_sched(); /* make sure current triggers exit before free */
38 kfree(data);
39}
40
41/**
42 * event_triggers_call - Call triggers associated with a trace event
43 * @file: The ftrace_event_file associated with the event
44 * @rec: The trace entry for the event, NULL for unconditional invocation
45 *
46 * For each trigger associated with an event, invoke the trigger
47 * function registered with the associated trigger command. If rec is
48 * non-NULL, it means that the trigger requires further processing and
49 * shouldn't be unconditionally invoked. If rec is non-NULL and the
 50 * trigger has a filter associated with it, rec will be checked against
 51 * the filter, and if the record matches, the trigger will be invoked.
52 * If the trigger is a 'post_trigger', meaning it shouldn't be invoked
53 * in any case until the current event is written, the trigger
54 * function isn't invoked but the bit associated with the deferred
55 * trigger is set in the return value.
56 *
57 * Returns an enum event_trigger_type value containing a set bit for
58 * any trigger that should be deferred, ETT_NONE if nothing to defer.
59 *
60 * Called from tracepoint handlers (with rcu_read_lock_sched() held).
61 *
62 * Return: an enum event_trigger_type value containing a set bit for
63 * any trigger that should be deferred, ETT_NONE if nothing to defer.
64 */
65enum event_trigger_type
66event_triggers_call(struct ftrace_event_file *file, void *rec)
67{
68 struct event_trigger_data *data;
69 enum event_trigger_type tt = ETT_NONE;
70 struct event_filter *filter;
71
72 if (list_empty(&file->triggers))
73 return tt;
74
75 list_for_each_entry_rcu(data, &file->triggers, list) {
76 if (!rec) {
77 data->ops->func(data);
78 continue;
79 }
80 filter = rcu_dereference(data->filter);
81 if (filter && !filter_match_preds(filter, rec))
82 continue;
83 if (data->cmd_ops->post_trigger) {
84 tt |= data->cmd_ops->trigger_type;
85 continue;
86 }
87 data->ops->func(data);
88 }
89 return tt;
90}
91EXPORT_SYMBOL_GPL(event_triggers_call);
92
93/**
94 * event_triggers_post_call - Call 'post_triggers' for a trace event
95 * @file: The ftrace_event_file associated with the event
96 * @tt: enum event_trigger_type containing a set bit for each trigger to invoke
97 *
98 * For each trigger associated with an event, invoke the trigger
99 * function registered with the associated trigger command, if the
100 * corresponding bit is set in the tt enum passed into this function.
101 * See @event_triggers_call for details on how those bits are set.
102 *
103 * Called from tracepoint handlers (with rcu_read_lock_sched() held).
104 */
105void
106event_triggers_post_call(struct ftrace_event_file *file,
107 enum event_trigger_type tt)
108{
109 struct event_trigger_data *data;
110
111 list_for_each_entry_rcu(data, &file->triggers, list) {
112 if (data->cmd_ops->trigger_type & tt)
113 data->ops->func(data);
114 }
115}
116EXPORT_SYMBOL_GPL(event_triggers_post_call);
117
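/*
 * Editor's note - an illustrative sketch, not part of this patch: the two
 * helpers above are meant to bracket the record commit in a tracepoint
 * handler.  Roughly (variable names invented, record setup elided):
 *
 *	enum event_trigger_type tt;
 *
 *	tt = event_triggers_call(ftrace_file, entry);
 *	... write and commit the trace record ...
 *	if (tt != ETT_NONE)
 *		event_triggers_post_call(ftrace_file, tt);
 */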
118#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL)
119
120static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
121{
122 struct ftrace_event_file *event_file = event_file_data(m->private);
123
124 if (t == SHOW_AVAILABLE_TRIGGERS)
125 return NULL;
126
127 return seq_list_next(t, &event_file->triggers, pos);
128}
129
130static void *trigger_start(struct seq_file *m, loff_t *pos)
131{
132 struct ftrace_event_file *event_file;
133
134 /* ->stop() is called even if ->start() fails */
135 mutex_lock(&event_mutex);
136 event_file = event_file_data(m->private);
137 if (unlikely(!event_file))
138 return ERR_PTR(-ENODEV);
139
140 if (list_empty(&event_file->triggers))
141 return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL;
142
143 return seq_list_start(&event_file->triggers, *pos);
144}
145
146static void trigger_stop(struct seq_file *m, void *t)
147{
148 mutex_unlock(&event_mutex);
149}
150
151static int trigger_show(struct seq_file *m, void *v)
152{
153 struct event_trigger_data *data;
154 struct event_command *p;
155
156 if (v == SHOW_AVAILABLE_TRIGGERS) {
157 seq_puts(m, "# Available triggers:\n");
158 seq_putc(m, '#');
159 mutex_lock(&trigger_cmd_mutex);
160 list_for_each_entry_reverse(p, &trigger_commands, list)
161 seq_printf(m, " %s", p->name);
162 seq_putc(m, '\n');
163 mutex_unlock(&trigger_cmd_mutex);
164 return 0;
165 }
166
167 data = list_entry(v, struct event_trigger_data, list);
168 data->ops->print(m, data->ops, data);
169
170 return 0;
171}
172
173static const struct seq_operations event_triggers_seq_ops = {
174 .start = trigger_start,
175 .next = trigger_next,
176 .stop = trigger_stop,
177 .show = trigger_show,
178};
179
180static int event_trigger_regex_open(struct inode *inode, struct file *file)
181{
182 int ret = 0;
183
184 mutex_lock(&event_mutex);
185
186 if (unlikely(!event_file_data(file))) {
187 mutex_unlock(&event_mutex);
188 return -ENODEV;
189 }
190
191 if (file->f_mode & FMODE_READ) {
192 ret = seq_open(file, &event_triggers_seq_ops);
193 if (!ret) {
194 struct seq_file *m = file->private_data;
195 m->private = file;
196 }
197 }
198
199 mutex_unlock(&event_mutex);
200
201 return ret;
202}
203
204static int trigger_process_regex(struct ftrace_event_file *file, char *buff)
205{
206 char *command, *next = buff;
207 struct event_command *p;
208 int ret = -EINVAL;
209
210 command = strsep(&next, ": \t");
211 command = (command[0] != '!') ? command : command + 1;
212
213 mutex_lock(&trigger_cmd_mutex);
214 list_for_each_entry(p, &trigger_commands, list) {
215 if (strcmp(p->name, command) == 0) {
216 ret = p->func(p, file, buff, command, next);
217 goto out_unlock;
218 }
219 }
220 out_unlock:
221 mutex_unlock(&trigger_cmd_mutex);
222
223 return ret;
224}
225
226static ssize_t event_trigger_regex_write(struct file *file,
227 const char __user *ubuf,
228 size_t cnt, loff_t *ppos)
229{
230 struct ftrace_event_file *event_file;
231 ssize_t ret;
232 char *buf;
233
234 if (!cnt)
235 return 0;
236
237 if (cnt >= PAGE_SIZE)
238 return -EINVAL;
239
240 buf = (char *)__get_free_page(GFP_TEMPORARY);
241 if (!buf)
242 return -ENOMEM;
243
244 if (copy_from_user(buf, ubuf, cnt)) {
245 free_page((unsigned long)buf);
246 return -EFAULT;
247 }
248 buf[cnt] = '\0';
249 strim(buf);
250
251 mutex_lock(&event_mutex);
252 event_file = event_file_data(file);
253 if (unlikely(!event_file)) {
254 mutex_unlock(&event_mutex);
255 free_page((unsigned long)buf);
256 return -ENODEV;
257 }
258 ret = trigger_process_regex(event_file, buf);
259 mutex_unlock(&event_mutex);
260
261 free_page((unsigned long)buf);
262 if (ret < 0)
263 goto out;
264
265 *ppos += cnt;
266 ret = cnt;
267 out:
268 return ret;
269}
270
271static int event_trigger_regex_release(struct inode *inode, struct file *file)
272{
273 mutex_lock(&event_mutex);
274
275 if (file->f_mode & FMODE_READ)
276 seq_release(inode, file);
277
278 mutex_unlock(&event_mutex);
279
280 return 0;
281}
282
283static ssize_t
284event_trigger_write(struct file *filp, const char __user *ubuf,
285 size_t cnt, loff_t *ppos)
286{
287 return event_trigger_regex_write(filp, ubuf, cnt, ppos);
288}
289
290static int
291event_trigger_open(struct inode *inode, struct file *filp)
292{
293 return event_trigger_regex_open(inode, filp);
294}
295
296static int
297event_trigger_release(struct inode *inode, struct file *file)
298{
299 return event_trigger_regex_release(inode, file);
300}
301
302const struct file_operations event_trigger_fops = {
303 .open = event_trigger_open,
304 .read = seq_read,
305 .write = event_trigger_write,
306 .llseek = tracing_lseek,
307 .release = event_trigger_release,
308};
309
310/*
311 * Currently we only register event commands from __init, so mark this
312 * __init too.
313 */
314static __init int register_event_command(struct event_command *cmd)
315{
316 struct event_command *p;
317 int ret = 0;
318
319 mutex_lock(&trigger_cmd_mutex);
320 list_for_each_entry(p, &trigger_commands, list) {
321 if (strcmp(cmd->name, p->name) == 0) {
322 ret = -EBUSY;
323 goto out_unlock;
324 }
325 }
326 list_add(&cmd->list, &trigger_commands);
327 out_unlock:
328 mutex_unlock(&trigger_cmd_mutex);
329
330 return ret;
331}
332
333/*
334 * Currently we only unregister event commands from __init, so mark
335 * this __init too.
336 */
337static __init int unregister_event_command(struct event_command *cmd)
338{
339 struct event_command *p, *n;
340 int ret = -ENODEV;
341
342 mutex_lock(&trigger_cmd_mutex);
343 list_for_each_entry_safe(p, n, &trigger_commands, list) {
344 if (strcmp(cmd->name, p->name) == 0) {
345 ret = 0;
346 list_del_init(&p->list);
347 goto out_unlock;
348 }
349 }
350 out_unlock:
351 mutex_unlock(&trigger_cmd_mutex);
352
353 return ret;
354}
355
356/**
357 * event_trigger_print - Generic event_trigger_ops @print implementation
358 * @name: The name of the event trigger
359 * @m: The seq_file being printed to
360 * @data: Trigger-specific data
361 * @filter_str: filter_str to print, if present
362 *
363 * Common implementation for event triggers to print themselves.
364 *
365 * Usually wrapped by a function that simply sets the @name of the
366 * trigger command and then invokes this.
367 *
368 * Return: 0 on success, errno otherwise
369 */
370static int
371event_trigger_print(const char *name, struct seq_file *m,
372 void *data, char *filter_str)
373{
374 long count = (long)data;
375
376 seq_printf(m, "%s", name);
377
378 if (count == -1)
379 seq_puts(m, ":unlimited");
380 else
381 seq_printf(m, ":count=%ld", count);
382
383 if (filter_str)
384 seq_printf(m, " if %s\n", filter_str);
385 else
386 seq_puts(m, "\n");
387
388 return 0;
389}
390
391/**
392 * event_trigger_init - Generic event_trigger_ops @init implementation
393 * @ops: The trigger ops associated with the trigger
394 * @data: Trigger-specific data
395 *
396 * Common implementation of event trigger initialization.
397 *
398 * Usually used directly as the @init method in event trigger
399 * implementations.
400 *
401 * Return: 0 on success, errno otherwise
402 */
403static int
404event_trigger_init(struct event_trigger_ops *ops,
405 struct event_trigger_data *data)
406{
407 data->ref++;
408 return 0;
409}
410
411/**
412 * event_trigger_free - Generic event_trigger_ops @free implementation
413 * @ops: The trigger ops associated with the trigger
414 * @data: Trigger-specific data
415 *
416 * Common implementation of event trigger de-initialization.
417 *
418 * Usually used directly as the @free method in event trigger
419 * implementations.
420 */
421static void
422event_trigger_free(struct event_trigger_ops *ops,
423 struct event_trigger_data *data)
424{
425 if (WARN_ON_ONCE(data->ref <= 0))
426 return;
427
428 data->ref--;
429 if (!data->ref)
430 trigger_data_free(data);
431}
432
433static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
434 int trigger_enable)
435{
436 int ret = 0;
437
438 if (trigger_enable) {
439 if (atomic_inc_return(&file->tm_ref) > 1)
440 return ret;
441 set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
442 ret = trace_event_enable_disable(file, 1, 1);
443 } else {
444 if (atomic_dec_return(&file->tm_ref) > 0)
445 return ret;
446 clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
447 ret = trace_event_enable_disable(file, 0, 1);
448 }
449
450 return ret;
451}
452
453/**
454 * clear_event_triggers - Clear all triggers associated with a trace array
455 * @tr: The trace array to clear
456 *
457 * For each trigger, the triggering event has its tm_ref decremented
458 * via trace_event_trigger_enable_disable(), and any associated event
459 * (in the case of enable/disable_event triggers) will have its sm_ref
460 * decremented via free()->trace_event_enable_disable(). That
461 * combination effectively reverses the soft-mode/trigger state added
462 * by trigger registration.
463 *
464 * Must be called with event_mutex held.
465 */
466void
467clear_event_triggers(struct trace_array *tr)
468{
469 struct ftrace_event_file *file;
470
471 list_for_each_entry(file, &tr->events, list) {
472 struct event_trigger_data *data;
473 list_for_each_entry_rcu(data, &file->triggers, list) {
474 trace_event_trigger_enable_disable(file, 0);
475 if (data->ops->free)
476 data->ops->free(data->ops, data);
477 }
478 }
479}
480
481/**
482 * update_cond_flag - Set or reset the TRIGGER_COND bit
483 * @file: The ftrace_event_file associated with the event
484 *
485 * If an event has triggers and any of those triggers has a filter or
486 * a post_trigger, trigger invocation needs to be deferred until after
487 * the current event has logged its data, and the event should have
488 * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
489 * cleared.
490 */
491static void update_cond_flag(struct ftrace_event_file *file)
492{
493 struct event_trigger_data *data;
494 bool set_cond = false;
495
496 list_for_each_entry_rcu(data, &file->triggers, list) {
497 if (data->filter || data->cmd_ops->post_trigger) {
498 set_cond = true;
499 break;
500 }
501 }
502
503 if (set_cond)
504 set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
505 else
506 clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
507}
508
509/**
510 * register_trigger - Generic event_command @reg implementation
511 * @glob: The raw string used to register the trigger
512 * @ops: The trigger ops associated with the trigger
513 * @data: Trigger-specific data to associate with the trigger
514 * @file: The ftrace_event_file associated with the event
515 *
516 * Common implementation for event trigger registration.
517 *
518 * Usually used directly as the @reg method in event command
519 * implementations.
520 *
521 * Return: 0 on success, errno otherwise
522 */
523static int register_trigger(char *glob, struct event_trigger_ops *ops,
524 struct event_trigger_data *data,
525 struct ftrace_event_file *file)
526{
527 struct event_trigger_data *test;
528 int ret = 0;
529
530 list_for_each_entry_rcu(test, &file->triggers, list) {
531 if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
532 ret = -EEXIST;
533 goto out;
534 }
535 }
536
537 if (data->ops->init) {
538 ret = data->ops->init(data->ops, data);
539 if (ret < 0)
540 goto out;
541 }
542
543 list_add_rcu(&data->list, &file->triggers);
544 ret++;
545
546 if (trace_event_trigger_enable_disable(file, 1) < 0) {
547 list_del_rcu(&data->list);
548 ret--;
549 }
550 update_cond_flag(file);
551out:
552 return ret;
553}
554
555/**
556 * unregister_trigger - Generic event_command @unreg implementation
557 * @glob: The raw string used to register the trigger
558 * @ops: The trigger ops associated with the trigger
559 * @test: Trigger-specific data used to find the trigger to remove
560 * @file: The ftrace_event_file associated with the event
561 *
562 * Common implementation for event trigger unregistration.
563 *
564 * Usually used directly as the @unreg method in event command
565 * implementations.
566 */
567static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
568 struct event_trigger_data *test,
569 struct ftrace_event_file *file)
570{
571 struct event_trigger_data *data;
572 bool unregistered = false;
573
574 list_for_each_entry_rcu(data, &file->triggers, list) {
575 if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
576 unregistered = true;
577 list_del_rcu(&data->list);
578 update_cond_flag(file);
579 trace_event_trigger_enable_disable(file, 0);
580 break;
581 }
582 }
583
584 if (unregistered && data->ops->free)
585 data->ops->free(data->ops, data);
586}
587
588/**
589 * event_trigger_callback - Generic event_command @func implementation
590 * @cmd_ops: The command ops, used for trigger registration
591 * @file: The ftrace_event_file associated with the event
592 * @glob: The raw string used to register the trigger
593 * @cmd: The cmd portion of the string used to register the trigger
594 * @param: The params portion of the string used to register the trigger
595 *
596 * Common implementation for event command parsing and trigger
597 * instantiation.
598 *
599 * Usually used directly as the @func method in event command
600 * implementations.
601 *
602 * Return: 0 on success, errno otherwise
603 */
604static int
605event_trigger_callback(struct event_command *cmd_ops,
606 struct ftrace_event_file *file,
607 char *glob, char *cmd, char *param)
608{
609 struct event_trigger_data *trigger_data;
610 struct event_trigger_ops *trigger_ops;
611 char *trigger = NULL;
612 char *number;
613 int ret;
614
615 /* separate the trigger from the filter (t:n [if filter]) */
616 if (param && isdigit(param[0]))
617 trigger = strsep(&param, " \t");
618
619 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
620
621 ret = -ENOMEM;
622 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
623 if (!trigger_data)
624 goto out;
625
626 trigger_data->count = -1;
627 trigger_data->ops = trigger_ops;
628 trigger_data->cmd_ops = cmd_ops;
629 INIT_LIST_HEAD(&trigger_data->list);
630
631 if (glob[0] == '!') {
632 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
633 kfree(trigger_data);
634 ret = 0;
635 goto out;
636 }
637
638 if (trigger) {
639 number = strsep(&trigger, ":");
640
641 ret = -EINVAL;
642 if (!strlen(number))
643 goto out_free;
644
645 /*
646 * We use the callback data field (which is a pointer)
647 * as our counter.
648 */
649 ret = kstrtoul(number, 0, &trigger_data->count);
650 if (ret)
651 goto out_free;
652 }
653
654 if (!param) /* if param is non-empty, it's supposed to be a filter */
655 goto out_reg;
656
657 if (!cmd_ops->set_filter)
658 goto out_reg;
659
660 ret = cmd_ops->set_filter(param, trigger_data, file);
661 if (ret < 0)
662 goto out_free;
663
664 out_reg:
665 ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
666 /*
667 * The above returns on success the # of functions enabled,
668 * but if it didn't find any functions it returns zero.
669 * Consider no functions a failure too.
670 */
671 if (!ret) {
672 ret = -ENOENT;
673 goto out_free;
674 } else if (ret < 0)
675 goto out_free;
676 ret = 0;
677 out:
678 return ret;
679
680 out_free:
681 if (cmd_ops->set_filter)
682 cmd_ops->set_filter(NULL, trigger_data, NULL);
683 kfree(trigger_data);
684 goto out;
685}
686
687/**
688 * set_trigger_filter - Generic event_command @set_filter implementation
689 * @filter_str: The filter string for the trigger, NULL to remove filter
690 * @trigger_data: Trigger-specific data
691 * @file: The ftrace_event_file associated with the event
692 *
693 * Common implementation for event command filter parsing and filter
694 * instantiation.
695 *
696 * Usually used directly as the @set_filter method in event command
697 * implementations.
698 *
699 * Also used to remove a filter (if filter_str = NULL).
700 *
701 * Return: 0 on success, errno otherwise
702 */
703static int set_trigger_filter(char *filter_str,
704 struct event_trigger_data *trigger_data,
705 struct ftrace_event_file *file)
706{
707 struct event_trigger_data *data = trigger_data;
708 struct event_filter *filter = NULL, *tmp;
709 int ret = -EINVAL;
710 char *s;
711
712 if (!filter_str) /* clear the current filter */
713 goto assign;
714
715 s = strsep(&filter_str, " \t");
716
717 if (!strlen(s) || strcmp(s, "if") != 0)
718 goto out;
719
720 if (!filter_str)
721 goto out;
722
723 /* The filter is for the 'trigger' event, not the triggered event */
724 ret = create_event_filter(file->event_call, filter_str, false, &filter);
725 if (ret)
726 goto out;
727 assign:
728 tmp = rcu_access_pointer(data->filter);
729
730 rcu_assign_pointer(data->filter, filter);
731
732 if (tmp) {
733 /* Make sure the call is done with the filter */
734 synchronize_sched();
735 free_event_filter(tmp);
736 }
737
738 kfree(data->filter_str);
739 data->filter_str = NULL;
740
741 if (filter_str) {
742 data->filter_str = kstrdup(filter_str, GFP_KERNEL);
743 if (!data->filter_str) {
744 free_event_filter(rcu_access_pointer(data->filter));
745 data->filter = NULL;
746 ret = -ENOMEM;
747 }
748 }
749 out:
750 return ret;
751}
752
753static void
754traceon_trigger(struct event_trigger_data *data)
755{
756 if (tracing_is_on())
757 return;
758
759 tracing_on();
760}
761
762static void
763traceon_count_trigger(struct event_trigger_data *data)
764{
765 if (tracing_is_on())
766 return;
767
768 if (!data->count)
769 return;
770
771 if (data->count != -1)
772 (data->count)--;
773
774 tracing_on();
775}
776
777static void
778traceoff_trigger(struct event_trigger_data *data)
779{
780 if (!tracing_is_on())
781 return;
782
783 tracing_off();
784}
785
786static void
787traceoff_count_trigger(struct event_trigger_data *data)
788{
789 if (!tracing_is_on())
790 return;
791
792 if (!data->count)
793 return;
794
795 if (data->count != -1)
796 (data->count)--;
797
798 tracing_off();
799}
800
801static int
802traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
803 struct event_trigger_data *data)
804{
805 return event_trigger_print("traceon", m, (void *)data->count,
806 data->filter_str);
807}
808
809static int
810traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
811 struct event_trigger_data *data)
812{
813 return event_trigger_print("traceoff", m, (void *)data->count,
814 data->filter_str);
815}
816
817static struct event_trigger_ops traceon_trigger_ops = {
818 .func = traceon_trigger,
819 .print = traceon_trigger_print,
820 .init = event_trigger_init,
821 .free = event_trigger_free,
822};
823
824static struct event_trigger_ops traceon_count_trigger_ops = {
825 .func = traceon_count_trigger,
826 .print = traceon_trigger_print,
827 .init = event_trigger_init,
828 .free = event_trigger_free,
829};
830
831static struct event_trigger_ops traceoff_trigger_ops = {
832 .func = traceoff_trigger,
833 .print = traceoff_trigger_print,
834 .init = event_trigger_init,
835 .free = event_trigger_free,
836};
837
838static struct event_trigger_ops traceoff_count_trigger_ops = {
839 .func = traceoff_count_trigger,
840 .print = traceoff_trigger_print,
841 .init = event_trigger_init,
842 .free = event_trigger_free,
843};
844
845static struct event_trigger_ops *
846onoff_get_trigger_ops(char *cmd, char *param)
847{
848 struct event_trigger_ops *ops;
849
850 /* we register both traceon and traceoff to this callback */
851 if (strcmp(cmd, "traceon") == 0)
852 ops = param ? &traceon_count_trigger_ops :
853 &traceon_trigger_ops;
854 else
855 ops = param ? &traceoff_count_trigger_ops :
856 &traceoff_trigger_ops;
857
858 return ops;
859}
860
861static struct event_command trigger_traceon_cmd = {
862 .name = "traceon",
863 .trigger_type = ETT_TRACE_ONOFF,
864 .func = event_trigger_callback,
865 .reg = register_trigger,
866 .unreg = unregister_trigger,
867 .get_trigger_ops = onoff_get_trigger_ops,
868 .set_filter = set_trigger_filter,
869};
870
871static struct event_command trigger_traceoff_cmd = {
872 .name = "traceoff",
873 .trigger_type = ETT_TRACE_ONOFF,
874 .func = event_trigger_callback,
875 .reg = register_trigger,
876 .unreg = unregister_trigger,
877 .get_trigger_ops = onoff_get_trigger_ops,
878 .set_filter = set_trigger_filter,
879};
880
881#ifdef CONFIG_TRACER_SNAPSHOT
882static void
883snapshot_trigger(struct event_trigger_data *data)
884{
885 tracing_snapshot();
886}
887
888static void
889snapshot_count_trigger(struct event_trigger_data *data)
890{
891 if (!data->count)
892 return;
893
894 if (data->count != -1)
895 (data->count)--;
896
897 snapshot_trigger(data);
898}
899
900static int
901register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
902 struct event_trigger_data *data,
903 struct ftrace_event_file *file)
904{
905 int ret = register_trigger(glob, ops, data, file);
906
907 if (ret > 0 && tracing_alloc_snapshot() != 0) {
908 unregister_trigger(glob, ops, data, file);
909 ret = 0;
910 }
911
912 return ret;
913}
914
915static int
916snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
917 struct event_trigger_data *data)
918{
919 return event_trigger_print("snapshot", m, (void *)data->count,
920 data->filter_str);
921}
922
923static struct event_trigger_ops snapshot_trigger_ops = {
924 .func = snapshot_trigger,
925 .print = snapshot_trigger_print,
926 .init = event_trigger_init,
927 .free = event_trigger_free,
928};
929
930static struct event_trigger_ops snapshot_count_trigger_ops = {
931 .func = snapshot_count_trigger,
932 .print = snapshot_trigger_print,
933 .init = event_trigger_init,
934 .free = event_trigger_free,
935};
936
937static struct event_trigger_ops *
938snapshot_get_trigger_ops(char *cmd, char *param)
939{
940 return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops;
941}
942
943static struct event_command trigger_snapshot_cmd = {
944 .name = "snapshot",
945 .trigger_type = ETT_SNAPSHOT,
946 .func = event_trigger_callback,
947 .reg = register_snapshot_trigger,
948 .unreg = unregister_trigger,
949 .get_trigger_ops = snapshot_get_trigger_ops,
950 .set_filter = set_trigger_filter,
951};
952
953static __init int register_trigger_snapshot_cmd(void)
954{
955 int ret;
956
957 ret = register_event_command(&trigger_snapshot_cmd);
958 WARN_ON(ret < 0);
959
960 return ret;
961}
962#else
963static __init int register_trigger_snapshot_cmd(void) { return 0; }
964#endif /* CONFIG_TRACER_SNAPSHOT */
965
966#ifdef CONFIG_STACKTRACE
967/*
968 * Skip 3:
969 * stacktrace_trigger()
970 * event_triggers_post_call()
971 * ftrace_raw_event_xxx()
972 */
973#define STACK_SKIP 3
974
975static void
976stacktrace_trigger(struct event_trigger_data *data)
977{
978 trace_dump_stack(STACK_SKIP);
979}
980
981static void
982stacktrace_count_trigger(struct event_trigger_data *data)
983{
984 if (!data->count)
985 return;
986
987 if (data->count != -1)
988 (data->count)--;
989
990 stacktrace_trigger(data);
991}
992
993static int
994stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
995 struct event_trigger_data *data)
996{
997 return event_trigger_print("stacktrace", m, (void *)data->count,
998 data->filter_str);
999}
1000
1001static struct event_trigger_ops stacktrace_trigger_ops = {
1002 .func = stacktrace_trigger,
1003 .print = stacktrace_trigger_print,
1004 .init = event_trigger_init,
1005 .free = event_trigger_free,
1006};
1007
1008static struct event_trigger_ops stacktrace_count_trigger_ops = {
1009 .func = stacktrace_count_trigger,
1010 .print = stacktrace_trigger_print,
1011 .init = event_trigger_init,
1012 .free = event_trigger_free,
1013};
1014
1015static struct event_trigger_ops *
1016stacktrace_get_trigger_ops(char *cmd, char *param)
1017{
1018 return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops;
1019}
1020
1021static struct event_command trigger_stacktrace_cmd = {
1022 .name = "stacktrace",
1023 .trigger_type = ETT_STACKTRACE,
1024 .post_trigger = true,
1025 .func = event_trigger_callback,
1026 .reg = register_trigger,
1027 .unreg = unregister_trigger,
1028 .get_trigger_ops = stacktrace_get_trigger_ops,
1029 .set_filter = set_trigger_filter,
1030};
1031
1032static __init int register_trigger_stacktrace_cmd(void)
1033{
1034 int ret;
1035
1036 ret = register_event_command(&trigger_stacktrace_cmd);
1037 WARN_ON(ret < 0);
1038
1039 return ret;
1040}
1041#else
1042static __init int register_trigger_stacktrace_cmd(void) { return 0; }
1043#endif /* CONFIG_STACKTRACE */
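/*
 * "stacktrace" works the same way; writing "stacktrace:5" to an event's
 * trigger file (illustrative) dumps the kernel stack for the next five hits.
 * Because .post_trigger is set, the dump happens after the event itself has
 * been recorded, which is why STACK_SKIP above skips the post-call frames.
 */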
1044
1045static __init void unregister_trigger_traceon_traceoff_cmds(void)
1046{
1047 unregister_event_command(&trigger_traceon_cmd);
1048 unregister_event_command(&trigger_traceoff_cmd);
1049}
1050
1051/* Avoid typos */
1052#define ENABLE_EVENT_STR "enable_event"
1053#define DISABLE_EVENT_STR "disable_event"
1054
1055struct enable_trigger_data {
1056 struct ftrace_event_file *file;
1057 bool enable;
1058};
1059
1060static void
1061event_enable_trigger(struct event_trigger_data *data)
1062{
1063 struct enable_trigger_data *enable_data = data->private_data;
1064
1065 if (enable_data->enable)
1066 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
1067 else
1068 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
1069}
1070
1071static void
1072event_enable_count_trigger(struct event_trigger_data *data)
1073{
1074 struct enable_trigger_data *enable_data = data->private_data;
1075
1076 if (!data->count)
1077 return;
1078
1079 /* Skip if the event is in a state we want to switch to */
1080 if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
1081 return;
1082
1083 if (data->count != -1)
1084 (data->count)--;
1085
1086 event_enable_trigger(data);
1087}
1088
1089static int
1090event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
1091 struct event_trigger_data *data)
1092{
1093 struct enable_trigger_data *enable_data = data->private_data;
1094
1095 seq_printf(m, "%s:%s:%s",
1096 enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1097 enable_data->file->event_call->class->system,
1098 enable_data->file->event_call->name);
1099
1100 if (data->count == -1)
1101 seq_puts(m, ":unlimited");
1102 else
1103 seq_printf(m, ":count=%ld", data->count);
1104
1105 if (data->filter_str)
1106 seq_printf(m, " if %s\n", data->filter_str);
1107 else
1108 seq_puts(m, "\n");
1109
1110 return 0;
1111}
1112
1113static void
1114event_enable_trigger_free(struct event_trigger_ops *ops,
1115 struct event_trigger_data *data)
1116{
1117 struct enable_trigger_data *enable_data = data->private_data;
1118
1119 if (WARN_ON_ONCE(data->ref <= 0))
1120 return;
1121
1122 data->ref--;
1123 if (!data->ref) {
1124 /* Remove the SOFT_MODE flag */
1125 trace_event_enable_disable(enable_data->file, 0, 1);
1126 module_put(enable_data->file->event_call->mod);
1127 trigger_data_free(data);
1128 kfree(enable_data);
1129 }
1130}
1131
1132static struct event_trigger_ops event_enable_trigger_ops = {
1133 .func = event_enable_trigger,
1134 .print = event_enable_trigger_print,
1135 .init = event_trigger_init,
1136 .free = event_enable_trigger_free,
1137};
1138
1139static struct event_trigger_ops event_enable_count_trigger_ops = {
1140 .func = event_enable_count_trigger,
1141 .print = event_enable_trigger_print,
1142 .init = event_trigger_init,
1143 .free = event_enable_trigger_free,
1144};
1145
1146static struct event_trigger_ops event_disable_trigger_ops = {
1147 .func = event_enable_trigger,
1148 .print = event_enable_trigger_print,
1149 .init = event_trigger_init,
1150 .free = event_enable_trigger_free,
1151};
1152
1153static struct event_trigger_ops event_disable_count_trigger_ops = {
1154 .func = event_enable_count_trigger,
1155 .print = event_enable_trigger_print,
1156 .init = event_trigger_init,
1157 .free = event_enable_trigger_free,
1158};
1159
1160static int
1161event_enable_trigger_func(struct event_command *cmd_ops,
1162 struct ftrace_event_file *file,
1163 char *glob, char *cmd, char *param)
1164{
1165 struct ftrace_event_file *event_enable_file;
1166 struct enable_trigger_data *enable_data;
1167 struct event_trigger_data *trigger_data;
1168 struct event_trigger_ops *trigger_ops;
1169 struct trace_array *tr = file->tr;
1170 const char *system;
1171 const char *event;
1172 char *trigger;
1173 char *number;
1174 bool enable;
1175 int ret;
1176
1177 if (!param)
1178 return -EINVAL;
1179
1180 /* separate the trigger from the filter (s:e:n [if filter]) */
1181 trigger = strsep(&param, " \t");
1182 if (!trigger)
1183 return -EINVAL;
1184
1185 system = strsep(&trigger, ":");
1186 if (!trigger)
1187 return -EINVAL;
1188
1189 event = strsep(&trigger, ":");
1190
1191 ret = -EINVAL;
1192 event_enable_file = find_event_file(tr, system, event);
1193 if (!event_enable_file)
1194 goto out;
1195
1196 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
1197
1198 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
1199
1200 ret = -ENOMEM;
1201 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
1202 if (!trigger_data)
1203 goto out;
1204
1205 enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
1206 if (!enable_data) {
1207 kfree(trigger_data);
1208 goto out;
1209 }
1210
1211 trigger_data->count = -1;
1212 trigger_data->ops = trigger_ops;
1213 trigger_data->cmd_ops = cmd_ops;
1214 INIT_LIST_HEAD(&trigger_data->list);
1215 RCU_INIT_POINTER(trigger_data->filter, NULL);
1216
1217 enable_data->enable = enable;
1218 enable_data->file = event_enable_file;
1219 trigger_data->private_data = enable_data;
1220
1221 if (glob[0] == '!') {
1222 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
1223 kfree(trigger_data);
1224 kfree(enable_data);
1225 ret = 0;
1226 goto out;
1227 }
1228
1229 if (trigger) {
1230 number = strsep(&trigger, ":");
1231
1232 ret = -EINVAL;
1233 if (!strlen(number))
1234 goto out_free;
1235
1236 /*
1237 * We use the callback data field (which is a pointer)
1238 * as our counter.
1239 */
1240 ret = kstrtoul(number, 0, &trigger_data->count);
1241 if (ret)
1242 goto out_free;
1243 }
1244
1245 if (!param) /* if param is non-empty, it's supposed to be a filter */
1246 goto out_reg;
1247
1248 if (!cmd_ops->set_filter)
1249 goto out_reg;
1250
1251 ret = cmd_ops->set_filter(param, trigger_data, file);
1252 if (ret < 0)
1253 goto out_free;
1254
1255 out_reg:
1256 /* Don't let the target event's module unload while this trigger is registered */
1257 ret = try_module_get(event_enable_file->event_call->mod);
1258 if (!ret) {
1259 ret = -EBUSY;
1260 goto out_free;
1261 }
1262
1263 ret = trace_event_enable_disable(event_enable_file, 1, 1);
1264 if (ret < 0)
1265 goto out_put;
1266 ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
1267 /*
1268 * On success, the above returns the number of triggers registered,
1269 * but if it registered none it returns zero.
1270 * Consider registering no triggers a failure too.
1271 */
1272 if (!ret) {
1273 ret = -ENOENT;
1274 goto out_disable;
1275 } else if (ret < 0)
1276 goto out_disable;
1277 /* Just return zero, not the number of registered triggers */
1278 ret = 0;
1279 out:
1280 return ret;
1281
1282 out_disable:
1283 trace_event_enable_disable(event_enable_file, 0, 1);
1284 out_put:
1285 module_put(event_enable_file->event_call->mod);
1286 out_free:
1287 if (cmd_ops->set_filter)
1288 cmd_ops->set_filter(NULL, trigger_data, NULL);
1289 kfree(trigger_data);
1290 kfree(enable_data);
1291 goto out;
1292}
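/*
 * Putting the parsing above together, an enable_event trigger takes the form
 * "<cmd>:<system>:<event>[:count] [if filter]". For example, writing
 * "enable_event:kmem:kmalloc:3" to another event's trigger file
 * (illustrative, e.g. via the write_trigger() sketch earlier) soft-enables
 * kmem:kmalloc for the next three hits of that event, and prefixing the same
 * string with '!' removes the trigger again.
 */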
1293
1294static int event_enable_register_trigger(char *glob,
1295 struct event_trigger_ops *ops,
1296 struct event_trigger_data *data,
1297 struct ftrace_event_file *file)
1298{
1299 struct enable_trigger_data *enable_data = data->private_data;
1300 struct enable_trigger_data *test_enable_data;
1301 struct event_trigger_data *test;
1302 int ret = 0;
1303
1304 list_for_each_entry_rcu(test, &file->triggers, list) {
1305 test_enable_data = test->private_data;
1306 if (test_enable_data &&
1307 (test_enable_data->file == enable_data->file)) {
1308 ret = -EEXIST;
1309 goto out;
1310 }
1311 }
1312
1313 if (data->ops->init) {
1314 ret = data->ops->init(data->ops, data);
1315 if (ret < 0)
1316 goto out;
1317 }
1318
1319 list_add_rcu(&data->list, &file->triggers);
1320 ret++;
1321
1322 if (trace_event_trigger_enable_disable(file, 1) < 0) {
1323 list_del_rcu(&data->list);
1324 ret--;
1325 }
1326 update_cond_flag(file);
1327out:
1328 return ret;
1329}
1330
1331static void event_enable_unregister_trigger(char *glob,
1332 struct event_trigger_ops *ops,
1333 struct event_trigger_data *test,
1334 struct ftrace_event_file *file)
1335{
1336 struct enable_trigger_data *test_enable_data = test->private_data;
1337 struct enable_trigger_data *enable_data;
1338 struct event_trigger_data *data;
1339 bool unregistered = false;
1340
1341 list_for_each_entry_rcu(data, &file->triggers, list) {
1342 enable_data = data->private_data;
1343 if (enable_data &&
1344 (enable_data->file == test_enable_data->file)) {
1345 unregistered = true;
1346 list_del_rcu(&data->list);
1347 update_cond_flag(file);
1348 trace_event_trigger_enable_disable(file, 0);
1349 break;
1350 }
1351 }
1352
1353 if (unregistered && data->ops->free)
1354 data->ops->free(data->ops, data);
1355}
1356
1357static struct event_trigger_ops *
1358event_enable_get_trigger_ops(char *cmd, char *param)
1359{
1360 struct event_trigger_ops *ops;
1361 bool enable;
1362
1363 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
1364
1365 if (enable)
1366 ops = param ? &event_enable_count_trigger_ops :
1367 &event_enable_trigger_ops;
1368 else
1369 ops = param ? &event_disable_count_trigger_ops :
1370 &event_disable_trigger_ops;
1371
1372 return ops;
1373}
1374
1375static struct event_command trigger_enable_cmd = {
1376 .name = ENABLE_EVENT_STR,
1377 .trigger_type = ETT_EVENT_ENABLE,
1378 .func = event_enable_trigger_func,
1379 .reg = event_enable_register_trigger,
1380 .unreg = event_enable_unregister_trigger,
1381 .get_trigger_ops = event_enable_get_trigger_ops,
1382 .set_filter = set_trigger_filter,
1383};
1384
1385static struct event_command trigger_disable_cmd = {
1386 .name = DISABLE_EVENT_STR,
1387 .trigger_type = ETT_EVENT_ENABLE,
1388 .func = event_enable_trigger_func,
1389 .reg = event_enable_register_trigger,
1390 .unreg = event_enable_unregister_trigger,
1391 .get_trigger_ops = event_enable_get_trigger_ops,
1392 .set_filter = set_trigger_filter,
1393};
1394
1395static __init void unregister_trigger_enable_disable_cmds(void)
1396{
1397 unregister_event_command(&trigger_enable_cmd);
1398 unregister_event_command(&trigger_disable_cmd);
1399}
1400
1401static __init int register_trigger_enable_disable_cmds(void)
1402{
1403 int ret;
1404
1405 ret = register_event_command(&trigger_enable_cmd);
1406 if (WARN_ON(ret < 0))
1407 return ret;
1408 ret = register_event_command(&trigger_disable_cmd);
1409 if (WARN_ON(ret < 0))
1410 unregister_trigger_enable_disable_cmds();
1411
1412 return ret;
1413}
1414
1415static __init int register_trigger_traceon_traceoff_cmds(void)
1416{
1417 int ret;
1418
1419 ret = register_event_command(&trigger_traceon_cmd);
1420 if (WARN_ON(ret < 0))
1421 return ret;
1422 ret = register_event_command(&trigger_traceoff_cmd);
1423 if (WARN_ON(ret < 0))
1424 unregister_trigger_traceon_traceoff_cmds();
1425
1426 return ret;
1427}
1428
1429__init int register_trigger_cmds(void)
1430{
1431 register_trigger_traceon_traceoff_cmds();
1432 register_trigger_snapshot_cmd();
1433 register_trigger_stacktrace_cmd();
1434 register_trigger_enable_disable_cmds();
1435
1436 return 0;
1437}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index dae9541ada9e..bdbae450c13e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -27,18 +27,12 @@
27/** 27/**
28 * Kprobe event core functions 28 * Kprobe event core functions
29 */ 29 */
30struct trace_probe { 30struct trace_kprobe {
31 struct list_head list; 31 struct list_head list;
32 struct kretprobe rp; /* Use rp.kp for kprobe use */ 32 struct kretprobe rp; /* Use rp.kp for kprobe use */
33 unsigned long nhit; 33 unsigned long nhit;
34 unsigned int flags; /* For TP_FLAG_* */
35 const char *symbol; /* symbol name */ 34 const char *symbol; /* symbol name */
36 struct ftrace_event_class class; 35 struct trace_probe tp;
37 struct ftrace_event_call call;
38 struct list_head files;
39 ssize_t size; /* trace entry size */
40 unsigned int nr_args;
41 struct probe_arg args[];
42}; 36};
43 37
44struct event_file_link { 38struct event_file_link {
@@ -46,56 +40,46 @@ struct event_file_link {
46 struct list_head list; 40 struct list_head list;
47}; 41};
48 42
49#define SIZEOF_TRACE_PROBE(n) \ 43#define SIZEOF_TRACE_KPROBE(n) \
50 (offsetof(struct trace_probe, args) + \ 44 (offsetof(struct trace_kprobe, tp.args) + \
51 (sizeof(struct probe_arg) * (n))) 45 (sizeof(struct probe_arg) * (n)))
52 46
53 47
54static __kprobes bool trace_probe_is_return(struct trace_probe *tp) 48static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk)
55{ 49{
56 return tp->rp.handler != NULL; 50 return tk->rp.handler != NULL;
57} 51}
58 52
59static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) 53static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk)
60{ 54{
61 return tp->symbol ? tp->symbol : "unknown"; 55 return tk->symbol ? tk->symbol : "unknown";
62} 56}
63 57
64static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) 58static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
65{ 59{
66 return tp->rp.kp.offset; 60 return tk->rp.kp.offset;
67} 61}
68 62
69static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) 63static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk)
70{ 64{
71 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); 65 return !!(kprobe_gone(&tk->rp.kp));
72} 66}
73 67
74static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) 68static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
75{ 69 struct module *mod)
76 return !!(tp->flags & TP_FLAG_REGISTERED);
77}
78
79static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
80{
81 return !!(kprobe_gone(&tp->rp.kp));
82}
83
84static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
85 struct module *mod)
86{ 70{
87 int len = strlen(mod->name); 71 int len = strlen(mod->name);
88 const char *name = trace_probe_symbol(tp); 72 const char *name = trace_kprobe_symbol(tk);
89 return strncmp(mod->name, name, len) == 0 && name[len] == ':'; 73 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
90} 74}
91 75
92static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) 76static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
93{ 77{
94 return !!strchr(trace_probe_symbol(tp), ':'); 78 return !!strchr(trace_kprobe_symbol(tk), ':');
95} 79}
96 80
97static int register_probe_event(struct trace_probe *tp); 81static int register_kprobe_event(struct trace_kprobe *tk);
98static int unregister_probe_event(struct trace_probe *tp); 82static int unregister_kprobe_event(struct trace_kprobe *tk);
99 83
100static DEFINE_MUTEX(probe_lock); 84static DEFINE_MUTEX(probe_lock);
101static LIST_HEAD(probe_list); 85static LIST_HEAD(probe_list);
@@ -104,45 +88,224 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
104static int kretprobe_dispatcher(struct kretprobe_instance *ri, 88static int kretprobe_dispatcher(struct kretprobe_instance *ri,
105 struct pt_regs *regs); 89 struct pt_regs *regs);
106 90
91/* Memory fetching by symbol */
92struct symbol_cache {
93 char *symbol;
94 long offset;
95 unsigned long addr;
96};
97
98unsigned long update_symbol_cache(struct symbol_cache *sc)
99{
100 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
101
102 if (sc->addr)
103 sc->addr += sc->offset;
104
105 return sc->addr;
106}
107
108void free_symbol_cache(struct symbol_cache *sc)
109{
110 kfree(sc->symbol);
111 kfree(sc);
112}
113
114struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
115{
116 struct symbol_cache *sc;
117
118 if (!sym || strlen(sym) == 0)
119 return NULL;
120
121 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
122 if (!sc)
123 return NULL;
124
125 sc->symbol = kstrdup(sym, GFP_KERNEL);
126 if (!sc->symbol) {
127 kfree(sc);
128 return NULL;
129 }
130 sc->offset = offset;
131 update_symbol_cache(sc);
132
133 return sc;
134}
135
136/*
137 * Kprobes-specific fetch functions
138 */
139#define DEFINE_FETCH_stack(type) \
140static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
141 void *offset, void *dest) \
142{ \
143 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
144 (unsigned int)((unsigned long)offset)); \
145}
146DEFINE_BASIC_FETCH_FUNCS(stack)
147/* No string on the stack entry */
148#define fetch_stack_string NULL
149#define fetch_stack_string_size NULL
150
151#define DEFINE_FETCH_memory(type) \
152static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
153 void *addr, void *dest) \
154{ \
155 type retval; \
156 if (probe_kernel_address(addr, retval)) \
157 *(type *)dest = 0; \
158 else \
159 *(type *)dest = retval; \
160}
161DEFINE_BASIC_FETCH_FUNCS(memory)
162/*
163 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
164 * length and relative data location.
165 */
166static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
167 void *addr, void *dest)
168{
169 long ret;
170 int maxlen = get_rloc_len(*(u32 *)dest);
171 u8 *dst = get_rloc_data(dest);
172 u8 *src = addr;
173 mm_segment_t old_fs = get_fs();
174
175 if (!maxlen)
176 return;
177
178 /*
179 * Try to get string again, since the string can be changed while
180 * probing.
181 */
182 set_fs(KERNEL_DS);
183 pagefault_disable();
184
185 do
186 ret = __copy_from_user_inatomic(dst++, src++, 1);
187 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
188
189 dst[-1] = '\0';
190 pagefault_enable();
191 set_fs(old_fs);
192
193 if (ret < 0) { /* Failed to fetch string */
194 ((u8 *)get_rloc_data(dest))[0] = '\0';
195 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
196 } else {
197 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
198 get_rloc_offs(*(u32 *)dest));
199 }
200}
201
202/* Return the length of the string, including the terminating null byte */
203static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
204 void *addr, void *dest)
205{
206 mm_segment_t old_fs;
207 int ret, len = 0;
208 u8 c;
209
210 old_fs = get_fs();
211 set_fs(KERNEL_DS);
212 pagefault_disable();
213
214 do {
215 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
216 len++;
217 } while (c && ret == 0 && len < MAX_STRING_SIZE);
218
219 pagefault_enable();
220 set_fs(old_fs);
221
222 if (ret < 0) /* Failed to check the length */
223 *(u32 *)dest = 0;
224 else
225 *(u32 *)dest = len;
226}
227
228#define DEFINE_FETCH_symbol(type) \
229__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \
230 void *data, void *dest) \
231{ \
232 struct symbol_cache *sc = data; \
233 if (sc->addr) \
234 fetch_memory_##type(regs, (void *)sc->addr, dest); \
235 else \
236 *(type *)dest = 0; \
237}
238DEFINE_BASIC_FETCH_FUNCS(symbol)
239DEFINE_FETCH_symbol(string)
240DEFINE_FETCH_symbol(string_size)
241
242/* kprobes don't support file_offset fetch methods */
243#define fetch_file_offset_u8 NULL
244#define fetch_file_offset_u16 NULL
245#define fetch_file_offset_u32 NULL
246#define fetch_file_offset_u64 NULL
247#define fetch_file_offset_string NULL
248#define fetch_file_offset_string_size NULL
249
250/* Fetch type information table */
251const struct fetch_type kprobes_fetch_type_table[] = {
252 /* Special types */
253 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
254 sizeof(u32), 1, "__data_loc char[]"),
255 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
256 string_size, sizeof(u32), 0, "u32"),
257 /* Basic types */
258 ASSIGN_FETCH_TYPE(u8, u8, 0),
259 ASSIGN_FETCH_TYPE(u16, u16, 0),
260 ASSIGN_FETCH_TYPE(u32, u32, 0),
261 ASSIGN_FETCH_TYPE(u64, u64, 0),
262 ASSIGN_FETCH_TYPE(s8, u8, 1),
263 ASSIGN_FETCH_TYPE(s16, u16, 1),
264 ASSIGN_FETCH_TYPE(s32, u32, 1),
265 ASSIGN_FETCH_TYPE(s64, u64, 1),
266
267 ASSIGN_FETCH_TYPE_END
268};
269
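/*
 * kprobes_fetch_type_table is what gives a kprobe argument its optional
 * ":TYPE" suffix: plain integers map to the basic u8..s64 entries above,
 * while ":string" pairs the string fetch with the string_size fetch so the
 * dynamic data length can be reserved before the copy. The file_offset
 * methods are stubbed out to NULL because they only make sense for uprobes.
 */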
107/* 270/*
108 * Allocate new trace_probe and initialize it (including kprobes). 271 * Allocate new trace_probe and initialize it (including kprobes).
109 */ 272 */
110static struct trace_probe *alloc_trace_probe(const char *group, 273static struct trace_kprobe *alloc_trace_kprobe(const char *group,
111 const char *event, 274 const char *event,
112 void *addr, 275 void *addr,
113 const char *symbol, 276 const char *symbol,
114 unsigned long offs, 277 unsigned long offs,
115 int nargs, bool is_return) 278 int nargs, bool is_return)
116{ 279{
117 struct trace_probe *tp; 280 struct trace_kprobe *tk;
118 int ret = -ENOMEM; 281 int ret = -ENOMEM;
119 282
120 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); 283 tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL);
121 if (!tp) 284 if (!tk)
122 return ERR_PTR(ret); 285 return ERR_PTR(ret);
123 286
124 if (symbol) { 287 if (symbol) {
125 tp->symbol = kstrdup(symbol, GFP_KERNEL); 288 tk->symbol = kstrdup(symbol, GFP_KERNEL);
126 if (!tp->symbol) 289 if (!tk->symbol)
127 goto error; 290 goto error;
128 tp->rp.kp.symbol_name = tp->symbol; 291 tk->rp.kp.symbol_name = tk->symbol;
129 tp->rp.kp.offset = offs; 292 tk->rp.kp.offset = offs;
130 } else 293 } else
131 tp->rp.kp.addr = addr; 294 tk->rp.kp.addr = addr;
132 295
133 if (is_return) 296 if (is_return)
134 tp->rp.handler = kretprobe_dispatcher; 297 tk->rp.handler = kretprobe_dispatcher;
135 else 298 else
136 tp->rp.kp.pre_handler = kprobe_dispatcher; 299 tk->rp.kp.pre_handler = kprobe_dispatcher;
137 300
138 if (!event || !is_good_name(event)) { 301 if (!event || !is_good_name(event)) {
139 ret = -EINVAL; 302 ret = -EINVAL;
140 goto error; 303 goto error;
141 } 304 }
142 305
143 tp->call.class = &tp->class; 306 tk->tp.call.class = &tk->tp.class;
144 tp->call.name = kstrdup(event, GFP_KERNEL); 307 tk->tp.call.name = kstrdup(event, GFP_KERNEL);
145 if (!tp->call.name) 308 if (!tk->tp.call.name)
146 goto error; 309 goto error;
147 310
148 if (!group || !is_good_name(group)) { 311 if (!group || !is_good_name(group)) {
@@ -150,42 +313,42 @@ static struct trace_probe *alloc_trace_probe(const char *group,
150 goto error; 313 goto error;
151 } 314 }
152 315
153 tp->class.system = kstrdup(group, GFP_KERNEL); 316 tk->tp.class.system = kstrdup(group, GFP_KERNEL);
154 if (!tp->class.system) 317 if (!tk->tp.class.system)
155 goto error; 318 goto error;
156 319
157 INIT_LIST_HEAD(&tp->list); 320 INIT_LIST_HEAD(&tk->list);
158 INIT_LIST_HEAD(&tp->files); 321 INIT_LIST_HEAD(&tk->tp.files);
159 return tp; 322 return tk;
160error: 323error:
161 kfree(tp->call.name); 324 kfree(tk->tp.call.name);
162 kfree(tp->symbol); 325 kfree(tk->symbol);
163 kfree(tp); 326 kfree(tk);
164 return ERR_PTR(ret); 327 return ERR_PTR(ret);
165} 328}
166 329
167static void free_trace_probe(struct trace_probe *tp) 330static void free_trace_kprobe(struct trace_kprobe *tk)
168{ 331{
169 int i; 332 int i;
170 333
171 for (i = 0; i < tp->nr_args; i++) 334 for (i = 0; i < tk->tp.nr_args; i++)
172 traceprobe_free_probe_arg(&tp->args[i]); 335 traceprobe_free_probe_arg(&tk->tp.args[i]);
173 336
174 kfree(tp->call.class->system); 337 kfree(tk->tp.call.class->system);
175 kfree(tp->call.name); 338 kfree(tk->tp.call.name);
176 kfree(tp->symbol); 339 kfree(tk->symbol);
177 kfree(tp); 340 kfree(tk);
178} 341}
179 342
180static struct trace_probe *find_trace_probe(const char *event, 343static struct trace_kprobe *find_trace_kprobe(const char *event,
181 const char *group) 344 const char *group)
182{ 345{
183 struct trace_probe *tp; 346 struct trace_kprobe *tk;
184 347
185 list_for_each_entry(tp, &probe_list, list) 348 list_for_each_entry(tk, &probe_list, list)
186 if (strcmp(tp->call.name, event) == 0 && 349 if (strcmp(tk->tp.call.name, event) == 0 &&
187 strcmp(tp->call.class->system, group) == 0) 350 strcmp(tk->tp.call.class->system, group) == 0)
188 return tp; 351 return tk;
189 return NULL; 352 return NULL;
190} 353}
191 354
@@ -194,7 +357,7 @@ static struct trace_probe *find_trace_probe(const char *event,
194 * if the file is NULL, enable "perf" handler, or enable "trace" handler. 357 * if the file is NULL, enable "perf" handler, or enable "trace" handler.
195 */ 358 */
196static int 359static int
197enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 360enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
198{ 361{
199 int ret = 0; 362 int ret = 0;
200 363
@@ -208,17 +371,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
208 } 371 }
209 372
210 link->file = file; 373 link->file = file;
211 list_add_tail_rcu(&link->list, &tp->files); 374 list_add_tail_rcu(&link->list, &tk->tp.files);
212 375
213 tp->flags |= TP_FLAG_TRACE; 376 tk->tp.flags |= TP_FLAG_TRACE;
214 } else 377 } else
215 tp->flags |= TP_FLAG_PROFILE; 378 tk->tp.flags |= TP_FLAG_PROFILE;
216 379
217 if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { 380 if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
218 if (trace_probe_is_return(tp)) 381 if (trace_kprobe_is_return(tk))
219 ret = enable_kretprobe(&tp->rp); 382 ret = enable_kretprobe(&tk->rp);
220 else 383 else
221 ret = enable_kprobe(&tp->rp.kp); 384 ret = enable_kprobe(&tk->rp.kp);
222 } 385 }
223 out: 386 out:
224 return ret; 387 return ret;
@@ -241,14 +404,14 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
241 * if the file is NULL, disable "perf" handler, or disable "trace" handler. 404 * if the file is NULL, disable "perf" handler, or disable "trace" handler.
242 */ 405 */
243static int 406static int
244disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 407disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
245{ 408{
246 struct event_file_link *link = NULL; 409 struct event_file_link *link = NULL;
247 int wait = 0; 410 int wait = 0;
248 int ret = 0; 411 int ret = 0;
249 412
250 if (file) { 413 if (file) {
251 link = find_event_file_link(tp, file); 414 link = find_event_file_link(&tk->tp, file);
252 if (!link) { 415 if (!link) {
253 ret = -EINVAL; 416 ret = -EINVAL;
254 goto out; 417 goto out;
@@ -256,18 +419,18 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
256 419
257 list_del_rcu(&link->list); 420 list_del_rcu(&link->list);
258 wait = 1; 421 wait = 1;
259 if (!list_empty(&tp->files)) 422 if (!list_empty(&tk->tp.files))
260 goto out; 423 goto out;
261 424
262 tp->flags &= ~TP_FLAG_TRACE; 425 tk->tp.flags &= ~TP_FLAG_TRACE;
263 } else 426 } else
264 tp->flags &= ~TP_FLAG_PROFILE; 427 tk->tp.flags &= ~TP_FLAG_PROFILE;
265 428
266 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { 429 if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) {
267 if (trace_probe_is_return(tp)) 430 if (trace_kprobe_is_return(tk))
268 disable_kretprobe(&tp->rp); 431 disable_kretprobe(&tk->rp);
269 else 432 else
270 disable_kprobe(&tp->rp.kp); 433 disable_kprobe(&tk->rp.kp);
271 wait = 1; 434 wait = 1;
272 } 435 }
273 out: 436 out:
@@ -288,40 +451,40 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
288} 451}
289 452
290/* Internal register function - just handle k*probes and flags */ 453/* Internal register function - just handle k*probes and flags */
291static int __register_trace_probe(struct trace_probe *tp) 454static int __register_trace_kprobe(struct trace_kprobe *tk)
292{ 455{
293 int i, ret; 456 int i, ret;
294 457
295 if (trace_probe_is_registered(tp)) 458 if (trace_probe_is_registered(&tk->tp))
296 return -EINVAL; 459 return -EINVAL;
297 460
298 for (i = 0; i < tp->nr_args; i++) 461 for (i = 0; i < tk->tp.nr_args; i++)
299 traceprobe_update_arg(&tp->args[i]); 462 traceprobe_update_arg(&tk->tp.args[i]);
300 463
301 /* Set/clear disabled flag according to tp->flag */ 464 /* Set/clear disabled flag according to tp->flag */
302 if (trace_probe_is_enabled(tp)) 465 if (trace_probe_is_enabled(&tk->tp))
303 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; 466 tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
304 else 467 else
305 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 468 tk->rp.kp.flags |= KPROBE_FLAG_DISABLED;
306 469
307 if (trace_probe_is_return(tp)) 470 if (trace_kprobe_is_return(tk))
308 ret = register_kretprobe(&tp->rp); 471 ret = register_kretprobe(&tk->rp);
309 else 472 else
310 ret = register_kprobe(&tp->rp.kp); 473 ret = register_kprobe(&tk->rp.kp);
311 474
312 if (ret == 0) 475 if (ret == 0)
313 tp->flags |= TP_FLAG_REGISTERED; 476 tk->tp.flags |= TP_FLAG_REGISTERED;
314 else { 477 else {
315 pr_warning("Could not insert probe at %s+%lu: %d\n", 478 pr_warning("Could not insert probe at %s+%lu: %d\n",
316 trace_probe_symbol(tp), trace_probe_offset(tp), ret); 479 trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
317 if (ret == -ENOENT && trace_probe_is_on_module(tp)) { 480 if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
318 pr_warning("This probe might be able to register after" 481 pr_warning("This probe might be able to register after"
319 "target module is loaded. Continue.\n"); 482 "target module is loaded. Continue.\n");
320 ret = 0; 483 ret = 0;
321 } else if (ret == -EILSEQ) { 484 } else if (ret == -EILSEQ) {
322 pr_warning("Probing address(0x%p) is not an " 485 pr_warning("Probing address(0x%p) is not an "
323 "instruction boundary.\n", 486 "instruction boundary.\n",
324 tp->rp.kp.addr); 487 tk->rp.kp.addr);
325 ret = -EINVAL; 488 ret = -EINVAL;
326 } 489 }
327 } 490 }
@@ -330,67 +493,67 @@ static int __register_trace_probe(struct trace_probe *tp)
330} 493}
331 494
332/* Internal unregister function - just handle k*probes and flags */ 495/* Internal unregister function - just handle k*probes and flags */
333static void __unregister_trace_probe(struct trace_probe *tp) 496static void __unregister_trace_kprobe(struct trace_kprobe *tk)
334{ 497{
335 if (trace_probe_is_registered(tp)) { 498 if (trace_probe_is_registered(&tk->tp)) {
336 if (trace_probe_is_return(tp)) 499 if (trace_kprobe_is_return(tk))
337 unregister_kretprobe(&tp->rp); 500 unregister_kretprobe(&tk->rp);
338 else 501 else
339 unregister_kprobe(&tp->rp.kp); 502 unregister_kprobe(&tk->rp.kp);
340 tp->flags &= ~TP_FLAG_REGISTERED; 503 tk->tp.flags &= ~TP_FLAG_REGISTERED;
341 /* Cleanup kprobe for reuse */ 504 /* Cleanup kprobe for reuse */
342 if (tp->rp.kp.symbol_name) 505 if (tk->rp.kp.symbol_name)
343 tp->rp.kp.addr = NULL; 506 tk->rp.kp.addr = NULL;
344 } 507 }
345} 508}
346 509
347/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 510/* Unregister a trace_probe and probe_event: call with locking probe_lock */
348static int unregister_trace_probe(struct trace_probe *tp) 511static int unregister_trace_kprobe(struct trace_kprobe *tk)
349{ 512{
350 /* Enabled event can not be unregistered */ 513 /* Enabled event can not be unregistered */
351 if (trace_probe_is_enabled(tp)) 514 if (trace_probe_is_enabled(&tk->tp))
352 return -EBUSY; 515 return -EBUSY;
353 516
354 /* Will fail if probe is being used by ftrace or perf */ 517 /* Will fail if probe is being used by ftrace or perf */
355 if (unregister_probe_event(tp)) 518 if (unregister_kprobe_event(tk))
356 return -EBUSY; 519 return -EBUSY;
357 520
358 __unregister_trace_probe(tp); 521 __unregister_trace_kprobe(tk);
359 list_del(&tp->list); 522 list_del(&tk->list);
360 523
361 return 0; 524 return 0;
362} 525}
363 526
364/* Register a trace_probe and probe_event */ 527/* Register a trace_probe and probe_event */
365static int register_trace_probe(struct trace_probe *tp) 528static int register_trace_kprobe(struct trace_kprobe *tk)
366{ 529{
367 struct trace_probe *old_tp; 530 struct trace_kprobe *old_tk;
368 int ret; 531 int ret;
369 532
370 mutex_lock(&probe_lock); 533 mutex_lock(&probe_lock);
371 534
372 /* Delete old (same name) event if exist */ 535 /* Delete old (same name) event if exist */
373 old_tp = find_trace_probe(tp->call.name, tp->call.class->system); 536 old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system);
374 if (old_tp) { 537 if (old_tk) {
375 ret = unregister_trace_probe(old_tp); 538 ret = unregister_trace_kprobe(old_tk);
376 if (ret < 0) 539 if (ret < 0)
377 goto end; 540 goto end;
378 free_trace_probe(old_tp); 541 free_trace_kprobe(old_tk);
379 } 542 }
380 543
381 /* Register new event */ 544 /* Register new event */
382 ret = register_probe_event(tp); 545 ret = register_kprobe_event(tk);
383 if (ret) { 546 if (ret) {
384 pr_warning("Failed to register probe event(%d)\n", ret); 547 pr_warning("Failed to register probe event(%d)\n", ret);
385 goto end; 548 goto end;
386 } 549 }
387 550
388 /* Register k*probe */ 551 /* Register k*probe */
389 ret = __register_trace_probe(tp); 552 ret = __register_trace_kprobe(tk);
390 if (ret < 0) 553 if (ret < 0)
391 unregister_probe_event(tp); 554 unregister_kprobe_event(tk);
392 else 555 else
393 list_add_tail(&tp->list, &probe_list); 556 list_add_tail(&tk->list, &probe_list);
394 557
395end: 558end:
396 mutex_unlock(&probe_lock); 559 mutex_unlock(&probe_lock);
@@ -398,11 +561,11 @@ end:
398} 561}
399 562
400/* Module notifier call back, checking event on the module */ 563/* Module notifier call back, checking event on the module */
401static int trace_probe_module_callback(struct notifier_block *nb, 564static int trace_kprobe_module_callback(struct notifier_block *nb,
402 unsigned long val, void *data) 565 unsigned long val, void *data)
403{ 566{
404 struct module *mod = data; 567 struct module *mod = data;
405 struct trace_probe *tp; 568 struct trace_kprobe *tk;
406 int ret; 569 int ret;
407 570
408 if (val != MODULE_STATE_COMING) 571 if (val != MODULE_STATE_COMING)
@@ -410,15 +573,15 @@ static int trace_probe_module_callback(struct notifier_block *nb,
410 573
411 /* Update probes on coming module */ 574 /* Update probes on coming module */
412 mutex_lock(&probe_lock); 575 mutex_lock(&probe_lock);
413 list_for_each_entry(tp, &probe_list, list) { 576 list_for_each_entry(tk, &probe_list, list) {
414 if (trace_probe_within_module(tp, mod)) { 577 if (trace_kprobe_within_module(tk, mod)) {
415 /* Don't need to check busy - this should have gone. */ 578 /* Don't need to check busy - this should have gone. */
416 __unregister_trace_probe(tp); 579 __unregister_trace_kprobe(tk);
417 ret = __register_trace_probe(tp); 580 ret = __register_trace_kprobe(tk);
418 if (ret) 581 if (ret)
419 pr_warning("Failed to re-register probe %s on" 582 pr_warning("Failed to re-register probe %s on"
420 "%s: %d\n", 583 "%s: %d\n",
421 tp->call.name, mod->name, ret); 584 tk->tp.call.name, mod->name, ret);
422 } 585 }
423 } 586 }
424 mutex_unlock(&probe_lock); 587 mutex_unlock(&probe_lock);
@@ -426,12 +589,12 @@ static int trace_probe_module_callback(struct notifier_block *nb,
426 return NOTIFY_DONE; 589 return NOTIFY_DONE;
427} 590}
428 591
429static struct notifier_block trace_probe_module_nb = { 592static struct notifier_block trace_kprobe_module_nb = {
430 .notifier_call = trace_probe_module_callback, 593 .notifier_call = trace_kprobe_module_callback,
431 .priority = 1 /* Invoked after kprobe module callback */ 594 .priority = 1 /* Invoked after kprobe module callback */
432}; 595};
433 596
434static int create_trace_probe(int argc, char **argv) 597static int create_trace_kprobe(int argc, char **argv)
435{ 598{
436 /* 599 /*
437 * Argument syntax: 600 * Argument syntax:
@@ -451,7 +614,7 @@ static int create_trace_probe(int argc, char **argv)
451 * Type of args: 614 * Type of args:
452 * FETCHARG:TYPE : use TYPE instead of unsigned long. 615 * FETCHARG:TYPE : use TYPE instead of unsigned long.
453 */ 616 */
454 struct trace_probe *tp; 617 struct trace_kprobe *tk;
455 int i, ret = 0; 618 int i, ret = 0;
456 bool is_return = false, is_delete = false; 619 bool is_return = false, is_delete = false;
457 char *symbol = NULL, *event = NULL, *group = NULL; 620 char *symbol = NULL, *event = NULL, *group = NULL;
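/*
 * create_trace_kprobe() is reached by writing a probe definition to the
 * tracefs "kprobe_events" file. A minimal userspace sketch, assuming an
 * x86-64 machine with tracing mounted at /sys/kernel/debug/tracing; the
 * event name "myprobe" and the register choices are illustrative only.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* "+0(%si):string" uses the string fetch type from
	 * kprobes_fetch_type_table to copy the filename argument. */
	const char *def =
		"p:myprobe do_sys_open dfd=%di filename=+0(%si):string\n";
	int fd;

	fd = open("/sys/kernel/debug/tracing/kprobe_events",
		  O_WRONLY | O_APPEND);
	if (fd < 0)
		return 1;
	if (write(fd, def, strlen(def)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);

	/* Enabling the new event ends up in enable_trace_kprobe(). */
	fd = open("/sys/kernel/debug/tracing/events/kprobes/myprobe/enable",
		  O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "1", 1);
	close(fd);
	return 0;
}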
@@ -498,16 +661,16 @@ static int create_trace_probe(int argc, char **argv)
498 return -EINVAL; 661 return -EINVAL;
499 } 662 }
500 mutex_lock(&probe_lock); 663 mutex_lock(&probe_lock);
501 tp = find_trace_probe(event, group); 664 tk = find_trace_kprobe(event, group);
502 if (!tp) { 665 if (!tk) {
503 mutex_unlock(&probe_lock); 666 mutex_unlock(&probe_lock);
504 pr_info("Event %s/%s doesn't exist.\n", group, event); 667 pr_info("Event %s/%s doesn't exist.\n", group, event);
505 return -ENOENT; 668 return -ENOENT;
506 } 669 }
507 /* delete an event */ 670 /* delete an event */
508 ret = unregister_trace_probe(tp); 671 ret = unregister_trace_kprobe(tk);
509 if (ret == 0) 672 if (ret == 0)
510 free_trace_probe(tp); 673 free_trace_kprobe(tk);
511 mutex_unlock(&probe_lock); 674 mutex_unlock(&probe_lock);
512 return ret; 675 return ret;
513 } 676 }
@@ -554,47 +717,49 @@ static int create_trace_probe(int argc, char **argv)
554 is_return ? 'r' : 'p', addr); 717 is_return ? 'r' : 'p', addr);
555 event = buf; 718 event = buf;
556 } 719 }
557 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, 720 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
558 is_return); 721 is_return);
559 if (IS_ERR(tp)) { 722 if (IS_ERR(tk)) {
560 pr_info("Failed to allocate trace_probe.(%d)\n", 723 pr_info("Failed to allocate trace_probe.(%d)\n",
561 (int)PTR_ERR(tp)); 724 (int)PTR_ERR(tk));
562 return PTR_ERR(tp); 725 return PTR_ERR(tk);
563 } 726 }
564 727
565 /* parse arguments */ 728 /* parse arguments */
566 ret = 0; 729 ret = 0;
567 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 730 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
731 struct probe_arg *parg = &tk->tp.args[i];
732
568 /* Increment count for freeing args in error case */ 733 /* Increment count for freeing args in error case */
569 tp->nr_args++; 734 tk->tp.nr_args++;
570 735
571 /* Parse argument name */ 736 /* Parse argument name */
572 arg = strchr(argv[i], '='); 737 arg = strchr(argv[i], '=');
573 if (arg) { 738 if (arg) {
574 *arg++ = '\0'; 739 *arg++ = '\0';
575 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 740 parg->name = kstrdup(argv[i], GFP_KERNEL);
576 } else { 741 } else {
577 arg = argv[i]; 742 arg = argv[i];
578 /* If argument name is omitted, set "argN" */ 743 /* If argument name is omitted, set "argN" */
579 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); 744 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
580 tp->args[i].name = kstrdup(buf, GFP_KERNEL); 745 parg->name = kstrdup(buf, GFP_KERNEL);
581 } 746 }
582 747
583 if (!tp->args[i].name) { 748 if (!parg->name) {
584 pr_info("Failed to allocate argument[%d] name.\n", i); 749 pr_info("Failed to allocate argument[%d] name.\n", i);
585 ret = -ENOMEM; 750 ret = -ENOMEM;
586 goto error; 751 goto error;
587 } 752 }
588 753
589 if (!is_good_name(tp->args[i].name)) { 754 if (!is_good_name(parg->name)) {
590 pr_info("Invalid argument[%d] name: %s\n", 755 pr_info("Invalid argument[%d] name: %s\n",
591 i, tp->args[i].name); 756 i, parg->name);
592 ret = -EINVAL; 757 ret = -EINVAL;
593 goto error; 758 goto error;
594 } 759 }
595 760
596 if (traceprobe_conflict_field_name(tp->args[i].name, 761 if (traceprobe_conflict_field_name(parg->name,
597 tp->args, i)) { 762 tk->tp.args, i)) {
598 pr_info("Argument[%d] name '%s' conflicts with " 763 pr_info("Argument[%d] name '%s' conflicts with "
599 "another field.\n", i, argv[i]); 764 "another field.\n", i, argv[i]);
600 ret = -EINVAL; 765 ret = -EINVAL;
@@ -602,7 +767,7 @@ static int create_trace_probe(int argc, char **argv)
602 } 767 }
603 768
604 /* Parse fetch argument */ 769 /* Parse fetch argument */
605 ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], 770 ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
606 is_return, true); 771 is_return, true);
607 if (ret) { 772 if (ret) {
608 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 773 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
@@ -610,35 +775,35 @@ static int create_trace_probe(int argc, char **argv)
610 } 775 }
611 } 776 }
612 777
613 ret = register_trace_probe(tp); 778 ret = register_trace_kprobe(tk);
614 if (ret) 779 if (ret)
615 goto error; 780 goto error;
616 return 0; 781 return 0;
617 782
618error: 783error:
619 free_trace_probe(tp); 784 free_trace_kprobe(tk);
620 return ret; 785 return ret;
621} 786}
622 787
623static int release_all_trace_probes(void) 788static int release_all_trace_kprobes(void)
624{ 789{
625 struct trace_probe *tp; 790 struct trace_kprobe *tk;
626 int ret = 0; 791 int ret = 0;
627 792
628 mutex_lock(&probe_lock); 793 mutex_lock(&probe_lock);
629 /* Ensure no probe is in use. */ 794 /* Ensure no probe is in use. */
630 list_for_each_entry(tp, &probe_list, list) 795 list_for_each_entry(tk, &probe_list, list)
631 if (trace_probe_is_enabled(tp)) { 796 if (trace_probe_is_enabled(&tk->tp)) {
632 ret = -EBUSY; 797 ret = -EBUSY;
633 goto end; 798 goto end;
634 } 799 }
635 /* TODO: Use batch unregistration */ 800 /* TODO: Use batch unregistration */
636 while (!list_empty(&probe_list)) { 801 while (!list_empty(&probe_list)) {
637 tp = list_entry(probe_list.next, struct trace_probe, list); 802 tk = list_entry(probe_list.next, struct trace_kprobe, list);
638 ret = unregister_trace_probe(tp); 803 ret = unregister_trace_kprobe(tk);
639 if (ret) 804 if (ret)
640 goto end; 805 goto end;
641 free_trace_probe(tp); 806 free_trace_kprobe(tk);
642 } 807 }
643 808
644end: 809end:
@@ -666,22 +831,22 @@ static void probes_seq_stop(struct seq_file *m, void *v)
666 831
667static int probes_seq_show(struct seq_file *m, void *v) 832static int probes_seq_show(struct seq_file *m, void *v)
668{ 833{
669 struct trace_probe *tp = v; 834 struct trace_kprobe *tk = v;
670 int i; 835 int i;
671 836
672 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); 837 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
673 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 838 seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name);
674 839
675 if (!tp->symbol) 840 if (!tk->symbol)
676 seq_printf(m, " 0x%p", tp->rp.kp.addr); 841 seq_printf(m, " 0x%p", tk->rp.kp.addr);
677 else if (tp->rp.kp.offset) 842 else if (tk->rp.kp.offset)
678 seq_printf(m, " %s+%u", trace_probe_symbol(tp), 843 seq_printf(m, " %s+%u", trace_kprobe_symbol(tk),
679 tp->rp.kp.offset); 844 tk->rp.kp.offset);
680 else 845 else
681 seq_printf(m, " %s", trace_probe_symbol(tp)); 846 seq_printf(m, " %s", trace_kprobe_symbol(tk));
682 847
683 for (i = 0; i < tp->nr_args; i++) 848 for (i = 0; i < tk->tp.nr_args; i++)
684 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 849 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
685 seq_printf(m, "\n"); 850 seq_printf(m, "\n");
686 851
687 return 0; 852 return 0;
@@ -699,7 +864,7 @@ static int probes_open(struct inode *inode, struct file *file)
699 int ret; 864 int ret;
700 865
701 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { 866 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
702 ret = release_all_trace_probes(); 867 ret = release_all_trace_kprobes();
703 if (ret < 0) 868 if (ret < 0)
704 return ret; 869 return ret;
705 } 870 }
@@ -711,7 +876,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer,
711 size_t count, loff_t *ppos) 876 size_t count, loff_t *ppos)
712{ 877{
713 return traceprobe_probes_write(file, buffer, count, ppos, 878 return traceprobe_probes_write(file, buffer, count, ppos,
714 create_trace_probe); 879 create_trace_kprobe);
715} 880}
716 881
717static const struct file_operations kprobe_events_ops = { 882static const struct file_operations kprobe_events_ops = {
@@ -726,10 +891,10 @@ static const struct file_operations kprobe_events_ops = {
726/* Probes profiling interfaces */ 891/* Probes profiling interfaces */
727static int probes_profile_seq_show(struct seq_file *m, void *v) 892static int probes_profile_seq_show(struct seq_file *m, void *v)
728{ 893{
729 struct trace_probe *tp = v; 894 struct trace_kprobe *tk = v;
730 895
731 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, 896 seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit,
732 tp->rp.kp.nmissed); 897 tk->rp.kp.nmissed);
733 898
734 return 0; 899 return 0;
735} 900}
@@ -754,57 +919,9 @@ static const struct file_operations kprobe_profile_ops = {
754 .release = seq_release, 919 .release = seq_release,
755}; 920};
756 921
757/* Sum up total data length for dynamic arrays (strings) */
758static __kprobes int __get_data_size(struct trace_probe *tp,
759 struct pt_regs *regs)
760{
761 int i, ret = 0;
762 u32 len;
763
764 for (i = 0; i < tp->nr_args; i++)
765 if (unlikely(tp->args[i].fetch_size.fn)) {
766 call_fetch(&tp->args[i].fetch_size, regs, &len);
767 ret += len;
768 }
769
770 return ret;
771}
772
773/* Store the value of each argument */
774static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
775 struct pt_regs *regs,
776 u8 *data, int maxlen)
777{
778 int i;
779 u32 end = tp->size;
780 u32 *dl; /* Data (relative) location */
781
782 for (i = 0; i < tp->nr_args; i++) {
783 if (unlikely(tp->args[i].fetch_size.fn)) {
784 /*
785 * First, we set the relative location and
786 * maximum data length to *dl
787 */
788 dl = (u32 *)(data + tp->args[i].offset);
789 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
790 /* Then try to fetch string or dynamic array data */
791 call_fetch(&tp->args[i].fetch, regs, dl);
792 /* Reduce maximum length */
793 end += get_rloc_len(*dl);
794 maxlen -= get_rloc_len(*dl);
795 /* Trick here, convert data_rloc to data_loc */
796 *dl = convert_rloc_to_loc(*dl,
797 ent_size + tp->args[i].offset);
798 } else
799 /* Just fetching data normally */
800 call_fetch(&tp->args[i].fetch, regs,
801 data + tp->args[i].offset);
802 }
803}
804
805/* Kprobe handler */ 922/* Kprobe handler */
806static __kprobes void 923static __kprobes void
807__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, 924__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
808 struct ftrace_event_file *ftrace_file) 925 struct ftrace_event_file *ftrace_file)
809{ 926{
810 struct kprobe_trace_entry_head *entry; 927 struct kprobe_trace_entry_head *entry;
@@ -812,18 +929,18 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
812 struct ring_buffer *buffer; 929 struct ring_buffer *buffer;
813 int size, dsize, pc; 930 int size, dsize, pc;
814 unsigned long irq_flags; 931 unsigned long irq_flags;
815 struct ftrace_event_call *call = &tp->call; 932 struct ftrace_event_call *call = &tk->tp.call;
816 933
817 WARN_ON(call != ftrace_file->event_call); 934 WARN_ON(call != ftrace_file->event_call);
818 935
819 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 936 if (ftrace_trigger_soft_disabled(ftrace_file))
820 return; 937 return;
821 938
822 local_save_flags(irq_flags); 939 local_save_flags(irq_flags);
823 pc = preempt_count(); 940 pc = preempt_count();
824 941
825 dsize = __get_data_size(tp, regs); 942 dsize = __get_data_size(&tk->tp, regs);
826 size = sizeof(*entry) + tp->size + dsize; 943 size = sizeof(*entry) + tk->tp.size + dsize;
827 944
828 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, 945 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
829 call->event.type, 946 call->event.type,
@@ -832,26 +949,25 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
832 return; 949 return;
833 950
834 entry = ring_buffer_event_data(event); 951 entry = ring_buffer_event_data(event);
835 entry->ip = (unsigned long)tp->rp.kp.addr; 952 entry->ip = (unsigned long)tk->rp.kp.addr;
836 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 953 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
837 954
838 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 955 event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
839 trace_buffer_unlock_commit_regs(buffer, event, 956 entry, irq_flags, pc, regs);
840 irq_flags, pc, regs);
841} 957}
842 958
843static __kprobes void 959static __kprobes void
844kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) 960kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
845{ 961{
846 struct event_file_link *link; 962 struct event_file_link *link;
847 963
848 list_for_each_entry_rcu(link, &tp->files, list) 964 list_for_each_entry_rcu(link, &tk->tp.files, list)
849 __kprobe_trace_func(tp, regs, link->file); 965 __kprobe_trace_func(tk, regs, link->file);
850} 966}
851 967
852/* Kretprobe handler */ 968/* Kretprobe handler */
853static __kprobes void 969static __kprobes void
854__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 970__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
855 struct pt_regs *regs, 971 struct pt_regs *regs,
856 struct ftrace_event_file *ftrace_file) 972 struct ftrace_event_file *ftrace_file)
857{ 973{
@@ -860,18 +976,18 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
860 struct ring_buffer *buffer; 976 struct ring_buffer *buffer;
861 int size, pc, dsize; 977 int size, pc, dsize;
862 unsigned long irq_flags; 978 unsigned long irq_flags;
863 struct ftrace_event_call *call = &tp->call; 979 struct ftrace_event_call *call = &tk->tp.call;
864 980
865 WARN_ON(call != ftrace_file->event_call); 981 WARN_ON(call != ftrace_file->event_call);
866 982
867 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 983 if (ftrace_trigger_soft_disabled(ftrace_file))
868 return; 984 return;
869 985
870 local_save_flags(irq_flags); 986 local_save_flags(irq_flags);
871 pc = preempt_count(); 987 pc = preempt_count();
872 988
873 dsize = __get_data_size(tp, regs); 989 dsize = __get_data_size(&tk->tp, regs);
874 size = sizeof(*entry) + tp->size + dsize; 990 size = sizeof(*entry) + tk->tp.size + dsize;
875 991
876 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, 992 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
877 call->event.type, 993 call->event.type,
@@ -880,23 +996,22 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
880 return; 996 return;
881 997
882 entry = ring_buffer_event_data(event); 998 entry = ring_buffer_event_data(event);
883 entry->func = (unsigned long)tp->rp.kp.addr; 999 entry->func = (unsigned long)tk->rp.kp.addr;
884 entry->ret_ip = (unsigned long)ri->ret_addr; 1000 entry->ret_ip = (unsigned long)ri->ret_addr;
885 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1001 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
886 1002
887 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 1003 event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
888 trace_buffer_unlock_commit_regs(buffer, event, 1004 entry, irq_flags, pc, regs);
889 irq_flags, pc, regs);
890} 1005}
891 1006
892static __kprobes void 1007static __kprobes void
893kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 1008kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
894 struct pt_regs *regs) 1009 struct pt_regs *regs)
895{ 1010{
896 struct event_file_link *link; 1011 struct event_file_link *link;
897 1012
898 list_for_each_entry_rcu(link, &tp->files, list) 1013 list_for_each_entry_rcu(link, &tk->tp.files, list)
899 __kretprobe_trace_func(tp, ri, regs, link->file); 1014 __kretprobe_trace_func(tk, ri, regs, link->file);
900} 1015}
901 1016
902/* Event entry printers */ 1017/* Event entry printers */
@@ -983,16 +1098,18 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
983{ 1098{
984 int ret, i; 1099 int ret, i;
985 struct kprobe_trace_entry_head field; 1100 struct kprobe_trace_entry_head field;
986 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1101 struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
987 1102
988 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1103 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
989 /* Set argument names as fields */ 1104 /* Set argument names as fields */
990 for (i = 0; i < tp->nr_args; i++) { 1105 for (i = 0; i < tk->tp.nr_args; i++) {
991 ret = trace_define_field(event_call, tp->args[i].type->fmttype, 1106 struct probe_arg *parg = &tk->tp.args[i];
992 tp->args[i].name, 1107
993 sizeof(field) + tp->args[i].offset, 1108 ret = trace_define_field(event_call, parg->type->fmttype,
994 tp->args[i].type->size, 1109 parg->name,
995 tp->args[i].type->is_signed, 1110 sizeof(field) + parg->offset,
1111 parg->type->size,
1112 parg->type->is_signed,
996 FILTER_OTHER); 1113 FILTER_OTHER);
997 if (ret) 1114 if (ret)
998 return ret; 1115 return ret;
@@ -1004,17 +1121,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1004{ 1121{
1005 int ret, i; 1122 int ret, i;
1006 struct kretprobe_trace_entry_head field; 1123 struct kretprobe_trace_entry_head field;
1007 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1124 struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
1008 1125
1009 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1126 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1010 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1127 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1011 /* Set argument names as fields */ 1128 /* Set argument names as fields */
1012 for (i = 0; i < tp->nr_args; i++) { 1129 for (i = 0; i < tk->tp.nr_args; i++) {
1013 ret = trace_define_field(event_call, tp->args[i].type->fmttype, 1130 struct probe_arg *parg = &tk->tp.args[i];
1014 tp->args[i].name, 1131
1015 sizeof(field) + tp->args[i].offset, 1132 ret = trace_define_field(event_call, parg->type->fmttype,
1016 tp->args[i].type->size, 1133 parg->name,
1017 tp->args[i].type->is_signed, 1134 sizeof(field) + parg->offset,
1135 parg->type->size,
1136 parg->type->is_signed,
1018 FILTER_OTHER); 1137 FILTER_OTHER);
1019 if (ret) 1138 if (ret)
1020 return ret; 1139 return ret;
@@ -1022,74 +1141,13 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1022 return 0; 1141 return 0;
1023} 1142}
1024 1143
1025static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1026{
1027 int i;
1028 int pos = 0;
1029
1030 const char *fmt, *arg;
1031
1032 if (!trace_probe_is_return(tp)) {
1033 fmt = "(%lx)";
1034 arg = "REC->" FIELD_STRING_IP;
1035 } else {
1036 fmt = "(%lx <- %lx)";
1037 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1038 }
1039
1040 /* When len=0, we just calculate the needed length */
1041#define LEN_OR_ZERO (len ? len - pos : 0)
1042
1043 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1044
1045 for (i = 0; i < tp->nr_args; i++) {
1046 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1047 tp->args[i].name, tp->args[i].type->fmt);
1048 }
1049
1050 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1051
1052 for (i = 0; i < tp->nr_args; i++) {
1053 if (strcmp(tp->args[i].type->name, "string") == 0)
1054 pos += snprintf(buf + pos, LEN_OR_ZERO,
1055 ", __get_str(%s)",
1056 tp->args[i].name);
1057 else
1058 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1059 tp->args[i].name);
1060 }
1061
1062#undef LEN_OR_ZERO
1063
1064 /* return the length of print_fmt */
1065 return pos;
1066}
1067
1068static int set_print_fmt(struct trace_probe *tp)
1069{
1070 int len;
1071 char *print_fmt;
1072
1073 /* First: called with 0 length to calculate the needed length */
1074 len = __set_print_fmt(tp, NULL, 0);
1075 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1076 if (!print_fmt)
1077 return -ENOMEM;
1078
1079 /* Second: actually write the @print_fmt */
1080 __set_print_fmt(tp, print_fmt, len + 1);
1081 tp->call.print_fmt = print_fmt;
1082
1083 return 0;
1084}
1085
1086#ifdef CONFIG_PERF_EVENTS 1144#ifdef CONFIG_PERF_EVENTS
1087 1145
1088/* Kprobe profile handler */ 1146/* Kprobe profile handler */
1089static __kprobes void 1147static __kprobes void
1090kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) 1148kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1091{ 1149{
1092 struct ftrace_event_call *call = &tp->call; 1150 struct ftrace_event_call *call = &tk->tp.call;
1093 struct kprobe_trace_entry_head *entry; 1151 struct kprobe_trace_entry_head *entry;
1094 struct hlist_head *head; 1152 struct hlist_head *head;
1095 int size, __size, dsize; 1153 int size, __size, dsize;
@@ -1099,8 +1157,8 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1099 if (hlist_empty(head)) 1157 if (hlist_empty(head))
1100 return; 1158 return;
1101 1159
1102 dsize = __get_data_size(tp, regs); 1160 dsize = __get_data_size(&tk->tp, regs);
1103 __size = sizeof(*entry) + tp->size + dsize; 1161 __size = sizeof(*entry) + tk->tp.size + dsize;
1104 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1162 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1105 size -= sizeof(u32); 1163 size -= sizeof(u32);
1106 1164
@@ -1108,18 +1166,18 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1108 if (!entry) 1166 if (!entry)
1109 return; 1167 return;
1110 1168
1111 entry->ip = (unsigned long)tp->rp.kp.addr; 1169 entry->ip = (unsigned long)tk->rp.kp.addr;
1112 memset(&entry[1], 0, dsize); 1170 memset(&entry[1], 0, dsize);
1113 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1171 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1114 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1172 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1115} 1173}
1116 1174
1117/* Kretprobe profile handler */ 1175/* Kretprobe profile handler */
1118static __kprobes void 1176static __kprobes void
1119kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, 1177kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1120 struct pt_regs *regs) 1178 struct pt_regs *regs)
1121{ 1179{
1122 struct ftrace_event_call *call = &tp->call; 1180 struct ftrace_event_call *call = &tk->tp.call;
1123 struct kretprobe_trace_entry_head *entry; 1181 struct kretprobe_trace_entry_head *entry;
1124 struct hlist_head *head; 1182 struct hlist_head *head;
1125 int size, __size, dsize; 1183 int size, __size, dsize;
@@ -1129,8 +1187,8 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1129 if (hlist_empty(head)) 1187 if (hlist_empty(head))
1130 return; 1188 return;
1131 1189
1132 dsize = __get_data_size(tp, regs); 1190 dsize = __get_data_size(&tk->tp, regs);
1133 __size = sizeof(*entry) + tp->size + dsize; 1191 __size = sizeof(*entry) + tk->tp.size + dsize;
1134 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1192 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1135 size -= sizeof(u32); 1193 size -= sizeof(u32);
1136 1194
@@ -1138,9 +1196,9 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1138 if (!entry) 1196 if (!entry)
1139 return; 1197 return;
1140 1198
1141 entry->func = (unsigned long)tp->rp.kp.addr; 1199 entry->func = (unsigned long)tk->rp.kp.addr;
1142 entry->ret_ip = (unsigned long)ri->ret_addr; 1200 entry->ret_ip = (unsigned long)ri->ret_addr;
1143 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1201 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1144 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1202 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1145} 1203}
1146#endif /* CONFIG_PERF_EVENTS */ 1204#endif /* CONFIG_PERF_EVENTS */
@@ -1155,20 +1213,20 @@ static __kprobes
1155int kprobe_register(struct ftrace_event_call *event, 1213int kprobe_register(struct ftrace_event_call *event,
1156 enum trace_reg type, void *data) 1214 enum trace_reg type, void *data)
1157{ 1215{
1158 struct trace_probe *tp = (struct trace_probe *)event->data; 1216 struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
1159 struct ftrace_event_file *file = data; 1217 struct ftrace_event_file *file = data;
1160 1218
1161 switch (type) { 1219 switch (type) {
1162 case TRACE_REG_REGISTER: 1220 case TRACE_REG_REGISTER:
1163 return enable_trace_probe(tp, file); 1221 return enable_trace_kprobe(tk, file);
1164 case TRACE_REG_UNREGISTER: 1222 case TRACE_REG_UNREGISTER:
1165 return disable_trace_probe(tp, file); 1223 return disable_trace_kprobe(tk, file);
1166 1224
1167#ifdef CONFIG_PERF_EVENTS 1225#ifdef CONFIG_PERF_EVENTS
1168 case TRACE_REG_PERF_REGISTER: 1226 case TRACE_REG_PERF_REGISTER:
1169 return enable_trace_probe(tp, NULL); 1227 return enable_trace_kprobe(tk, NULL);
1170 case TRACE_REG_PERF_UNREGISTER: 1228 case TRACE_REG_PERF_UNREGISTER:
1171 return disable_trace_probe(tp, NULL); 1229 return disable_trace_kprobe(tk, NULL);
1172 case TRACE_REG_PERF_OPEN: 1230 case TRACE_REG_PERF_OPEN:
1173 case TRACE_REG_PERF_CLOSE: 1231 case TRACE_REG_PERF_CLOSE:
1174 case TRACE_REG_PERF_ADD: 1232 case TRACE_REG_PERF_ADD:
@@ -1182,15 +1240,15 @@ int kprobe_register(struct ftrace_event_call *event,
1182static __kprobes 1240static __kprobes
1183int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1241int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1184{ 1242{
1185 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1243 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
1186 1244
1187 tp->nhit++; 1245 tk->nhit++;
1188 1246
1189 if (tp->flags & TP_FLAG_TRACE) 1247 if (tk->tp.flags & TP_FLAG_TRACE)
1190 kprobe_trace_func(tp, regs); 1248 kprobe_trace_func(tk, regs);
1191#ifdef CONFIG_PERF_EVENTS 1249#ifdef CONFIG_PERF_EVENTS
1192 if (tp->flags & TP_FLAG_PROFILE) 1250 if (tk->tp.flags & TP_FLAG_PROFILE)
1193 kprobe_perf_func(tp, regs); 1251 kprobe_perf_func(tk, regs);
1194#endif 1252#endif
1195 return 0; /* We don't tweek kernel, so just return 0 */ 1253 return 0; /* We don't tweek kernel, so just return 0 */
1196} 1254}
@@ -1198,15 +1256,15 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1198static __kprobes 1256static __kprobes
1199int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) 1257int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1200{ 1258{
1201 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1259 struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
1202 1260
1203 tp->nhit++; 1261 tk->nhit++;
1204 1262
1205 if (tp->flags & TP_FLAG_TRACE) 1263 if (tk->tp.flags & TP_FLAG_TRACE)
1206 kretprobe_trace_func(tp, ri, regs); 1264 kretprobe_trace_func(tk, ri, regs);
1207#ifdef CONFIG_PERF_EVENTS 1265#ifdef CONFIG_PERF_EVENTS
1208 if (tp->flags & TP_FLAG_PROFILE) 1266 if (tk->tp.flags & TP_FLAG_PROFILE)
1209 kretprobe_perf_func(tp, ri, regs); 1267 kretprobe_perf_func(tk, ri, regs);
1210#endif 1268#endif
1211 return 0; /* We don't tweek kernel, so just return 0 */ 1269 return 0; /* We don't tweek kernel, so just return 0 */
1212} 1270}
@@ -1219,21 +1277,21 @@ static struct trace_event_functions kprobe_funcs = {
1219 .trace = print_kprobe_event 1277 .trace = print_kprobe_event
1220}; 1278};
1221 1279
1222static int register_probe_event(struct trace_probe *tp) 1280static int register_kprobe_event(struct trace_kprobe *tk)
1223{ 1281{
1224 struct ftrace_event_call *call = &tp->call; 1282 struct ftrace_event_call *call = &tk->tp.call;
1225 int ret; 1283 int ret;
1226 1284
1227 /* Initialize ftrace_event_call */ 1285 /* Initialize ftrace_event_call */
1228 INIT_LIST_HEAD(&call->class->fields); 1286 INIT_LIST_HEAD(&call->class->fields);
1229 if (trace_probe_is_return(tp)) { 1287 if (trace_kprobe_is_return(tk)) {
1230 call->event.funcs = &kretprobe_funcs; 1288 call->event.funcs = &kretprobe_funcs;
1231 call->class->define_fields = kretprobe_event_define_fields; 1289 call->class->define_fields = kretprobe_event_define_fields;
1232 } else { 1290 } else {
1233 call->event.funcs = &kprobe_funcs; 1291 call->event.funcs = &kprobe_funcs;
1234 call->class->define_fields = kprobe_event_define_fields; 1292 call->class->define_fields = kprobe_event_define_fields;
1235 } 1293 }
1236 if (set_print_fmt(tp) < 0) 1294 if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
1237 return -ENOMEM; 1295 return -ENOMEM;
1238 ret = register_ftrace_event(&call->event); 1296 ret = register_ftrace_event(&call->event);
1239 if (!ret) { 1297 if (!ret) {
@@ -1242,7 +1300,7 @@ static int register_probe_event(struct trace_probe *tp)
1242 } 1300 }
1243 call->flags = 0; 1301 call->flags = 0;
1244 call->class->reg = kprobe_register; 1302 call->class->reg = kprobe_register;
1245 call->data = tp; 1303 call->data = tk;
1246 ret = trace_add_event_call(call); 1304 ret = trace_add_event_call(call);
1247 if (ret) { 1305 if (ret) {
1248 pr_info("Failed to register kprobe event: %s\n", call->name); 1306 pr_info("Failed to register kprobe event: %s\n", call->name);
@@ -1252,14 +1310,14 @@ static int register_probe_event(struct trace_probe *tp)
1252 return ret; 1310 return ret;
1253} 1311}
1254 1312
1255static int unregister_probe_event(struct trace_probe *tp) 1313static int unregister_kprobe_event(struct trace_kprobe *tk)
1256{ 1314{
1257 int ret; 1315 int ret;
1258 1316
1259 /* tp->event is unregistered in trace_remove_event_call() */ 1317 /* tp->event is unregistered in trace_remove_event_call() */
1260 ret = trace_remove_event_call(&tp->call); 1318 ret = trace_remove_event_call(&tk->tp.call);
1261 if (!ret) 1319 if (!ret)
1262 kfree(tp->call.print_fmt); 1320 kfree(tk->tp.call.print_fmt);
1263 return ret; 1321 return ret;
1264} 1322}
1265 1323
@@ -1269,7 +1327,7 @@ static __init int init_kprobe_trace(void)
1269 struct dentry *d_tracer; 1327 struct dentry *d_tracer;
1270 struct dentry *entry; 1328 struct dentry *entry;
1271 1329
1272 if (register_module_notifier(&trace_probe_module_nb)) 1330 if (register_module_notifier(&trace_kprobe_module_nb))
1273 return -EINVAL; 1331 return -EINVAL;
1274 1332
1275 d_tracer = tracing_init_dentry(); 1333 d_tracer = tracing_init_dentry();
@@ -1309,26 +1367,26 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
1309} 1367}
1310 1368
1311static struct ftrace_event_file * 1369static struct ftrace_event_file *
1312find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) 1370find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
1313{ 1371{
1314 struct ftrace_event_file *file; 1372 struct ftrace_event_file *file;
1315 1373
1316 list_for_each_entry(file, &tr->events, list) 1374 list_for_each_entry(file, &tr->events, list)
1317 if (file->event_call == &tp->call) 1375 if (file->event_call == &tk->tp.call)
1318 return file; 1376 return file;
1319 1377
1320 return NULL; 1378 return NULL;
1321} 1379}
1322 1380
1323/* 1381/*
1324 * Nobody but us can call enable_trace_probe/disable_trace_probe at this 1382 * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this
1325 * stage, we can do this lockless. 1383 * stage, we can do this lockless.
1326 */ 1384 */
1327static __init int kprobe_trace_self_tests_init(void) 1385static __init int kprobe_trace_self_tests_init(void)
1328{ 1386{
1329 int ret, warn = 0; 1387 int ret, warn = 0;
1330 int (*target)(int, int, int, int, int, int); 1388 int (*target)(int, int, int, int, int, int);
1331 struct trace_probe *tp; 1389 struct trace_kprobe *tk;
1332 struct ftrace_event_file *file; 1390 struct ftrace_event_file *file;
1333 1391
1334 target = kprobe_trace_selftest_target; 1392 target = kprobe_trace_selftest_target;
@@ -1337,44 +1395,44 @@ static __init int kprobe_trace_self_tests_init(void)
1337 1395
1338 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " 1396 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
1339 "$stack $stack0 +0($stack)", 1397 "$stack $stack0 +0($stack)",
1340 create_trace_probe); 1398 create_trace_kprobe);
1341 if (WARN_ON_ONCE(ret)) { 1399 if (WARN_ON_ONCE(ret)) {
1342 pr_warn("error on probing function entry.\n"); 1400 pr_warn("error on probing function entry.\n");
1343 warn++; 1401 warn++;
1344 } else { 1402 } else {
1345 /* Enable trace point */ 1403 /* Enable trace point */
1346 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); 1404 tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
1347 if (WARN_ON_ONCE(tp == NULL)) { 1405 if (WARN_ON_ONCE(tk == NULL)) {
1348 pr_warn("error on getting new probe.\n"); 1406 pr_warn("error on getting new probe.\n");
1349 warn++; 1407 warn++;
1350 } else { 1408 } else {
1351 file = find_trace_probe_file(tp, top_trace_array()); 1409 file = find_trace_probe_file(tk, top_trace_array());
1352 if (WARN_ON_ONCE(file == NULL)) { 1410 if (WARN_ON_ONCE(file == NULL)) {
1353 pr_warn("error on getting probe file.\n"); 1411 pr_warn("error on getting probe file.\n");
1354 warn++; 1412 warn++;
1355 } else 1413 } else
1356 enable_trace_probe(tp, file); 1414 enable_trace_kprobe(tk, file);
1357 } 1415 }
1358 } 1416 }
1359 1417
1360 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " 1418 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
1361 "$retval", create_trace_probe); 1419 "$retval", create_trace_kprobe);
1362 if (WARN_ON_ONCE(ret)) { 1420 if (WARN_ON_ONCE(ret)) {
1363 pr_warn("error on probing function return.\n"); 1421 pr_warn("error on probing function return.\n");
1364 warn++; 1422 warn++;
1365 } else { 1423 } else {
1366 /* Enable trace point */ 1424 /* Enable trace point */
1367 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); 1425 tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
1368 if (WARN_ON_ONCE(tp == NULL)) { 1426 if (WARN_ON_ONCE(tk == NULL)) {
1369 pr_warn("error on getting 2nd new probe.\n"); 1427 pr_warn("error on getting 2nd new probe.\n");
1370 warn++; 1428 warn++;
1371 } else { 1429 } else {
1372 file = find_trace_probe_file(tp, top_trace_array()); 1430 file = find_trace_probe_file(tk, top_trace_array());
1373 if (WARN_ON_ONCE(file == NULL)) { 1431 if (WARN_ON_ONCE(file == NULL)) {
1374 pr_warn("error on getting probe file.\n"); 1432 pr_warn("error on getting probe file.\n");
1375 warn++; 1433 warn++;
1376 } else 1434 } else
1377 enable_trace_probe(tp, file); 1435 enable_trace_kprobe(tk, file);
1378 } 1436 }
1379 } 1437 }
1380 1438
@@ -1384,46 +1442,46 @@ static __init int kprobe_trace_self_tests_init(void)
1384 ret = target(1, 2, 3, 4, 5, 6); 1442 ret = target(1, 2, 3, 4, 5, 6);
1385 1443
1386 /* Disable trace points before removing it */ 1444 /* Disable trace points before removing it */
1387 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); 1445 tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
1388 if (WARN_ON_ONCE(tp == NULL)) { 1446 if (WARN_ON_ONCE(tk == NULL)) {
1389 pr_warn("error on getting test probe.\n"); 1447 pr_warn("error on getting test probe.\n");
1390 warn++; 1448 warn++;
1391 } else { 1449 } else {
1392 file = find_trace_probe_file(tp, top_trace_array()); 1450 file = find_trace_probe_file(tk, top_trace_array());
1393 if (WARN_ON_ONCE(file == NULL)) { 1451 if (WARN_ON_ONCE(file == NULL)) {
1394 pr_warn("error on getting probe file.\n"); 1452 pr_warn("error on getting probe file.\n");
1395 warn++; 1453 warn++;
1396 } else 1454 } else
1397 disable_trace_probe(tp, file); 1455 disable_trace_kprobe(tk, file);
1398 } 1456 }
1399 1457
1400 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); 1458 tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
1401 if (WARN_ON_ONCE(tp == NULL)) { 1459 if (WARN_ON_ONCE(tk == NULL)) {
1402 pr_warn("error on getting 2nd test probe.\n"); 1460 pr_warn("error on getting 2nd test probe.\n");
1403 warn++; 1461 warn++;
1404 } else { 1462 } else {
1405 file = find_trace_probe_file(tp, top_trace_array()); 1463 file = find_trace_probe_file(tk, top_trace_array());
1406 if (WARN_ON_ONCE(file == NULL)) { 1464 if (WARN_ON_ONCE(file == NULL)) {
1407 pr_warn("error on getting probe file.\n"); 1465 pr_warn("error on getting probe file.\n");
1408 warn++; 1466 warn++;
1409 } else 1467 } else
1410 disable_trace_probe(tp, file); 1468 disable_trace_kprobe(tk, file);
1411 } 1469 }
1412 1470
1413 ret = traceprobe_command("-:testprobe", create_trace_probe); 1471 ret = traceprobe_command("-:testprobe", create_trace_kprobe);
1414 if (WARN_ON_ONCE(ret)) { 1472 if (WARN_ON_ONCE(ret)) {
1415 pr_warn("error on deleting a probe.\n"); 1473 pr_warn("error on deleting a probe.\n");
1416 warn++; 1474 warn++;
1417 } 1475 }
1418 1476
1419 ret = traceprobe_command("-:testprobe2", create_trace_probe); 1477 ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
1420 if (WARN_ON_ONCE(ret)) { 1478 if (WARN_ON_ONCE(ret)) {
1421 pr_warn("error on deleting a probe.\n"); 1479 pr_warn("error on deleting a probe.\n");
1422 warn++; 1480 warn++;
1423 } 1481 }
1424 1482
1425end: 1483end:
1426 release_all_trace_probes(); 1484 release_all_trace_kprobes();
1427 if (warn) 1485 if (warn)
1428 pr_cont("NG: Some tests are failed. Please check them.\n"); 1486 pr_cont("NG: Some tests are failed. Please check them.\n");
1429 else 1487 else
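The trace_kprobe.c hunks above fold the generic struct trace_probe into a new struct trace_kprobe, so the dispatchers recover the enclosing object from the embedded kprobe with container_of(), as in container_of(kp, struct trace_kprobe, rp.kp). Below is a minimal stand-alone C sketch of that recovery pattern; the struct layouts and field names are simplified stand-ins, not the kernel's definitions.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct kprobe       { unsigned long addr; };
struct kretprobe    { struct kprobe kp; };
struct trace_probe  { unsigned int nr_args; };
struct trace_kprobe {
        unsigned long nhit;
        struct kretprobe rp;    /* rp.kp is what the kprobe core passes back */
        struct trace_probe tp;  /* shared probe state, now embedded */
};

static int dispatcher(struct kprobe *kp)
{
        /* Same step as kprobe_dispatcher(): member pointer -> enclosing object */
        struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);

        tk->nhit++;
        return 0;
}

int main(void)
{
        struct trace_kprobe tk = { .nhit = 0 };

        dispatcher(&tk.rp.kp);
        printf("nhit = %lu\n", tk.nhit);        /* prints nhit = 1 */
        return 0;
}

Because the kprobe core only hands back a pointer to the registered struct kprobe, this offset arithmetic is what lets the dispatchers reach per-probe state such as nhit and tp.flags without any lookup table.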
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 412e959709b4..8364a421b4df 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -35,46 +35,27 @@ const char *reserved_field_names[] = {
35 FIELD_STRING_FUNC, 35 FIELD_STRING_FUNC,
36}; 36};
37 37
38/* Printing function type */
39#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
40#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
41
42/* Printing in basic type function template */ 38/* Printing in basic type function template */
43#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ 39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
44static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 40__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
45 const char *name, \ 41 const char *name, \
46 void *data, void *ent)\ 42 void *data, void *ent) \
47{ \ 43{ \
48 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ 44 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
49} \ 45} \
50static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; 46const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
51
52DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
53DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
54DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
55DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
56DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
57DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
58DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
59DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
60
61static inline void *get_rloc_data(u32 *dl)
62{
63 return (u8 *)dl + get_rloc_offs(*dl);
64}
65 47
66/* For data_loc conversion */ 48DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
67static inline void *get_loc_data(u32 *dl, void *ent) 49DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
68{ 50DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x")
69 return (u8 *)ent + get_rloc_offs(*dl); 51DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx")
70} 52DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d")
71 53DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d")
72/* For defining macros, define string/string_size types */ 54DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
73typedef u32 string; 55DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
74typedef u32 string_size;
75 56
76/* Print type function for string type */ 57/* Print type function for string type */
77static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, 58__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
78 const char *name, 59 const char *name,
79 void *data, void *ent) 60 void *data, void *ent)
80{ 61{
@@ -87,18 +68,7 @@ static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
87 (const char *)get_loc_data(data, ent)); 68 (const char *)get_loc_data(data, ent));
88} 69}
89 70
90static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; 71const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
91
92#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
93/*
94 * Define macro for basic types - we don't need to define s* types, because
95 * we have to care only about bitwidth at recording time.
96 */
97#define DEFINE_BASIC_FETCH_FUNCS(method) \
98DEFINE_FETCH_##method(u8) \
99DEFINE_FETCH_##method(u16) \
100DEFINE_FETCH_##method(u32) \
101DEFINE_FETCH_##method(u64)
102 72
103#define CHECK_FETCH_FUNCS(method, fn) \ 73#define CHECK_FETCH_FUNCS(method, fn) \
104 (((FETCH_FUNC_NAME(method, u8) == fn) || \ 74 (((FETCH_FUNC_NAME(method, u8) == fn) || \
@@ -111,7 +81,7 @@ DEFINE_FETCH_##method(u64)
111 81
112/* Data fetch function templates */ 82/* Data fetch function templates */
113#define DEFINE_FETCH_reg(type) \ 83#define DEFINE_FETCH_reg(type) \
114static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 84__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
115 void *offset, void *dest) \ 85 void *offset, void *dest) \
116{ \ 86{ \
117 *(type *)dest = (type)regs_get_register(regs, \ 87 *(type *)dest = (type)regs_get_register(regs, \
@@ -122,20 +92,8 @@ DEFINE_BASIC_FETCH_FUNCS(reg)
122#define fetch_reg_string NULL 92#define fetch_reg_string NULL
123#define fetch_reg_string_size NULL 93#define fetch_reg_string_size NULL
124 94
125#define DEFINE_FETCH_stack(type) \
126static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
127 void *offset, void *dest) \
128{ \
129 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
130 (unsigned int)((unsigned long)offset)); \
131}
132DEFINE_BASIC_FETCH_FUNCS(stack)
133/* No string on the stack entry */
134#define fetch_stack_string NULL
135#define fetch_stack_string_size NULL
136
137#define DEFINE_FETCH_retval(type) \ 95#define DEFINE_FETCH_retval(type) \
138static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ 96__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
139 void *dummy, void *dest) \ 97 void *dummy, void *dest) \
140{ \ 98{ \
141 *(type *)dest = (type)regs_return_value(regs); \ 99 *(type *)dest = (type)regs_return_value(regs); \
@@ -145,150 +103,16 @@ DEFINE_BASIC_FETCH_FUNCS(retval)
145#define fetch_retval_string NULL 103#define fetch_retval_string NULL
146#define fetch_retval_string_size NULL 104#define fetch_retval_string_size NULL
147 105
148#define DEFINE_FETCH_memory(type) \
149static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
150 void *addr, void *dest) \
151{ \
152 type retval; \
153 if (probe_kernel_address(addr, retval)) \
154 *(type *)dest = 0; \
155 else \
156 *(type *)dest = retval; \
157}
158DEFINE_BASIC_FETCH_FUNCS(memory)
159/*
160 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
161 * length and relative data location.
162 */
163static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
164 void *addr, void *dest)
165{
166 long ret;
167 int maxlen = get_rloc_len(*(u32 *)dest);
168 u8 *dst = get_rloc_data(dest);
169 u8 *src = addr;
170 mm_segment_t old_fs = get_fs();
171
172 if (!maxlen)
173 return;
174
175 /*
176 * Try to get string again, since the string can be changed while
177 * probing.
178 */
179 set_fs(KERNEL_DS);
180 pagefault_disable();
181
182 do
183 ret = __copy_from_user_inatomic(dst++, src++, 1);
184 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
185
186 dst[-1] = '\0';
187 pagefault_enable();
188 set_fs(old_fs);
189
190 if (ret < 0) { /* Failed to fetch string */
191 ((u8 *)get_rloc_data(dest))[0] = '\0';
192 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
193 } else {
194 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
195 get_rloc_offs(*(u32 *)dest));
196 }
197}
198
199/* Return the length of string -- including null terminal byte */
200static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
201 void *addr, void *dest)
202{
203 mm_segment_t old_fs;
204 int ret, len = 0;
205 u8 c;
206
207 old_fs = get_fs();
208 set_fs(KERNEL_DS);
209 pagefault_disable();
210
211 do {
212 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
213 len++;
214 } while (c && ret == 0 && len < MAX_STRING_SIZE);
215
216 pagefault_enable();
217 set_fs(old_fs);
218
219 if (ret < 0) /* Failed to check the length */
220 *(u32 *)dest = 0;
221 else
222 *(u32 *)dest = len;
223}
224
225/* Memory fetching by symbol */
226struct symbol_cache {
227 char *symbol;
228 long offset;
229 unsigned long addr;
230};
231
232static unsigned long update_symbol_cache(struct symbol_cache *sc)
233{
234 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
235
236 if (sc->addr)
237 sc->addr += sc->offset;
238
239 return sc->addr;
240}
241
242static void free_symbol_cache(struct symbol_cache *sc)
243{
244 kfree(sc->symbol);
245 kfree(sc);
246}
247
248static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
249{
250 struct symbol_cache *sc;
251
252 if (!sym || strlen(sym) == 0)
253 return NULL;
254
255 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
256 if (!sc)
257 return NULL;
258
259 sc->symbol = kstrdup(sym, GFP_KERNEL);
260 if (!sc->symbol) {
261 kfree(sc);
262 return NULL;
263 }
264 sc->offset = offset;
265 update_symbol_cache(sc);
266
267 return sc;
268}
269
270#define DEFINE_FETCH_symbol(type) \
271static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
272 void *data, void *dest) \
273{ \
274 struct symbol_cache *sc = data; \
275 if (sc->addr) \
276 fetch_memory_##type(regs, (void *)sc->addr, dest); \
277 else \
278 *(type *)dest = 0; \
279}
280DEFINE_BASIC_FETCH_FUNCS(symbol)
281DEFINE_FETCH_symbol(string)
282DEFINE_FETCH_symbol(string_size)
283
284/* Dereference memory access function */ 106/* Dereference memory access function */
285struct deref_fetch_param { 107struct deref_fetch_param {
286 struct fetch_param orig; 108 struct fetch_param orig;
287 long offset; 109 long offset;
110 fetch_func_t fetch;
111 fetch_func_t fetch_size;
288}; 112};
289 113
290#define DEFINE_FETCH_deref(type) \ 114#define DEFINE_FETCH_deref(type) \
291static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ 115__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
292 void *data, void *dest) \ 116 void *data, void *dest) \
293{ \ 117{ \
294 struct deref_fetch_param *dprm = data; \ 118 struct deref_fetch_param *dprm = data; \
@@ -296,13 +120,26 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
296 call_fetch(&dprm->orig, regs, &addr); \ 120 call_fetch(&dprm->orig, regs, &addr); \
297 if (addr) { \ 121 if (addr) { \
298 addr += dprm->offset; \ 122 addr += dprm->offset; \
299 fetch_memory_##type(regs, (void *)addr, dest); \ 123 dprm->fetch(regs, (void *)addr, dest); \
300 } else \ 124 } else \
301 *(type *)dest = 0; \ 125 *(type *)dest = 0; \
302} 126}
303DEFINE_BASIC_FETCH_FUNCS(deref) 127DEFINE_BASIC_FETCH_FUNCS(deref)
304DEFINE_FETCH_deref(string) 128DEFINE_FETCH_deref(string)
305DEFINE_FETCH_deref(string_size) 129
130__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
131 void *data, void *dest)
132{
133 struct deref_fetch_param *dprm = data;
134 unsigned long addr;
135
136 call_fetch(&dprm->orig, regs, &addr);
137 if (addr && dprm->fetch_size) {
138 addr += dprm->offset;
139 dprm->fetch_size(regs, (void *)addr, dest);
140 } else
141 *(string_size *)dest = 0;
142}
306 143
307static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) 144static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
308{ 145{
@@ -329,7 +166,7 @@ struct bitfield_fetch_param {
329}; 166};
330 167
331#define DEFINE_FETCH_bitfield(type) \ 168#define DEFINE_FETCH_bitfield(type) \
332static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ 169__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
333 void *data, void *dest) \ 170 void *data, void *dest) \
334{ \ 171{ \
335 struct bitfield_fetch_param *bprm = data; \ 172 struct bitfield_fetch_param *bprm = data; \
@@ -374,58 +211,8 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
374 kfree(data); 211 kfree(data);
375} 212}
376 213
377/* Default (unsigned long) fetch type */ 214static const struct fetch_type *find_fetch_type(const char *type,
378#define __DEFAULT_FETCH_TYPE(t) u##t 215 const struct fetch_type *ftbl)
379#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
380#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
381#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
382
383#define ASSIGN_FETCH_FUNC(method, type) \
384 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
385
386#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
387 {.name = _name, \
388 .size = _size, \
389 .is_signed = sign, \
390 .print = PRINT_TYPE_FUNC_NAME(ptype), \
391 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
392 .fmttype = _fmttype, \
393 .fetch = { \
394ASSIGN_FETCH_FUNC(reg, ftype), \
395ASSIGN_FETCH_FUNC(stack, ftype), \
396ASSIGN_FETCH_FUNC(retval, ftype), \
397ASSIGN_FETCH_FUNC(memory, ftype), \
398ASSIGN_FETCH_FUNC(symbol, ftype), \
399ASSIGN_FETCH_FUNC(deref, ftype), \
400ASSIGN_FETCH_FUNC(bitfield, ftype), \
401 } \
402 }
403
404#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
405 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
406
407#define FETCH_TYPE_STRING 0
408#define FETCH_TYPE_STRSIZE 1
409
410/* Fetch type information table */
411static const struct fetch_type fetch_type_table[] = {
412 /* Special types */
413 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
414 sizeof(u32), 1, "__data_loc char[]"),
415 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
416 string_size, sizeof(u32), 0, "u32"),
417 /* Basic types */
418 ASSIGN_FETCH_TYPE(u8, u8, 0),
419 ASSIGN_FETCH_TYPE(u16, u16, 0),
420 ASSIGN_FETCH_TYPE(u32, u32, 0),
421 ASSIGN_FETCH_TYPE(u64, u64, 0),
422 ASSIGN_FETCH_TYPE(s8, u8, 1),
423 ASSIGN_FETCH_TYPE(s16, u16, 1),
424 ASSIGN_FETCH_TYPE(s32, u32, 1),
425 ASSIGN_FETCH_TYPE(s64, u64, 1),
426};
427
428static const struct fetch_type *find_fetch_type(const char *type)
429{ 216{
430 int i; 217 int i;
431 218
@@ -446,44 +233,52 @@ static const struct fetch_type *find_fetch_type(const char *type)
446 233
447 switch (bs) { 234 switch (bs) {
448 case 8: 235 case 8:
449 return find_fetch_type("u8"); 236 return find_fetch_type("u8", ftbl);
450 case 16: 237 case 16:
451 return find_fetch_type("u16"); 238 return find_fetch_type("u16", ftbl);
452 case 32: 239 case 32:
453 return find_fetch_type("u32"); 240 return find_fetch_type("u32", ftbl);
454 case 64: 241 case 64:
455 return find_fetch_type("u64"); 242 return find_fetch_type("u64", ftbl);
456 default: 243 default:
457 goto fail; 244 goto fail;
458 } 245 }
459 } 246 }
460 247
461 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 248 for (i = 0; ftbl[i].name; i++) {
462 if (strcmp(type, fetch_type_table[i].name) == 0) 249 if (strcmp(type, ftbl[i].name) == 0)
463 return &fetch_type_table[i]; 250 return &ftbl[i];
251 }
464 252
465fail: 253fail:
466 return NULL; 254 return NULL;
467} 255}
468 256
469/* Special function : only accept unsigned long */ 257/* Special function : only accept unsigned long */
470static __kprobes void fetch_stack_address(struct pt_regs *regs, 258static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs,
471 void *dummy, void *dest) 259 void *dummy, void *dest)
472{ 260{
473 *(unsigned long *)dest = kernel_stack_pointer(regs); 261 *(unsigned long *)dest = kernel_stack_pointer(regs);
474} 262}
475 263
264static __kprobes void fetch_user_stack_address(struct pt_regs *regs,
265 void *dummy, void *dest)
266{
267 *(unsigned long *)dest = user_stack_pointer(regs);
268}
269
476static fetch_func_t get_fetch_size_function(const struct fetch_type *type, 270static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
477 fetch_func_t orig_fn) 271 fetch_func_t orig_fn,
272 const struct fetch_type *ftbl)
478{ 273{
479 int i; 274 int i;
480 275
481 if (type != &fetch_type_table[FETCH_TYPE_STRING]) 276 if (type != &ftbl[FETCH_TYPE_STRING])
482 return NULL; /* Only string type needs size function */ 277 return NULL; /* Only string type needs size function */
483 278
484 for (i = 0; i < FETCH_MTD_END; i++) 279 for (i = 0; i < FETCH_MTD_END; i++)
485 if (type->fetch[i] == orig_fn) 280 if (type->fetch[i] == orig_fn)
486 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; 281 return ftbl[FETCH_TYPE_STRSIZE].fetch[i];
487 282
488 WARN_ON(1); /* This should not happen */ 283 WARN_ON(1); /* This should not happen */
489 284
@@ -516,7 +311,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
516#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 311#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
517 312
518static int parse_probe_vars(char *arg, const struct fetch_type *t, 313static int parse_probe_vars(char *arg, const struct fetch_type *t,
519 struct fetch_param *f, bool is_return) 314 struct fetch_param *f, bool is_return,
315 bool is_kprobe)
520{ 316{
521 int ret = 0; 317 int ret = 0;
522 unsigned long param; 318 unsigned long param;
@@ -528,13 +324,16 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
528 ret = -EINVAL; 324 ret = -EINVAL;
529 } else if (strncmp(arg, "stack", 5) == 0) { 325 } else if (strncmp(arg, "stack", 5) == 0) {
530 if (arg[5] == '\0') { 326 if (arg[5] == '\0') {
531 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) 327 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR))
532 f->fn = fetch_stack_address; 328 return -EINVAL;
329
330 if (is_kprobe)
331 f->fn = fetch_kernel_stack_address;
533 else 332 else
534 ret = -EINVAL; 333 f->fn = fetch_user_stack_address;
535 } else if (isdigit(arg[5])) { 334 } else if (isdigit(arg[5])) {
536 ret = kstrtoul(arg + 5, 10, &param); 335 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK) 336 if (ret || (is_kprobe && param > PARAM_MAX_STACK))
538 ret = -EINVAL; 337 ret = -EINVAL;
539 else { 338 else {
540 f->fn = t->fetch[FETCH_MTD_stack]; 339 f->fn = t->fetch[FETCH_MTD_stack];
@@ -552,20 +351,18 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
552static int parse_probe_arg(char *arg, const struct fetch_type *t, 351static int parse_probe_arg(char *arg, const struct fetch_type *t,
553 struct fetch_param *f, bool is_return, bool is_kprobe) 352 struct fetch_param *f, bool is_return, bool is_kprobe)
554{ 353{
354 const struct fetch_type *ftbl;
555 unsigned long param; 355 unsigned long param;
556 long offset; 356 long offset;
557 char *tmp; 357 char *tmp;
558 int ret; 358 int ret = 0;
559
560 ret = 0;
561 359
562 /* Until uprobe_events supports only reg arguments */ 360 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
563 if (!is_kprobe && arg[0] != '%') 361 BUG_ON(ftbl == NULL);
564 return -EINVAL;
565 362
566 switch (arg[0]) { 363 switch (arg[0]) {
567 case '$': 364 case '$':
568 ret = parse_probe_vars(arg + 1, t, f, is_return); 365 ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
569 break; 366 break;
570 367
571 case '%': /* named register */ 368 case '%': /* named register */
@@ -577,7 +374,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
577 } 374 }
578 break; 375 break;
579 376
580 case '@': /* memory or symbol */ 377 case '@': /* memory, file-offset or symbol */
581 if (isdigit(arg[1])) { 378 if (isdigit(arg[1])) {
582 ret = kstrtoul(arg + 1, 0, &param); 379 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret) 380 if (ret)
@@ -585,7 +382,22 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
585 382
586 f->fn = t->fetch[FETCH_MTD_memory]; 383 f->fn = t->fetch[FETCH_MTD_memory];
587 f->data = (void *)param; 384 f->data = (void *)param;
385 } else if (arg[1] == '+') {
386 /* kprobes don't support file offsets */
387 if (is_kprobe)
388 return -EINVAL;
389
390 ret = kstrtol(arg + 2, 0, &offset);
391 if (ret)
392 break;
393
394 f->fn = t->fetch[FETCH_MTD_file_offset];
395 f->data = (void *)offset;
588 } else { 396 } else {
397 /* uprobes don't support symbols */
398 if (!is_kprobe)
399 return -EINVAL;
400
589 ret = traceprobe_split_symbol_offset(arg + 1, &offset); 401 ret = traceprobe_split_symbol_offset(arg + 1, &offset);
590 if (ret) 402 if (ret)
591 break; 403 break;
@@ -616,7 +428,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
616 struct deref_fetch_param *dprm; 428 struct deref_fetch_param *dprm;
617 const struct fetch_type *t2; 429 const struct fetch_type *t2;
618 430
619 t2 = find_fetch_type(NULL); 431 t2 = find_fetch_type(NULL, ftbl);
620 *tmp = '\0'; 432 *tmp = '\0';
621 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); 433 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
622 434
@@ -624,6 +436,9 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
624 return -ENOMEM; 436 return -ENOMEM;
625 437
626 dprm->offset = offset; 438 dprm->offset = offset;
439 dprm->fetch = t->fetch[FETCH_MTD_memory];
440 dprm->fetch_size = get_fetch_size_function(t,
441 dprm->fetch, ftbl);
627 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, 442 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
628 is_kprobe); 443 is_kprobe);
629 if (ret) 444 if (ret)
@@ -685,9 +500,13 @@ static int __parse_bitfield_probe_arg(const char *bf,
685int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 500int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
686 struct probe_arg *parg, bool is_return, bool is_kprobe) 501 struct probe_arg *parg, bool is_return, bool is_kprobe)
687{ 502{
503 const struct fetch_type *ftbl;
688 const char *t; 504 const char *t;
689 int ret; 505 int ret;
690 506
507 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
508 BUG_ON(ftbl == NULL);
509
691 if (strlen(arg) > MAX_ARGSTR_LEN) { 510 if (strlen(arg) > MAX_ARGSTR_LEN) {
692 pr_info("Argument is too long.: %s\n", arg); 511 pr_info("Argument is too long.: %s\n", arg);
693 return -ENOSPC; 512 return -ENOSPC;
@@ -702,7 +521,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
702 arg[t - parg->comm] = '\0'; 521 arg[t - parg->comm] = '\0';
703 t++; 522 t++;
704 } 523 }
705 parg->type = find_fetch_type(t); 524 parg->type = find_fetch_type(t, ftbl);
706 if (!parg->type) { 525 if (!parg->type) {
707 pr_info("Unsupported type: %s\n", t); 526 pr_info("Unsupported type: %s\n", t);
708 return -EINVAL; 527 return -EINVAL;
@@ -716,7 +535,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
716 535
717 if (ret >= 0) { 536 if (ret >= 0) {
718 parg->fetch_size.fn = get_fetch_size_function(parg->type, 537 parg->fetch_size.fn = get_fetch_size_function(parg->type,
719 parg->fetch.fn); 538 parg->fetch.fn,
539 ftbl);
720 parg->fetch_size.data = parg->fetch.data; 540 parg->fetch_size.data = parg->fetch.data;
721 } 541 }
722 542
@@ -837,3 +657,65 @@ out:
837 657
838 return ret; 658 return ret;
839} 659}
660
661static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
662 bool is_return)
663{
664 int i;
665 int pos = 0;
666
667 const char *fmt, *arg;
668
669 if (!is_return) {
670 fmt = "(%lx)";
671 arg = "REC->" FIELD_STRING_IP;
672 } else {
673 fmt = "(%lx <- %lx)";
674 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
675 }
676
677 /* When len=0, we just calculate the needed length */
678#define LEN_OR_ZERO (len ? len - pos : 0)
679
680 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
681
682 for (i = 0; i < tp->nr_args; i++) {
683 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
684 tp->args[i].name, tp->args[i].type->fmt);
685 }
686
687 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
688
689 for (i = 0; i < tp->nr_args; i++) {
690 if (strcmp(tp->args[i].type->name, "string") == 0)
691 pos += snprintf(buf + pos, LEN_OR_ZERO,
692 ", __get_str(%s)",
693 tp->args[i].name);
694 else
695 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
696 tp->args[i].name);
697 }
698
699#undef LEN_OR_ZERO
700
701 /* return the length of print_fmt */
702 return pos;
703}
704
705int set_print_fmt(struct trace_probe *tp, bool is_return)
706{
707 int len;
708 char *print_fmt;
709
710 /* First: called with 0 length to calculate the needed length */
711 len = __set_print_fmt(tp, NULL, 0, is_return);
712 print_fmt = kmalloc(len + 1, GFP_KERNEL);
713 if (!print_fmt)
714 return -ENOMEM;
715
716 /* Second: actually write the @print_fmt */
717 __set_print_fmt(tp, print_fmt, len + 1, is_return);
718 tp->call.print_fmt = print_fmt;
719
720 return 0;
721}
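set_print_fmt(), now shared through trace_probe.c, uses the common two-pass snprintf idiom: the first call runs with a zero length only to measure, the buffer is then allocated with that size, and the second call writes the format for real. A small user-space sketch of the same idiom follows; build_fmt() and its arguments are invented for the example and only mirror the shape of __set_print_fmt().

#include <stdio.h>
#include <stdlib.h>

/* With len == 0 nothing is written; snprintf() still reports the length. */
#define LEN_OR_ZERO (len ? len - pos : 0)

static int build_fmt(char *buf, int len, const char *name, const char *fmt)
{
        int pos = 0;

        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"(%%lx)");
        pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", name, fmt);
        pos += snprintf(buf + pos, LEN_OR_ZERO, "\", REC->ip");

        return pos;     /* number of bytes a real write would need */
}

int main(void)
{
        int len = build_fmt(NULL, 0, "arg1", "%d");     /* pass 1: measure */
        char *print_fmt = malloc(len + 1);

        if (!print_fmt)
                return 1;

        build_fmt(print_fmt, len + 1, "arg1", "%d");    /* pass 2: write */
        printf("print_fmt: %s\n", print_fmt);
        free(print_fmt);
        return 0;
}

The payoff is a single formatting routine that never needs a guessed buffer size, which is why the same helper can serve both kprobe and uprobe events once the is_return flag is passed in explicitly.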
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5c7e09d10d74..b73574a5f429 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -81,6 +81,17 @@
81 */ 81 */
82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) 82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
83 83
84static inline void *get_rloc_data(u32 *dl)
85{
86 return (u8 *)dl + get_rloc_offs(*dl);
87}
88
89/* For data_loc conversion */
90static inline void *get_loc_data(u32 *dl, void *ent)
91{
92 return (u8 *)ent + get_rloc_offs(*dl);
93}
94
84/* Data fetch function type */ 95/* Data fetch function type */
85typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); 96typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
86/* Printing function type */ 97/* Printing function type */
@@ -95,6 +106,7 @@ enum {
95 FETCH_MTD_symbol, 106 FETCH_MTD_symbol,
96 FETCH_MTD_deref, 107 FETCH_MTD_deref,
97 FETCH_MTD_bitfield, 108 FETCH_MTD_bitfield,
109 FETCH_MTD_file_offset,
98 FETCH_MTD_END, 110 FETCH_MTD_END,
99}; 111};
100 112
@@ -115,6 +127,148 @@ struct fetch_param {
115 void *data; 127 void *data;
116}; 128};
117 129
130/* For defining macros, define string/string_size types */
131typedef u32 string;
132typedef u32 string_size;
133
134#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
135#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
136
137/* Printing in basic type function template */
138#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
139__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
140 const char *name, \
141 void *data, void *ent); \
142extern const char PRINT_TYPE_FMT_NAME(type)[]
143
144DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
145DECLARE_BASIC_PRINT_TYPE_FUNC(u16);
146DECLARE_BASIC_PRINT_TYPE_FUNC(u32);
147DECLARE_BASIC_PRINT_TYPE_FUNC(u64);
148DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
149DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
150DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
151DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
152DECLARE_BASIC_PRINT_TYPE_FUNC(string);
153
154#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
155
156/* Declare macro for basic types */
157#define DECLARE_FETCH_FUNC(method, type) \
158extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \
159 void *data, void *dest)
160
161#define DECLARE_BASIC_FETCH_FUNCS(method) \
162DECLARE_FETCH_FUNC(method, u8); \
163DECLARE_FETCH_FUNC(method, u16); \
164DECLARE_FETCH_FUNC(method, u32); \
165DECLARE_FETCH_FUNC(method, u64)
166
167DECLARE_BASIC_FETCH_FUNCS(reg);
168#define fetch_reg_string NULL
169#define fetch_reg_string_size NULL
170
171DECLARE_BASIC_FETCH_FUNCS(retval);
172#define fetch_retval_string NULL
173#define fetch_retval_string_size NULL
174
175DECLARE_BASIC_FETCH_FUNCS(symbol);
176DECLARE_FETCH_FUNC(symbol, string);
177DECLARE_FETCH_FUNC(symbol, string_size);
178
179DECLARE_BASIC_FETCH_FUNCS(deref);
180DECLARE_FETCH_FUNC(deref, string);
181DECLARE_FETCH_FUNC(deref, string_size);
182
183DECLARE_BASIC_FETCH_FUNCS(bitfield);
184#define fetch_bitfield_string NULL
185#define fetch_bitfield_string_size NULL
186
187/*
188 * Define macro for basic types - we don't need to define s* types, because
189 * we have to care only about bitwidth at recording time.
190 */
191#define DEFINE_BASIC_FETCH_FUNCS(method) \
192DEFINE_FETCH_##method(u8) \
193DEFINE_FETCH_##method(u16) \
194DEFINE_FETCH_##method(u32) \
195DEFINE_FETCH_##method(u64)
196
197/* Default (unsigned long) fetch type */
198#define __DEFAULT_FETCH_TYPE(t) u##t
199#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
200#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
201#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
202
203#define ASSIGN_FETCH_FUNC(method, type) \
204 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
205
206#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
207 {.name = _name, \
208 .size = _size, \
209 .is_signed = sign, \
210 .print = PRINT_TYPE_FUNC_NAME(ptype), \
211 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
212 .fmttype = _fmttype, \
213 .fetch = { \
214ASSIGN_FETCH_FUNC(reg, ftype), \
215ASSIGN_FETCH_FUNC(stack, ftype), \
216ASSIGN_FETCH_FUNC(retval, ftype), \
217ASSIGN_FETCH_FUNC(memory, ftype), \
218ASSIGN_FETCH_FUNC(symbol, ftype), \
219ASSIGN_FETCH_FUNC(deref, ftype), \
220ASSIGN_FETCH_FUNC(bitfield, ftype), \
221ASSIGN_FETCH_FUNC(file_offset, ftype), \
222 } \
223 }
224
225#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
226 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
227
228#define ASSIGN_FETCH_TYPE_END {}
229
230#define FETCH_TYPE_STRING 0
231#define FETCH_TYPE_STRSIZE 1
232
233/*
234 * Fetch type information table.
235 * It's declared as a weak symbol due to conditional compilation.
236 */
237extern __weak const struct fetch_type kprobes_fetch_type_table[];
238extern __weak const struct fetch_type uprobes_fetch_type_table[];
239
240#ifdef CONFIG_KPROBE_EVENT
241struct symbol_cache;
242unsigned long update_symbol_cache(struct symbol_cache *sc);
243void free_symbol_cache(struct symbol_cache *sc);
244struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
245#else
246/* uprobes do not support symbol fetch methods */
247#define fetch_symbol_u8 NULL
248#define fetch_symbol_u16 NULL
249#define fetch_symbol_u32 NULL
250#define fetch_symbol_u64 NULL
251#define fetch_symbol_string NULL
252#define fetch_symbol_string_size NULL
253
254struct symbol_cache {
255};
256static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc)
257{
258 return 0;
259}
260
261static inline void __used free_symbol_cache(struct symbol_cache *sc)
262{
263}
264
265static inline struct symbol_cache * __used
266alloc_symbol_cache(const char *sym, long offset)
267{
268 return NULL;
269}
270#endif /* CONFIG_KPROBE_EVENT */
271
118struct probe_arg { 272struct probe_arg {
119 struct fetch_param fetch; 273 struct fetch_param fetch;
120 struct fetch_param fetch_size; 274 struct fetch_param fetch_size;
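The kprobes_fetch_type_table/uprobes_fetch_type_table declarations a few lines above rely on weak linkage: whichever table belongs to a probe flavour that is compiled out simply resolves to a NULL address instead of breaking the link, and the BUG_ON(ftbl == NULL) checks in trace_probe.c guard against calling into a missing one. The stand-alone sketch below shows that behaviour on a typical GCC/Clang ELF toolchain; the table contents are made up.

#include <stdio.h>

struct fetch_type { const char *name; };

/* Defined below, so this weak symbol resolves normally. */
extern const struct fetch_type kprobes_table[] __attribute__((weak));
/* Never defined anywhere: the weak reference resolves to a NULL address. */
extern const struct fetch_type uprobes_table[] __attribute__((weak));

const struct fetch_type kprobes_table[] = {
        { .name = "u8" },
        { .name = NULL },       /* table terminator, like ASSIGN_FETCH_TYPE_END */
};

int main(void)
{
        printf("kprobes table: %s\n", kprobes_table ? "present" : "absent");
        printf("uprobes table: %s\n", uprobes_table ? "present" : "absent");
        return 0;
}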
@@ -124,6 +278,26 @@ struct probe_arg {
124 const struct fetch_type *type; /* Type of this argument */ 278 const struct fetch_type *type; /* Type of this argument */
125}; 279};
126 280
281struct trace_probe {
282 unsigned int flags; /* For TP_FLAG_* */
283 struct ftrace_event_class class;
284 struct ftrace_event_call call;
285 struct list_head files;
286 ssize_t size; /* trace entry size */
287 unsigned int nr_args;
288 struct probe_arg args[];
289};
290
291static inline bool trace_probe_is_enabled(struct trace_probe *tp)
292{
293 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
294}
295
296static inline bool trace_probe_is_registered(struct trace_probe *tp)
297{
298 return !!(tp->flags & TP_FLAG_REGISTERED);
299}
300
127static inline __kprobes void call_fetch(struct fetch_param *fprm, 301static inline __kprobes void call_fetch(struct fetch_param *fprm,
128 struct pt_regs *regs, void *dest) 302 struct pt_regs *regs, void *dest)
129{ 303{
@@ -158,3 +332,53 @@ extern ssize_t traceprobe_probes_write(struct file *file,
158 int (*createfn)(int, char**)); 332 int (*createfn)(int, char**));
159 333
160extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); 334extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
335
336/* Sum up total data length for dynamic arraies (strings) */
337static inline __kprobes int
338__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
339{
340 int i, ret = 0;
341 u32 len;
342
343 for (i = 0; i < tp->nr_args; i++)
344 if (unlikely(tp->args[i].fetch_size.fn)) {
345 call_fetch(&tp->args[i].fetch_size, regs, &len);
346 ret += len;
347 }
348
349 return ret;
350}
351
352/* Store the value of each argument */
353static inline __kprobes void
354store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
355 u8 *data, int maxlen)
356{
357 int i;
358 u32 end = tp->size;
359 u32 *dl; /* Data (relative) location */
360
361 for (i = 0; i < tp->nr_args; i++) {
362 if (unlikely(tp->args[i].fetch_size.fn)) {
363 /*
364 * First, we set the relative location and
365 * maximum data length to *dl
366 */
367 dl = (u32 *)(data + tp->args[i].offset);
368 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
369 /* Then try to fetch string or dynamic array data */
370 call_fetch(&tp->args[i].fetch, regs, dl);
371 /* Reduce maximum length */
372 end += get_rloc_len(*dl);
373 maxlen -= get_rloc_len(*dl);
374 /* Trick here, convert data_rloc to data_loc */
375 *dl = convert_rloc_to_loc(*dl,
376 ent_size + tp->args[i].offset);
377 } else
378 /* Just fetching data normally */
379 call_fetch(&tp->args[i].fetch, regs,
380 data + tp->args[i].offset);
381 }
382}
383
384extern int set_print_fmt(struct trace_probe *tp, bool is_return);
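The struct trace_probe added above ends with a flexible array member, struct probe_arg args[], so a probe and all of its argument descriptors come from a single allocation sized by nr_args. A minimal user-space sketch of that allocation pattern, with the other fields trimmed away:

#include <stdio.h>
#include <stdlib.h>

struct probe_arg { const char *name; };

struct trace_probe_like {
        unsigned int nr_args;
        struct probe_arg args[];        /* flexible array member */
};

static struct trace_probe_like *alloc_probe(unsigned int nr_args)
{
        /* One allocation covers the header plus every argument slot. */
        struct trace_probe_like *tp =
                calloc(1, sizeof(*tp) + nr_args * sizeof(struct probe_arg));

        if (tp)
                tp->nr_args = nr_args;
        return tp;
}

int main(void)
{
        struct trace_probe_like *tp = alloc_probe(3);
        unsigned int i;

        if (!tp)
                return 1;
        for (i = 0; i < tp->nr_args; i++)
                tp->args[i].name = "argN";
        printf("nr_args = %u\n", tp->nr_args);
        free(tp);
        return 0;
}

The same layout is what lets __get_data_size() and store_trace_args() above iterate args[0..nr_args) directly off the probe object, whether it is embedded in a trace_kprobe or a trace_uprobe.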
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index fee77e15d815..6e32635e5e57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -16,6 +16,7 @@
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/sched/rt.h> 18#include <linux/sched/rt.h>
19#include <linux/sched/deadline.h>
19#include <trace/events/sched.h> 20#include <trace/events/sched.h>
20#include "trace.h" 21#include "trace.h"
21 22
@@ -27,6 +28,8 @@ static int wakeup_cpu;
27static int wakeup_current_cpu; 28static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 29static unsigned wakeup_prio = -1;
29static int wakeup_rt; 30static int wakeup_rt;
31static int wakeup_dl;
32static int tracing_dl = 0;
30 33
31static arch_spinlock_t wakeup_lock = 34static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 35 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr)
437{ 440{
438 wakeup_cpu = -1; 441 wakeup_cpu = -1;
439 wakeup_prio = -1; 442 wakeup_prio = -1;
443 tracing_dl = 0;
440 444
441 if (wakeup_task) 445 if (wakeup_task)
442 put_task_struct(wakeup_task); 446 put_task_struct(wakeup_task);
@@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
472 tracing_record_cmdline(p); 476 tracing_record_cmdline(p);
473 tracing_record_cmdline(current); 477 tracing_record_cmdline(current);
474 478
475 if ((wakeup_rt && !rt_task(p)) || 479 /*
476 p->prio >= wakeup_prio || 480 * Semantic is like this:
477 p->prio >= current->prio) 481 * - wakeup tracer handles all tasks in the system, independently
482 * from their scheduling class;
483 * - wakeup_rt tracer handles tasks belonging to sched_dl and
484 * sched_rt class;
485 * - wakeup_dl handles tasks belonging to sched_dl class only.
486 */
487 if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
488 (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
489 (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
478 return; 490 return;
479 491
480 pc = preempt_count(); 492 pc = preempt_count();
@@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
486 arch_spin_lock(&wakeup_lock); 498 arch_spin_lock(&wakeup_lock);
487 499
488 /* check for races. */ 500 /* check for races. */
489 if (!tracer_enabled || p->prio >= wakeup_prio) 501 if (!tracer_enabled || tracing_dl ||
502 (!dl_task(p) && p->prio >= wakeup_prio))
490 goto out_locked; 503 goto out_locked;
491 504
492 /* reset the trace */ 505 /* reset the trace */
@@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
496 wakeup_current_cpu = wakeup_cpu; 509 wakeup_current_cpu = wakeup_cpu;
497 wakeup_prio = p->prio; 510 wakeup_prio = p->prio;
498 511
512 /*
513 * Once you start tracing a -deadline task, don't bother tracing
514 * another task until the first one wakes up.
515 */
516 if (dl_task(p))
517 tracing_dl = 1;
518 else
519 tracing_dl = 0;
520
499 wakeup_task = p; 521 wakeup_task = p;
500 get_task_struct(wakeup_task); 522 get_task_struct(wakeup_task);
501 523
@@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr)
597 619
598static int wakeup_tracer_init(struct trace_array *tr) 620static int wakeup_tracer_init(struct trace_array *tr)
599{ 621{
622 wakeup_dl = 0;
600 wakeup_rt = 0; 623 wakeup_rt = 0;
601 return __wakeup_tracer_init(tr); 624 return __wakeup_tracer_init(tr);
602} 625}
603 626
604static int wakeup_rt_tracer_init(struct trace_array *tr) 627static int wakeup_rt_tracer_init(struct trace_array *tr)
605{ 628{
629 wakeup_dl = 0;
606 wakeup_rt = 1; 630 wakeup_rt = 1;
607 return __wakeup_tracer_init(tr); 631 return __wakeup_tracer_init(tr);
608} 632}
609 633
634static int wakeup_dl_tracer_init(struct trace_array *tr)
635{
636 wakeup_dl = 1;
637 wakeup_rt = 0;
638 return __wakeup_tracer_init(tr);
639}
640
610static void wakeup_tracer_reset(struct trace_array *tr) 641static void wakeup_tracer_reset(struct trace_array *tr)
611{ 642{
612 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; 643 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
@@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly =
674 .use_max_tr = true, 705 .use_max_tr = true,
675}; 706};
676 707
708static struct tracer wakeup_dl_tracer __read_mostly =
709{
710 .name = "wakeup_dl",
711 .init = wakeup_dl_tracer_init,
712 .reset = wakeup_tracer_reset,
713 .start = wakeup_tracer_start,
714 .stop = wakeup_tracer_stop,
715 .wait_pipe = poll_wait_pipe,
716 .print_max = true,
717 .print_header = wakeup_print_header,
718 .print_line = wakeup_print_line,
719 .flags = &tracer_flags,
720 .set_flag = wakeup_set_flag,
721 .flag_changed = wakeup_flag_changed,
722#ifdef CONFIG_FTRACE_SELFTEST
723 .selftest = trace_selftest_startup_wakeup,
724#endif
725 .open = wakeup_trace_open,
726 .close = wakeup_trace_close,
727 .use_max_tr = true,
728};
729
677__init static int init_wakeup_tracer(void) 730__init static int init_wakeup_tracer(void)
678{ 731{
679 int ret; 732 int ret;
@@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void)
686 if (ret) 739 if (ret)
687 return ret; 740 return ret;
688 741
742 ret = register_tracer(&wakeup_dl_tracer);
743 if (ret)
744 return ret;
745
689 return 0; 746 return 0;
690} 747}
691core_initcall(init_wakeup_tracer); 748core_initcall(init_wakeup_tracer);
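The comment block added to probe_wakeup() spells out the filtering semantics: plain wakeup traces every task, wakeup_rt traces -deadline and RT tasks, and wakeup_dl traces -deadline tasks only, with -deadline tasks also bypassing the priority comparison. The stand-alone sketch below condenses that condition into one predicate; the task fields and helpers are stand-ins for dl_task(), rt_task() and p->prio.

#include <stdbool.h>
#include <stdio.h>

struct task { bool dl; bool rt; int prio; };    /* lower prio value = higher priority */

/* Returns true when this wakeup should be ignored by the active tracer. */
static bool ignore_wakeup(const struct task *p, const struct task *curr,
                          bool wakeup_dl, bool wakeup_rt,
                          bool tracing_dl, int wakeup_prio)
{
        if (tracing_dl)                         /* already following a -deadline task */
                return true;
        if (wakeup_dl && !p->dl)                /* wakeup_dl: -deadline tasks only */
                return true;
        if (wakeup_rt && !p->dl && !p->rt)      /* wakeup_rt: -deadline or RT tasks */
                return true;
        if (!p->dl && (p->prio >= wakeup_prio || p->prio >= curr->prio))
                return true;                    /* plain wakeup: priority filter */
        return false;
}

int main(void)
{
        struct task dl  = { .dl = true,  .rt = false, .prio = -1  };
        struct task rt  = { .dl = false, .rt = true,  .prio = 10  };
        struct task cur = { .dl = false, .rt = false, .prio = 120 };

        /* With the wakeup_dl tracer active, only the -deadline task is traced. */
        printf("deadline task ignored: %d\n",
               ignore_wakeup(&dl, &cur, true, false, false, 50));
        printf("rt task ignored:       %d\n",
               ignore_wakeup(&rt, &cur, true, false, false, 50));
        return 0;
}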
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a7329b7902f8..e98fca60974f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
1022#ifdef CONFIG_SCHED_TRACER 1022#ifdef CONFIG_SCHED_TRACER
1023static int trace_wakeup_test_thread(void *data) 1023static int trace_wakeup_test_thread(void *data)
1024{ 1024{
1025 /* Make this a RT thread, doesn't need to be too high */ 1025 /* Make this a -deadline thread */
1026 static const struct sched_param param = { .sched_priority = 5 }; 1026 static const struct sched_attr attr = {
1027 .sched_policy = SCHED_DEADLINE,
1028 .sched_runtime = 100000ULL,
1029 .sched_deadline = 10000000ULL,
1030 .sched_period = 10000000ULL
1031 };
1027 struct completion *x = data; 1032 struct completion *x = data;
1028 1033
1029 sched_setscheduler(current, SCHED_FIFO, &param); 1034 sched_setattr(current, &attr);
1030 1035
1031 /* Make it know we have a new prio */ 1036 /* Make it know we have a new prio */
1032 complete(x); 1037 complete(x);
@@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data)
1040 /* we are awake, now wait to disappear */ 1045 /* we are awake, now wait to disappear */
1041 while (!kthread_should_stop()) { 1046 while (!kthread_should_stop()) {
1042 /* 1047 /*
1043 * This is an RT task, do short sleeps to let 1048 * This will likely be the system top priority
1044 * others run. 1049 * task, do short sleeps to let others run.
1045 */ 1050 */
1046 msleep(100); 1051 msleep(100);
1047 } 1052 }
@@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1054{ 1059{
1055 unsigned long save_max = tracing_max_latency; 1060 unsigned long save_max = tracing_max_latency;
1056 struct task_struct *p; 1061 struct task_struct *p;
1057 struct completion isrt; 1062 struct completion is_ready;
1058 unsigned long count; 1063 unsigned long count;
1059 int ret; 1064 int ret;
1060 1065
1061 init_completion(&isrt); 1066 init_completion(&is_ready);
1062 1067
1063 /* create a high prio thread */ 1068 /* create a -deadline thread */
1064 p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); 1069 p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
1065 if (IS_ERR(p)) { 1070 if (IS_ERR(p)) {
1066 printk(KERN_CONT "Failed to create ftrace wakeup test thread "); 1071 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
1067 return -1; 1072 return -1;
1068 } 1073 }
1069 1074
1070 /* make sure the thread is running at an RT prio */ 1075 /* make sure the thread is running at -deadline policy */
1071 wait_for_completion(&isrt); 1076 wait_for_completion(&is_ready);
1072 1077
1073 /* start the tracing */ 1078 /* start the tracing */
1074 ret = tracer_init(trace, tr); 1079 ret = tracer_init(trace, tr);
@@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1082 1087
1083 while (p->on_rq) { 1088 while (p->on_rq) {
1084 /* 1089 /*
1085 * Sleep to make sure the RT thread is asleep too. 1090 * Sleep to make sure the -deadline thread is asleep too.
1086 * On virtual machines we can't rely on timings, 1091 * On virtual machines we can't rely on timings,
1087 * but we want to make sure this test still works. 1092 * but we want to make sure this test still works.
1088 */ 1093 */
1089 msleep(100); 1094 msleep(100);
1090 } 1095 }
1091 1096
1092 init_completion(&isrt); 1097 init_completion(&is_ready);
1093 1098
1094 wake_up_process(p); 1099 wake_up_process(p);
1095 1100
1096 /* Wait for the task to wake up */ 1101 /* Wait for the task to wake up */
1097 wait_for_completion(&isrt); 1102 wait_for_completion(&is_ready);
1098 1103
1099 /* stop the tracing. */ 1104 /* stop the tracing. */
1100 tracing_stop(); 1105 tracing_stop();
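The selftest thread above now puts itself into SCHED_DEADLINE with sched_setattr(). Here is a minimal userspace sketch of the same call through the raw syscall, reusing the 100us runtime / 10ms deadline / 10ms period values from the selftest; the struct layout, the SCHED_DEADLINE fallback define and the availability of SYS_sched_setattr are assumptions about the target headers and libc.

/*
 * Hedged sketch: put the calling thread into SCHED_DEADLINE from userspace,
 * mirroring the parameters used by the selftest above.  glibc has no
 * wrapper for sched_setattr(), so the raw syscall is used; the struct
 * layout follows the uapi definition (an assumption about the headers).
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6
#endif

struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct sched_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_runtime  = 100000ULL,    /* 100 us, as in the selftest */
                .sched_deadline = 10000000ULL,  /*  10 ms */
                .sched_period   = 10000000ULL,  /*  10 ms */
        };

        if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
                perror("sched_setattr");
                return 1;
        }
        printf("now running as SCHED_DEADLINE\n");
        return 0;
}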
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b20428c5efe2..e6be585cf06a 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -382,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {
382 .open = stack_trace_filter_open, 382 .open = stack_trace_filter_open,
383 .read = seq_read, 383 .read = seq_read,
384 .write = ftrace_filter_write, 384 .write = ftrace_filter_write,
385 .llseek = ftrace_filter_lseek, 385 .llseek = tracing_lseek,
386 .release = ftrace_regex_release, 386 .release = ftrace_regex_release,
387}; 387};
388 388
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e4b6d11bdf78..759d5e004517 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -321,7 +321,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
321 if (!ftrace_file) 321 if (!ftrace_file)
322 return; 322 return;
323 323
324 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 324 if (ftrace_trigger_soft_disabled(ftrace_file))
325 return; 325 return;
326 326
327 sys_data = syscall_nr_to_meta(syscall_nr); 327 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -343,9 +343,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
343 entry->nr = syscall_nr; 343 entry->nr = syscall_nr;
344 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 344 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
345 345
346 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 346 event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
347 trace_current_buffer_unlock_commit(buffer, event, 347 irq_flags, pc);
348 irq_flags, pc);
349} 348}
350 349
351static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) 350static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -369,7 +368,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
369 if (!ftrace_file) 368 if (!ftrace_file)
370 return; 369 return;
371 370
372 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 371 if (ftrace_trigger_soft_disabled(ftrace_file))
373 return; 372 return;
374 373
375 sys_data = syscall_nr_to_meta(syscall_nr); 374 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -390,9 +389,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
390 entry->nr = syscall_nr; 389 entry->nr = syscall_nr;
391 entry->ret = syscall_get_return_value(current, regs); 390 entry->ret = syscall_get_return_value(current, regs);
392 391
393 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 392 event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
394 trace_current_buffer_unlock_commit(buffer, event, 393 irq_flags, pc);
395 irq_flags, pc);
396} 394}
397 395
398static int reg_event_syscall_enter(struct ftrace_event_file *file, 396static int reg_event_syscall_enter(struct ftrace_event_file *file,
@@ -431,11 +429,6 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
431 if (!tr->sys_refcount_enter) 429 if (!tr->sys_refcount_enter)
432 unregister_trace_sys_enter(ftrace_syscall_enter, tr); 430 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
433 mutex_unlock(&syscall_trace_lock); 431 mutex_unlock(&syscall_trace_lock);
434 /*
435 * Callers expect the event to be completely disabled on
436 * return, so wait for current handlers to finish.
437 */
438 synchronize_sched();
439} 432}
440 433
441static int reg_event_syscall_exit(struct ftrace_event_file *file, 434static int reg_event_syscall_exit(struct ftrace_event_file *file,
@@ -474,11 +467,6 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
474 if (!tr->sys_refcount_exit) 467 if (!tr->sys_refcount_exit)
475 unregister_trace_sys_exit(ftrace_syscall_exit, tr); 468 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
476 mutex_unlock(&syscall_trace_lock); 469 mutex_unlock(&syscall_trace_lock);
477 /*
478 * Callers expect the event to be completely disabled on
479 * return, so wait for current handlers to finish.
480 */
481 synchronize_sched();
482} 470}
483 471
484static int __init init_syscall_trace(struct ftrace_event_call *call) 472static int __init init_syscall_trace(struct ftrace_event_call *call)
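Both syscall event handlers above now gate on ftrace_trigger_soft_disabled() and commit through event_trigger_unlock_commit(), wiring syscall events into the new trigger machinery (trace_events_trigger.c in this series). A hedged sketch of the resulting user-visible interface follows; the event name, the trigger command and the tracefs path are assumptions, so adjust them to whatever the running kernel exposes under events/syscalls.

/*
 * Hedged sketch of the user-visible side of the trigger hookup above:
 * enable a syscall event and attach a "traceoff" trigger to it.  Event
 * name, trigger command and directory layout are assumptions about a
 * kernel that carries this patch set.
 */
#include <stdio.h>

#define EVT_DIR "/sys/kernel/debug/tracing/events/syscalls/sys_enter_openat/"

static int echo_to(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fputs(val, f);
        fclose(f);
        return 0;
}

int main(void)
{
        /* turn the event on ... */
        if (echo_to(EVT_DIR "enable", "1"))
                return 1;
        /* ... and stop tracing after 5 hits via the new trigger file */
        return echo_to(EVT_DIR "trigger", "traceoff:5") ? 1 : 0;
}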
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b6dcc42ef7f5..79e52d93860b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -51,22 +51,17 @@ struct trace_uprobe_filter {
51 */ 51 */
52struct trace_uprobe { 52struct trace_uprobe {
53 struct list_head list; 53 struct list_head list;
54 struct ftrace_event_class class;
55 struct ftrace_event_call call;
56 struct trace_uprobe_filter filter; 54 struct trace_uprobe_filter filter;
57 struct uprobe_consumer consumer; 55 struct uprobe_consumer consumer;
58 struct inode *inode; 56 struct inode *inode;
59 char *filename; 57 char *filename;
60 unsigned long offset; 58 unsigned long offset;
61 unsigned long nhit; 59 unsigned long nhit;
62 unsigned int flags; /* For TP_FLAG_* */ 60 struct trace_probe tp;
63 ssize_t size; /* trace entry size */
64 unsigned int nr_args;
65 struct probe_arg args[];
66}; 61};
67 62
68#define SIZEOF_TRACE_UPROBE(n) \ 63#define SIZEOF_TRACE_UPROBE(n) \
69 (offsetof(struct trace_uprobe, args) + \ 64 (offsetof(struct trace_uprobe, tp.args) + \
70 (sizeof(struct probe_arg) * (n))) 65 (sizeof(struct probe_arg) * (n)))
71 66
72static int register_uprobe_event(struct trace_uprobe *tu); 67static int register_uprobe_event(struct trace_uprobe *tu);
@@ -75,10 +70,151 @@ static int unregister_uprobe_event(struct trace_uprobe *tu);
75static DEFINE_MUTEX(uprobe_lock); 70static DEFINE_MUTEX(uprobe_lock);
76static LIST_HEAD(uprobe_list); 71static LIST_HEAD(uprobe_list);
77 72
73struct uprobe_dispatch_data {
74 struct trace_uprobe *tu;
75 unsigned long bp_addr;
76};
77
78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
79static int uretprobe_dispatcher(struct uprobe_consumer *con, 79static int uretprobe_dispatcher(struct uprobe_consumer *con,
80 unsigned long func, struct pt_regs *regs); 80 unsigned long func, struct pt_regs *regs);
81 81
82#ifdef CONFIG_STACK_GROWSUP
83static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
84{
85 return addr - (n * sizeof(long));
86}
87#else
88static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
89{
90 return addr + (n * sizeof(long));
91}
92#endif
93
94static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
95{
96 unsigned long ret;
97 unsigned long addr = user_stack_pointer(regs);
98
99 addr = adjust_stack_addr(addr, n);
100
101 if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret)))
102 return 0;
103
104 return ret;
105}
106
107/*
108 * Uprobes-specific fetch functions
109 */
110#define DEFINE_FETCH_stack(type) \
111static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
112 void *offset, void *dest) \
113{ \
114 *(type *)dest = (type)get_user_stack_nth(regs, \
115 ((unsigned long)offset)); \
116}
117DEFINE_BASIC_FETCH_FUNCS(stack)
118/* No string on the stack entry */
119#define fetch_stack_string NULL
120#define fetch_stack_string_size NULL
121
122#define DEFINE_FETCH_memory(type) \
123static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
124 void *addr, void *dest) \
125{ \
126 type retval; \
127 void __user *vaddr = (void __force __user *) addr; \
128 \
129 if (copy_from_user(&retval, vaddr, sizeof(type))) \
130 *(type *)dest = 0; \
131 else \
132 *(type *) dest = retval; \
133}
134DEFINE_BASIC_FETCH_FUNCS(memory)
135/*
136 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
137 * length and relative data location.
138 */
139static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
140 void *addr, void *dest)
141{
142 long ret;
143 u32 rloc = *(u32 *)dest;
144 int maxlen = get_rloc_len(rloc);
145 u8 *dst = get_rloc_data(dest);
146 void __user *src = (void __force __user *) addr;
147
148 if (!maxlen)
149 return;
150
151 ret = strncpy_from_user(dst, src, maxlen);
152
153 if (ret < 0) { /* Failed to fetch string */
154 ((u8 *)get_rloc_data(dest))[0] = '\0';
155 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc));
156 } else {
157 *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc));
158 }
159}
160
161static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
162 void *addr, void *dest)
163{
164 int len;
165 void __user *vaddr = (void __force __user *) addr;
166
167 len = strnlen_user(vaddr, MAX_STRING_SIZE);
168
169 if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */
170 *(u32 *)dest = 0;
171 else
172 *(u32 *)dest = len;
173}
174
175static unsigned long translate_user_vaddr(void *file_offset)
176{
177 unsigned long base_addr;
178 struct uprobe_dispatch_data *udd;
179
180 udd = (void *) current->utask->vaddr;
181
182 base_addr = udd->bp_addr - udd->tu->offset;
183 return base_addr + (unsigned long)file_offset;
184}
185
186#define DEFINE_FETCH_file_offset(type) \
187static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\
188 void *offset, void *dest) \
189{ \
190 void *vaddr = (void *)translate_user_vaddr(offset); \
191 \
192 FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \
193}
194DEFINE_BASIC_FETCH_FUNCS(file_offset)
195DEFINE_FETCH_file_offset(string)
196DEFINE_FETCH_file_offset(string_size)
197
198/* Fetch type information table */
199const struct fetch_type uprobes_fetch_type_table[] = {
200 /* Special types */
201 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
202 sizeof(u32), 1, "__data_loc char[]"),
203 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
204 string_size, sizeof(u32), 0, "u32"),
205 /* Basic types */
206 ASSIGN_FETCH_TYPE(u8, u8, 0),
207 ASSIGN_FETCH_TYPE(u16, u16, 0),
208 ASSIGN_FETCH_TYPE(u32, u32, 0),
209 ASSIGN_FETCH_TYPE(u64, u64, 0),
210 ASSIGN_FETCH_TYPE(s8, u8, 1),
211 ASSIGN_FETCH_TYPE(s16, u16, 1),
212 ASSIGN_FETCH_TYPE(s32, u32, 1),
213 ASSIGN_FETCH_TYPE(s64, u64, 1),
214
215 ASSIGN_FETCH_TYPE_END
216};
217
82static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) 218static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
83{ 219{
84 rwlock_init(&filter->rwlock); 220 rwlock_init(&filter->rwlock);
@@ -114,13 +250,13 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
114 if (!tu) 250 if (!tu)
115 return ERR_PTR(-ENOMEM); 251 return ERR_PTR(-ENOMEM);
116 252
117 tu->call.class = &tu->class; 253 tu->tp.call.class = &tu->tp.class;
118 tu->call.name = kstrdup(event, GFP_KERNEL); 254 tu->tp.call.name = kstrdup(event, GFP_KERNEL);
119 if (!tu->call.name) 255 if (!tu->tp.call.name)
120 goto error; 256 goto error;
121 257
122 tu->class.system = kstrdup(group, GFP_KERNEL); 258 tu->tp.class.system = kstrdup(group, GFP_KERNEL);
123 if (!tu->class.system) 259 if (!tu->tp.class.system)
124 goto error; 260 goto error;
125 261
126 INIT_LIST_HEAD(&tu->list); 262 INIT_LIST_HEAD(&tu->list);
@@ -128,11 +264,11 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
128 if (is_ret) 264 if (is_ret)
129 tu->consumer.ret_handler = uretprobe_dispatcher; 265 tu->consumer.ret_handler = uretprobe_dispatcher;
130 init_trace_uprobe_filter(&tu->filter); 266 init_trace_uprobe_filter(&tu->filter);
131 tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; 267 tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
132 return tu; 268 return tu;
133 269
134error: 270error:
135 kfree(tu->call.name); 271 kfree(tu->tp.call.name);
136 kfree(tu); 272 kfree(tu);
137 273
138 return ERR_PTR(-ENOMEM); 274 return ERR_PTR(-ENOMEM);
@@ -142,12 +278,12 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
142{ 278{
143 int i; 279 int i;
144 280
145 for (i = 0; i < tu->nr_args; i++) 281 for (i = 0; i < tu->tp.nr_args; i++)
146 traceprobe_free_probe_arg(&tu->args[i]); 282 traceprobe_free_probe_arg(&tu->tp.args[i]);
147 283
148 iput(tu->inode); 284 iput(tu->inode);
149 kfree(tu->call.class->system); 285 kfree(tu->tp.call.class->system);
150 kfree(tu->call.name); 286 kfree(tu->tp.call.name);
151 kfree(tu->filename); 287 kfree(tu->filename);
152 kfree(tu); 288 kfree(tu);
153} 289}
@@ -157,8 +293,8 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
157 struct trace_uprobe *tu; 293 struct trace_uprobe *tu;
158 294
159 list_for_each_entry(tu, &uprobe_list, list) 295 list_for_each_entry(tu, &uprobe_list, list)
160 if (strcmp(tu->call.name, event) == 0 && 296 if (strcmp(tu->tp.call.name, event) == 0 &&
161 strcmp(tu->call.class->system, group) == 0) 297 strcmp(tu->tp.call.class->system, group) == 0)
162 return tu; 298 return tu;
163 299
164 return NULL; 300 return NULL;
@@ -181,16 +317,16 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
181/* Register a trace_uprobe and probe_event */ 317/* Register a trace_uprobe and probe_event */
182static int register_trace_uprobe(struct trace_uprobe *tu) 318static int register_trace_uprobe(struct trace_uprobe *tu)
183{ 319{
184 struct trace_uprobe *old_tp; 320 struct trace_uprobe *old_tu;
185 int ret; 321 int ret;
186 322
187 mutex_lock(&uprobe_lock); 323 mutex_lock(&uprobe_lock);
188 324
189 /* register as an event */ 325 /* register as an event */
190 old_tp = find_probe_event(tu->call.name, tu->call.class->system); 326 old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system);
191 if (old_tp) { 327 if (old_tu) {
192 /* delete old event */ 328 /* delete old event */
193 ret = unregister_trace_uprobe(old_tp); 329 ret = unregister_trace_uprobe(old_tu);
194 if (ret) 330 if (ret)
195 goto end; 331 goto end;
196 } 332 }
@@ -211,7 +347,7 @@ end:
211 347
212/* 348/*
213 * Argument syntax: 349 * Argument syntax:
214 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] 350 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
215 * 351 *
216 * - Remove uprobe: -:[GRP/]EVENT 352 * - Remove uprobe: -:[GRP/]EVENT
217 */ 353 */
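The comment above now documents probes as PATH:OFFSET, and the fetch functions added earlier in this file accept $stackN, @ADDR, @+file_offset and typed arguments for uprobes. A hedged sketch of defining such a probe from userspace; the binary, the 0x4710 offset and the argument specs are placeholders, not values taken from the patch, and the uprobe_events location is an assumption.

/*
 * Hedged sketch: create a uprobe event using the syntax documented above.
 * The binary, the offset and the fetch args are made-up placeholders;
 * pick a real PATH:OFFSET (e.g. from "objdump -d") before running.
 */
#include <stdio.h>

#define UPROBE_EVENTS "/sys/kernel/debug/tracing/uprobe_events"  /* assumed */

int main(void)
{
        FILE *f = fopen(UPROBE_EVENTS, "w");

        if (!f) {
                perror(UPROBE_EVENTS);
                return 1;
        }
        /* event "myprobe": first stack slot plus a u32 at file offset 0x600 */
        fputs("p:uprobes/myprobe /bin/true:0x4710 slot0=$stack0 word=@+0x600:u32\n", f);
        fclose(f);

        /* events/uprobes/myprobe/enable can now be set to 1 to arm the probe */
        return 0;
}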
@@ -360,34 +496,36 @@ static int create_trace_uprobe(int argc, char **argv)
360 /* parse arguments */ 496 /* parse arguments */
361 ret = 0; 497 ret = 0;
362 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 498 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
499 struct probe_arg *parg = &tu->tp.args[i];
500
363 /* Increment count for freeing args in error case */ 501 /* Increment count for freeing args in error case */
364 tu->nr_args++; 502 tu->tp.nr_args++;
365 503
366 /* Parse argument name */ 504 /* Parse argument name */
367 arg = strchr(argv[i], '='); 505 arg = strchr(argv[i], '=');
368 if (arg) { 506 if (arg) {
369 *arg++ = '\0'; 507 *arg++ = '\0';
370 tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); 508 parg->name = kstrdup(argv[i], GFP_KERNEL);
371 } else { 509 } else {
372 arg = argv[i]; 510 arg = argv[i];
373 /* If argument name is omitted, set "argN" */ 511 /* If argument name is omitted, set "argN" */
374 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); 512 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
375 tu->args[i].name = kstrdup(buf, GFP_KERNEL); 513 parg->name = kstrdup(buf, GFP_KERNEL);
376 } 514 }
377 515
378 if (!tu->args[i].name) { 516 if (!parg->name) {
379 pr_info("Failed to allocate argument[%d] name.\n", i); 517 pr_info("Failed to allocate argument[%d] name.\n", i);
380 ret = -ENOMEM; 518 ret = -ENOMEM;
381 goto error; 519 goto error;
382 } 520 }
383 521
384 if (!is_good_name(tu->args[i].name)) { 522 if (!is_good_name(parg->name)) {
385 pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); 523 pr_info("Invalid argument[%d] name: %s\n", i, parg->name);
386 ret = -EINVAL; 524 ret = -EINVAL;
387 goto error; 525 goto error;
388 } 526 }
389 527
390 if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { 528 if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) {
391 pr_info("Argument[%d] name '%s' conflicts with " 529 pr_info("Argument[%d] name '%s' conflicts with "
392 "another field.\n", i, argv[i]); 530 "another field.\n", i, argv[i]);
393 ret = -EINVAL; 531 ret = -EINVAL;
@@ -395,7 +533,8 @@ static int create_trace_uprobe(int argc, char **argv)
395 } 533 }
396 534
397 /* Parse fetch argument */ 535 /* Parse fetch argument */
398 ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); 536 ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
537 is_return, false);
399 if (ret) { 538 if (ret) {
400 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 539 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
401 goto error; 540 goto error;
@@ -459,11 +598,11 @@ static int probes_seq_show(struct seq_file *m, void *v)
459 char c = is_ret_probe(tu) ? 'r' : 'p'; 598 char c = is_ret_probe(tu) ? 'r' : 'p';
460 int i; 599 int i;
461 600
462 seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); 601 seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name);
463 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 602 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
464 603
465 for (i = 0; i < tu->nr_args; i++) 604 for (i = 0; i < tu->tp.nr_args; i++)
466 seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); 605 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
467 606
468 seq_printf(m, "\n"); 607 seq_printf(m, "\n");
469 return 0; 608 return 0;
@@ -509,7 +648,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
509{ 648{
510 struct trace_uprobe *tu = v; 649 struct trace_uprobe *tu = v;
511 650
512 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); 651 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit);
513 return 0; 652 return 0;
514} 653}
515 654
@@ -533,21 +672,117 @@ static const struct file_operations uprobe_profile_ops = {
533 .release = seq_release, 672 .release = seq_release,
534}; 673};
535 674
675struct uprobe_cpu_buffer {
676 struct mutex mutex;
677 void *buf;
678};
679static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
680static int uprobe_buffer_refcnt;
681
682static int uprobe_buffer_init(void)
683{
684 int cpu, err_cpu;
685
686 uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer);
687 if (uprobe_cpu_buffer == NULL)
688 return -ENOMEM;
689
690 for_each_possible_cpu(cpu) {
691 struct page *p = alloc_pages_node(cpu_to_node(cpu),
692 GFP_KERNEL, 0);
693 if (p == NULL) {
694 err_cpu = cpu;
695 goto err;
696 }
697 per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p);
698 mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex);
699 }
700
701 return 0;
702
703err:
704 for_each_possible_cpu(cpu) {
705 if (cpu == err_cpu)
706 break;
707 free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf);
708 }
709
710 free_percpu(uprobe_cpu_buffer);
711 return -ENOMEM;
712}
713
714static int uprobe_buffer_enable(void)
715{
716 int ret = 0;
717
718 BUG_ON(!mutex_is_locked(&event_mutex));
719
720 if (uprobe_buffer_refcnt++ == 0) {
721 ret = uprobe_buffer_init();
722 if (ret < 0)
723 uprobe_buffer_refcnt--;
724 }
725
726 return ret;
727}
728
729static void uprobe_buffer_disable(void)
730{
731 BUG_ON(!mutex_is_locked(&event_mutex));
732
733 if (--uprobe_buffer_refcnt == 0) {
734 free_percpu(uprobe_cpu_buffer);
735 uprobe_cpu_buffer = NULL;
736 }
737}
738
739static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
740{
741 struct uprobe_cpu_buffer *ucb;
742 int cpu;
743
744 cpu = raw_smp_processor_id();
745 ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu);
746
747 /*
748 * Use per-cpu buffers for fastest access, but we might migrate
749 * so the mutex makes sure we have sole access to it.
750 */
751 mutex_lock(&ucb->mutex);
752
753 return ucb;
754}
755
756static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
757{
758 mutex_unlock(&ucb->mutex);
759}
760
536static void uprobe_trace_print(struct trace_uprobe *tu, 761static void uprobe_trace_print(struct trace_uprobe *tu,
537 unsigned long func, struct pt_regs *regs) 762 unsigned long func, struct pt_regs *regs)
538{ 763{
539 struct uprobe_trace_entry_head *entry; 764 struct uprobe_trace_entry_head *entry;
540 struct ring_buffer_event *event; 765 struct ring_buffer_event *event;
541 struct ring_buffer *buffer; 766 struct ring_buffer *buffer;
767 struct uprobe_cpu_buffer *ucb;
542 void *data; 768 void *data;
543 int size, i; 769 int size, dsize, esize;
544 struct ftrace_event_call *call = &tu->call; 770 struct ftrace_event_call *call = &tu->tp.call;
771
772 dsize = __get_data_size(&tu->tp, regs);
773 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
545 774
546 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 775 if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE))
776 return;
777
778 ucb = uprobe_buffer_get();
779 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
780
781 size = esize + tu->tp.size + dsize;
547 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 782 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
548 size + tu->size, 0, 0); 783 size, 0, 0);
549 if (!event) 784 if (!event)
550 return; 785 goto out;
551 786
552 entry = ring_buffer_event_data(event); 787 entry = ring_buffer_event_data(event);
553 if (is_ret_probe(tu)) { 788 if (is_ret_probe(tu)) {
@@ -559,11 +794,13 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
559 data = DATAOF_TRACE_ENTRY(entry, false); 794 data = DATAOF_TRACE_ENTRY(entry, false);
560 } 795 }
561 796
562 for (i = 0; i < tu->nr_args; i++) 797 memcpy(data, ucb->buf, tu->tp.size + dsize);
563 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
564 798
565 if (!call_filter_check_discard(call, entry, buffer, event)) 799 if (!call_filter_check_discard(call, entry, buffer, event))
566 trace_buffer_unlock_commit(buffer, event, 0, 0); 800 trace_buffer_unlock_commit(buffer, event, 0, 0);
801
802out:
803 uprobe_buffer_put(ucb);
567} 804}
568 805
569/* uprobe handler */ 806/* uprobe handler */
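uprobe_trace_print() above now stages its arguments through uprobe_buffer_get()/uprobe_buffer_put(): one scratch page per CPU, each guarded by its own mutex so a handler that migrates mid-way keeps exclusive use of the buffer it picked. Below is a stand-alone module sketch of that allocation pattern; the struct and symbol names are invented for illustration and are not part of the patch.

/*
 * Hedged sketch of the per-CPU "buffer + mutex" pattern used above.
 * Names (pcpu_scratch, scratch_demo_*) are invented for illustration.
 */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <linux/gfp.h>

struct pcpu_scratch {
        struct mutex lock;
        void *buf;
};

static struct pcpu_scratch __percpu *scratch;

static int __init scratch_demo_init(void)
{
        int cpu;

        scratch = alloc_percpu(struct pcpu_scratch);
        if (!scratch)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                struct pcpu_scratch *s = per_cpu_ptr(scratch, cpu);

                mutex_init(&s->lock);
                s->buf = (void *)__get_free_page(GFP_KERNEL);
                if (!s->buf)
                        goto err;
        }

        /*
         * A user would mirror uprobe_buffer_get()/put(): pick the buffer of
         * the current CPU, take its mutex, use s->buf, then unlock.
         */
        return 0;

err:
        for_each_possible_cpu(cpu)
                free_page((unsigned long)per_cpu_ptr(scratch, cpu)->buf);
        free_percpu(scratch);
        return -ENOMEM;
}

static void __exit scratch_demo_exit(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                free_page((unsigned long)per_cpu_ptr(scratch, cpu)->buf);
        free_percpu(scratch);
}

module_init(scratch_demo_init);
module_exit(scratch_demo_exit);
MODULE_LICENSE("GPL");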
@@ -591,23 +828,24 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
591 int i; 828 int i;
592 829
593 entry = (struct uprobe_trace_entry_head *)iter->ent; 830 entry = (struct uprobe_trace_entry_head *)iter->ent;
594 tu = container_of(event, struct trace_uprobe, call.event); 831 tu = container_of(event, struct trace_uprobe, tp.call.event);
595 832
596 if (is_ret_probe(tu)) { 833 if (is_ret_probe(tu)) {
597 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, 834 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name,
598 entry->vaddr[1], entry->vaddr[0])) 835 entry->vaddr[1], entry->vaddr[0]))
599 goto partial; 836 goto partial;
600 data = DATAOF_TRACE_ENTRY(entry, true); 837 data = DATAOF_TRACE_ENTRY(entry, true);
601 } else { 838 } else {
602 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, 839 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name,
603 entry->vaddr[0])) 840 entry->vaddr[0]))
604 goto partial; 841 goto partial;
605 data = DATAOF_TRACE_ENTRY(entry, false); 842 data = DATAOF_TRACE_ENTRY(entry, false);
606 } 843 }
607 844
608 for (i = 0; i < tu->nr_args; i++) { 845 for (i = 0; i < tu->tp.nr_args; i++) {
609 if (!tu->args[i].type->print(s, tu->args[i].name, 846 struct probe_arg *parg = &tu->tp.args[i];
610 data + tu->args[i].offset, entry)) 847
848 if (!parg->type->print(s, parg->name, data + parg->offset, entry))
611 goto partial; 849 goto partial;
612 } 850 }
613 851
@@ -618,11 +856,6 @@ partial:
618 return TRACE_TYPE_PARTIAL_LINE; 856 return TRACE_TYPE_PARTIAL_LINE;
619} 857}
620 858
621static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
622{
623 return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
624}
625
626typedef bool (*filter_func_t)(struct uprobe_consumer *self, 859typedef bool (*filter_func_t)(struct uprobe_consumer *self,
627 enum uprobe_filter_ctx ctx, 860 enum uprobe_filter_ctx ctx,
628 struct mm_struct *mm); 861 struct mm_struct *mm);
@@ -632,29 +865,35 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
632{ 865{
633 int ret = 0; 866 int ret = 0;
634 867
635 if (is_trace_uprobe_enabled(tu)) 868 if (trace_probe_is_enabled(&tu->tp))
636 return -EINTR; 869 return -EINTR;
637 870
871 ret = uprobe_buffer_enable();
872 if (ret < 0)
873 return ret;
874
638 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 875 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
639 876
640 tu->flags |= flag; 877 tu->tp.flags |= flag;
641 tu->consumer.filter = filter; 878 tu->consumer.filter = filter;
642 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 879 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
643 if (ret) 880 if (ret)
644 tu->flags &= ~flag; 881 tu->tp.flags &= ~flag;
645 882
646 return ret; 883 return ret;
647} 884}
648 885
649static void probe_event_disable(struct trace_uprobe *tu, int flag) 886static void probe_event_disable(struct trace_uprobe *tu, int flag)
650{ 887{
651 if (!is_trace_uprobe_enabled(tu)) 888 if (!trace_probe_is_enabled(&tu->tp))
652 return; 889 return;
653 890
654 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 891 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
655 892
656 uprobe_unregister(tu->inode, tu->offset, &tu->consumer); 893 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
657 tu->flags &= ~flag; 894 tu->tp.flags &= ~flag;
895
896 uprobe_buffer_disable();
658} 897}
659 898
660static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 899static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -672,12 +911,12 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
672 size = SIZEOF_TRACE_ENTRY(false); 911 size = SIZEOF_TRACE_ENTRY(false);
673 } 912 }
674 /* Set argument names as fields */ 913 /* Set argument names as fields */
675 for (i = 0; i < tu->nr_args; i++) { 914 for (i = 0; i < tu->tp.nr_args; i++) {
676 ret = trace_define_field(event_call, tu->args[i].type->fmttype, 915 struct probe_arg *parg = &tu->tp.args[i];
677 tu->args[i].name, 916
678 size + tu->args[i].offset, 917 ret = trace_define_field(event_call, parg->type->fmttype,
679 tu->args[i].type->size, 918 parg->name, size + parg->offset,
680 tu->args[i].type->is_signed, 919 parg->type->size, parg->type->is_signed,
681 FILTER_OTHER); 920 FILTER_OTHER);
682 921
683 if (ret) 922 if (ret)
@@ -686,59 +925,6 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
686 return 0; 925 return 0;
687} 926}
688 927
689#define LEN_OR_ZERO (len ? len - pos : 0)
690static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
691{
692 const char *fmt, *arg;
693 int i;
694 int pos = 0;
695
696 if (is_ret_probe(tu)) {
697 fmt = "(%lx <- %lx)";
698 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
699 } else {
700 fmt = "(%lx)";
701 arg = "REC->" FIELD_STRING_IP;
702 }
703
704 /* When len=0, we just calculate the needed length */
705
706 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
707
708 for (i = 0; i < tu->nr_args; i++) {
709 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
710 tu->args[i].name, tu->args[i].type->fmt);
711 }
712
713 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
714
715 for (i = 0; i < tu->nr_args; i++) {
716 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
717 tu->args[i].name);
718 }
719
720 return pos; /* return the length of print_fmt */
721}
722#undef LEN_OR_ZERO
723
724static int set_print_fmt(struct trace_uprobe *tu)
725{
726 char *print_fmt;
727 int len;
728
729 /* First: called with 0 length to calculate the needed length */
730 len = __set_print_fmt(tu, NULL, 0);
731 print_fmt = kmalloc(len + 1, GFP_KERNEL);
732 if (!print_fmt)
733 return -ENOMEM;
734
735 /* Second: actually write the @print_fmt */
736 __set_print_fmt(tu, print_fmt, len + 1);
737 tu->call.print_fmt = print_fmt;
738
739 return 0;
740}
741
742#ifdef CONFIG_PERF_EVENTS 928#ifdef CONFIG_PERF_EVENTS
743static bool 929static bool
744__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) 930__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
@@ -831,14 +1017,27 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
831static void uprobe_perf_print(struct trace_uprobe *tu, 1017static void uprobe_perf_print(struct trace_uprobe *tu,
832 unsigned long func, struct pt_regs *regs) 1018 unsigned long func, struct pt_regs *regs)
833{ 1019{
834 struct ftrace_event_call *call = &tu->call; 1020 struct ftrace_event_call *call = &tu->tp.call;
835 struct uprobe_trace_entry_head *entry; 1021 struct uprobe_trace_entry_head *entry;
836 struct hlist_head *head; 1022 struct hlist_head *head;
1023 struct uprobe_cpu_buffer *ucb;
837 void *data; 1024 void *data;
838 int size, rctx, i; 1025 int size, dsize, esize;
1026 int rctx;
839 1027
840 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 1028 dsize = __get_data_size(&tu->tp, regs);
841 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); 1029 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1030
1031 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1032 return;
1033
1034 size = esize + tu->tp.size + dsize;
1035 size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
1036 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
1037 return;
1038
1039 ucb = uprobe_buffer_get();
1040 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
842 1041
843 preempt_disable(); 1042 preempt_disable();
844 head = this_cpu_ptr(call->perf_events); 1043 head = this_cpu_ptr(call->perf_events);
@@ -858,12 +1057,18 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
858 data = DATAOF_TRACE_ENTRY(entry, false); 1057 data = DATAOF_TRACE_ENTRY(entry, false);
859 } 1058 }
860 1059
861 for (i = 0; i < tu->nr_args; i++) 1060 memcpy(data, ucb->buf, tu->tp.size + dsize);
862 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 1061
1062 if (size - esize > tu->tp.size + dsize) {
1063 int len = tu->tp.size + dsize;
1064
1065 memset(data + len, 0, size - esize - len);
1066 }
863 1067
864 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1068 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
865 out: 1069 out:
866 preempt_enable(); 1070 preempt_enable();
1071 uprobe_buffer_put(ucb);
867} 1072}
868 1073
869/* uprobe profile handler */ 1074/* uprobe profile handler */
@@ -921,16 +1126,22 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
921static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 1126static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
922{ 1127{
923 struct trace_uprobe *tu; 1128 struct trace_uprobe *tu;
1129 struct uprobe_dispatch_data udd;
924 int ret = 0; 1130 int ret = 0;
925 1131
926 tu = container_of(con, struct trace_uprobe, consumer); 1132 tu = container_of(con, struct trace_uprobe, consumer);
927 tu->nhit++; 1133 tu->nhit++;
928 1134
929 if (tu->flags & TP_FLAG_TRACE) 1135 udd.tu = tu;
1136 udd.bp_addr = instruction_pointer(regs);
1137
1138 current->utask->vaddr = (unsigned long) &udd;
1139
1140 if (tu->tp.flags & TP_FLAG_TRACE)
930 ret |= uprobe_trace_func(tu, regs); 1141 ret |= uprobe_trace_func(tu, regs);
931 1142
932#ifdef CONFIG_PERF_EVENTS 1143#ifdef CONFIG_PERF_EVENTS
933 if (tu->flags & TP_FLAG_PROFILE) 1144 if (tu->tp.flags & TP_FLAG_PROFILE)
934 ret |= uprobe_perf_func(tu, regs); 1145 ret |= uprobe_perf_func(tu, regs);
935#endif 1146#endif
936 return ret; 1147 return ret;
@@ -940,14 +1151,20 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
940 unsigned long func, struct pt_regs *regs) 1151 unsigned long func, struct pt_regs *regs)
941{ 1152{
942 struct trace_uprobe *tu; 1153 struct trace_uprobe *tu;
1154 struct uprobe_dispatch_data udd;
943 1155
944 tu = container_of(con, struct trace_uprobe, consumer); 1156 tu = container_of(con, struct trace_uprobe, consumer);
945 1157
946 if (tu->flags & TP_FLAG_TRACE) 1158 udd.tu = tu;
1159 udd.bp_addr = func;
1160
1161 current->utask->vaddr = (unsigned long) &udd;
1162
1163 if (tu->tp.flags & TP_FLAG_TRACE)
947 uretprobe_trace_func(tu, func, regs); 1164 uretprobe_trace_func(tu, func, regs);
948 1165
949#ifdef CONFIG_PERF_EVENTS 1166#ifdef CONFIG_PERF_EVENTS
950 if (tu->flags & TP_FLAG_PROFILE) 1167 if (tu->tp.flags & TP_FLAG_PROFILE)
951 uretprobe_perf_func(tu, func, regs); 1168 uretprobe_perf_func(tu, func, regs);
952#endif 1169#endif
953 return 0; 1170 return 0;
@@ -959,7 +1176,7 @@ static struct trace_event_functions uprobe_funcs = {
959 1176
960static int register_uprobe_event(struct trace_uprobe *tu) 1177static int register_uprobe_event(struct trace_uprobe *tu)
961{ 1178{
962 struct ftrace_event_call *call = &tu->call; 1179 struct ftrace_event_call *call = &tu->tp.call;
963 int ret; 1180 int ret;
964 1181
965 /* Initialize ftrace_event_call */ 1182 /* Initialize ftrace_event_call */
@@ -967,7 +1184,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
967 call->event.funcs = &uprobe_funcs; 1184 call->event.funcs = &uprobe_funcs;
968 call->class->define_fields = uprobe_event_define_fields; 1185 call->class->define_fields = uprobe_event_define_fields;
969 1186
970 if (set_print_fmt(tu) < 0) 1187 if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
971 return -ENOMEM; 1188 return -ENOMEM;
972 1189
973 ret = register_ftrace_event(&call->event); 1190 ret = register_ftrace_event(&call->event);
@@ -994,11 +1211,11 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
994 int ret; 1211 int ret;
995 1212
996 /* tu->event is unregistered in trace_remove_event_call() */ 1213 /* tu->event is unregistered in trace_remove_event_call() */
997 ret = trace_remove_event_call(&tu->call); 1214 ret = trace_remove_event_call(&tu->tp.call);
998 if (ret) 1215 if (ret)
999 return ret; 1216 return ret;
1000 kfree(tu->call.print_fmt); 1217 kfree(tu->tp.call.print_fmt);
1001 tu->call.print_fmt = NULL; 1218 tu->tp.call.print_fmt = NULL;
1002 return 0; 1219 return 0;
1003} 1220}
1004 1221
diff --git a/kernel/user.c b/kernel/user.c
index a3a0dbfda329..c006131beb77 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,9 +51,9 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54#ifdef CONFIG_KEYS_KERBEROS_CACHE 54#ifdef CONFIG_PERSISTENT_KEYRINGS
55 .krb_cache_register_sem = 55 .persistent_keyring_register_sem =
56 __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem), 56 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
57#endif 57#endif
58}; 58};
59EXPORT_SYMBOL_GPL(init_user_ns); 59EXPORT_SYMBOL_GPL(init_user_ns);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 987293d03ebc..82ef9f3b7473 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
305/* I: attributes used when instantiating standard unbound pools on demand */ 305/* I: attributes used when instantiating standard unbound pools on demand */
306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; 306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
307 307
308/* I: attributes used when instantiating ordered pools on demand */
309static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
310
308struct workqueue_struct *system_wq __read_mostly; 311struct workqueue_struct *system_wq __read_mostly;
309EXPORT_SYMBOL(system_wq); 312EXPORT_SYMBOL(system_wq);
310struct workqueue_struct *system_highpri_wq __read_mostly; 313struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { }
518static inline void debug_work_deactivate(struct work_struct *work) { } 521static inline void debug_work_deactivate(struct work_struct *work) { }
519#endif 522#endif
520 523
521/* allocate ID and assign it to @pool */ 524/**
525 * worker_pool_assign_id - allocate ID and assign it to @pool
526 * @pool: the pool pointer of interest
527 *
528 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
529 * successfully, -errno on failure.
530 */
522static int worker_pool_assign_id(struct worker_pool *pool) 531static int worker_pool_assign_id(struct worker_pool *pool)
523{ 532{
524 int ret; 533 int ret;
525 534
526 lockdep_assert_held(&wq_pool_mutex); 535 lockdep_assert_held(&wq_pool_mutex);
527 536
528 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); 537 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
538 GFP_KERNEL);
529 if (ret >= 0) { 539 if (ret >= 0) {
530 pool->id = ret; 540 pool->id = ret;
531 return 0; 541 return 0;
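worker_pool_assign_id() above now passes WORK_OFFQ_POOL_NONE as the exclusive upper bound to idr_alloc(), so pool IDs can never exceed what the off-queue encoding can store (which is why the BUILD_BUG_ON is dropped later in this patch). A minimal module sketch of bounded idr_alloc() follows; the idr name and dummy object are invented for illustration.

/*
 * Hedged sketch of bounded ID allocation with idr_alloc(), mirroring the
 * [0, WORK_OFFQ_POOL_NONE) range used above.
 */
#include <linux/module.h>
#include <linux/idr.h>

static DEFINE_IDR(demo_idr);
static int demo_id = -1;

static int __init idr_demo_init(void)
{
        static int dummy;       /* any non-NULL pointer will do */
        int id;

        /* IDs are handed out from 0 up to, but not including, 256 */
        id = idr_alloc(&demo_idr, &dummy, 0, 256, GFP_KERNEL);
        if (id < 0)
                return id;

        demo_id = id;
        pr_info("idr_demo: got id %d\n", id);
        return 0;
}

static void __exit idr_demo_exit(void)
{
        if (demo_id >= 0)
                idr_remove(&demo_idr, demo_id);
        idr_destroy(&demo_idr);
}

module_init(idr_demo_init);
module_exit(idr_demo_exit);
MODULE_LICENSE("GPL");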
@@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
1320 1330
1321 debug_work_activate(work); 1331 debug_work_activate(work);
1322 1332
1323 /* if dying, only works from the same workqueue are allowed */ 1333 /* if draining, only works from the same workqueue are allowed */
1324 if (unlikely(wq->flags & __WQ_DRAINING) && 1334 if (unlikely(wq->flags & __WQ_DRAINING) &&
1325 WARN_ON_ONCE(!is_chained_work(wq))) 1335 WARN_ON_ONCE(!is_chained_work(wq)))
1326 return; 1336 return;
@@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool)
1736 if (IS_ERR(worker->task)) 1746 if (IS_ERR(worker->task))
1737 goto fail; 1747 goto fail;
1738 1748
1749 set_user_nice(worker->task, pool->attrs->nice);
1750
1751 /* prevent userland from meddling with cpumask of workqueue workers */
1752 worker->task->flags |= PF_NO_SETAFFINITY;
1753
1739 /* 1754 /*
1740 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1755 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1741 * online CPUs. It'll be re-applied when any of the CPUs come up. 1756 * online CPUs. It'll be re-applied when any of the CPUs come up.
1742 */ 1757 */
1743 set_user_nice(worker->task, pool->attrs->nice);
1744 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); 1758 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1745 1759
1746 /* prevent userland from meddling with cpumask of workqueue workers */
1747 worker->task->flags |= PF_NO_SETAFFINITY;
1748
1749 /* 1760 /*
1750 * The caller is responsible for ensuring %POOL_DISASSOCIATED 1761 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1751 * remains stable across this function. See the comments above the 1762 * remains stable across this function. See the comments above the
@@ -2840,19 +2851,6 @@ already_gone:
2840 return false; 2851 return false;
2841} 2852}
2842 2853
2843static bool __flush_work(struct work_struct *work)
2844{
2845 struct wq_barrier barr;
2846
2847 if (start_flush_work(work, &barr)) {
2848 wait_for_completion(&barr.done);
2849 destroy_work_on_stack(&barr.work);
2850 return true;
2851 } else {
2852 return false;
2853 }
2854}
2855
2856/** 2854/**
2857 * flush_work - wait for a work to finish executing the last queueing instance 2855 * flush_work - wait for a work to finish executing the last queueing instance
2858 * @work: the work to flush 2856 * @work: the work to flush
@@ -2866,10 +2864,18 @@ static bool __flush_work(struct work_struct *work)
2866 */ 2864 */
2867bool flush_work(struct work_struct *work) 2865bool flush_work(struct work_struct *work)
2868{ 2866{
2867 struct wq_barrier barr;
2868
2869 lock_map_acquire(&work->lockdep_map); 2869 lock_map_acquire(&work->lockdep_map);
2870 lock_map_release(&work->lockdep_map); 2870 lock_map_release(&work->lockdep_map);
2871 2871
2872 return __flush_work(work); 2872 if (start_flush_work(work, &barr)) {
2873 wait_for_completion(&barr.done);
2874 destroy_work_on_stack(&barr.work);
2875 return true;
2876 } else {
2877 return false;
2878 }
2873} 2879}
2874EXPORT_SYMBOL_GPL(flush_work); 2880EXPORT_SYMBOL_GPL(flush_work);
2875 2881
@@ -4106,7 +4112,7 @@ out_unlock:
4106static int alloc_and_link_pwqs(struct workqueue_struct *wq) 4112static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4107{ 4113{
4108 bool highpri = wq->flags & WQ_HIGHPRI; 4114 bool highpri = wq->flags & WQ_HIGHPRI;
4109 int cpu; 4115 int cpu, ret;
4110 4116
4111 if (!(wq->flags & WQ_UNBOUND)) { 4117 if (!(wq->flags & WQ_UNBOUND)) {
4112 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); 4118 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4126,6 +4132,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4126 mutex_unlock(&wq->mutex); 4132 mutex_unlock(&wq->mutex);
4127 } 4133 }
4128 return 0; 4134 return 0;
4135 } else if (wq->flags & __WQ_ORDERED) {
4136 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
4137 /* there should only be single pwq for ordering guarantee */
4138 WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
4139 wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
4140 "ordering guarantee broken for workqueue %s\n", wq->name);
4141 return ret;
4129 } else { 4142 } else {
4130 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); 4143 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
4131 } 4144 }
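The new __WQ_ORDERED branch above applies ordered_wq_attrs so an ordered workqueue ends up with exactly one pool_workqueue, which is what makes its works execute strictly one at a time and in queueing order. A module sketch that consumes that guarantee through alloc_ordered_workqueue(); the workqueue name and work functions are invented for illustration.

/*
 * Hedged sketch: an ordered workqueue executes items one at a time and in
 * queueing order, the guarantee the branch above preserves.
 */
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_ordered_wq;

static void demo_first(struct work_struct *work)
{
        pr_info("ordered-demo: first\n");
}

static void demo_second(struct work_struct *work)
{
        pr_info("ordered-demo: second (always after first)\n");
}

static DECLARE_WORK(first_work, demo_first);
static DECLARE_WORK(second_work, demo_second);

static int __init ordered_demo_init(void)
{
        demo_ordered_wq = alloc_ordered_workqueue("ordered-demo", 0);
        if (!demo_ordered_wq)
                return -ENOMEM;

        /* max_active == 1 plus the single pwq keep these strictly serialized */
        queue_work(demo_ordered_wq, &first_work);
        queue_work(demo_ordered_wq, &second_work);
        return 0;
}

static void __exit ordered_demo_exit(void)
{
        flush_workqueue(demo_ordered_wq);
        destroy_workqueue(demo_ordered_wq);
}

module_init(ordered_demo_init);
module_exit(ordered_demo_exit);
MODULE_LICENSE("GPL");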
@@ -4776,6 +4789,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4776 4789
4777 /* wait for per-cpu unbinding to finish */ 4790 /* wait for per-cpu unbinding to finish */
4778 flush_work(&unbind_work); 4791 flush_work(&unbind_work);
4792 destroy_work_on_stack(&unbind_work);
4779 break; 4793 break;
4780 } 4794 }
4781 return NOTIFY_OK; 4795 return NOTIFY_OK;
@@ -4814,14 +4828,8 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4814 4828
4815 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4829 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4816 schedule_work_on(cpu, &wfc.work); 4830 schedule_work_on(cpu, &wfc.work);
4817 4831 flush_work(&wfc.work);
4818 /* 4832 destroy_work_on_stack(&wfc.work);
4819 * The work item is on-stack and can't lead to deadlock through
4820 * flushing. Use __flush_work() to avoid spurious lockdep warnings
4821 * when work_on_cpu()s are nested.
4822 */
4823 __flush_work(&wfc.work);
4824
4825 return wfc.ret; 4833 return wfc.ret;
4826} 4834}
4827EXPORT_SYMBOL_GPL(work_on_cpu); 4835EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -5009,10 +5017,6 @@ static int __init init_workqueues(void)
5009 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 5017 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5010 int i, cpu; 5018 int i, cpu;
5011 5019
5012 /* make sure we have enough bits for OFFQ pool ID */
5013 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
5014 WORK_CPU_END * NR_STD_WORKER_POOLS);
5015
5016 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5020 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5017 5021
5018 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5022 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5051,13 +5055,23 @@ static int __init init_workqueues(void)
5051 } 5055 }
5052 } 5056 }
5053 5057
5054 /* create default unbound wq attrs */ 5058 /* create default unbound and ordered wq attrs */
5055 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 5059 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
5056 struct workqueue_attrs *attrs; 5060 struct workqueue_attrs *attrs;
5057 5061
5058 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); 5062 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5059 attrs->nice = std_nice[i]; 5063 attrs->nice = std_nice[i];
5060 unbound_std_wq_attrs[i] = attrs; 5064 unbound_std_wq_attrs[i] = attrs;
5065
5066 /*
5067 * An ordered wq should have only one pwq as ordering is
5068 * guaranteed by max_active which is enforced by pwqs.
5069 * Turn off NUMA so that dfl_pwq is used for all nodes.
5070 */
5071 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5072 attrs->nice = std_nice[i];
5073 attrs->no_numa = true;
5074 ordered_wq_attrs[i] = attrs;
5061 } 5075 }
5062 5076
5063 system_wq = alloc_workqueue("events", 0, 0); 5077 system_wq = alloc_workqueue("events", 0, 0);