path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 77
-rw-r--r--  kernel/acct.c | 10
-rw-r--r--  kernel/audit.c | 144
-rw-r--r--  kernel/audit.h | 11
-rw-r--r--  kernel/audit_watch.c | 5
-rw-r--r--  kernel/auditfilter.c | 202
-rw-r--r--  kernel/auditsc.c | 510
-rw-r--r--  kernel/cgroup.c | 332
-rw-r--r--  kernel/cgroup_freezer.c | 8
-rw-r--r--  kernel/cpu.c | 4
-rw-r--r--  kernel/cred.c | 10
-rw-r--r--  kernel/debug/debug_core.c | 32
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 33
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 33
-rw-r--r--  kernel/events/core.c | 81
-rw-r--r--  kernel/events/uprobes.c | 353
-rw-r--r--  kernel/exit.c | 100
-rw-r--r--  kernel/fork.c | 46
-rw-r--r--  kernel/irq/irqdomain.c | 33
-rw-r--r--  kernel/jump_label.c | 1
-rw-r--r--  kernel/kexec.c | 1
-rw-r--r--  kernel/kmod.c | 7
-rw-r--r--  kernel/kthread.c | 1
-rw-r--r--  kernel/modsign_pubkey.c | 113
-rw-r--r--  kernel/module-internal.h | 14
-rw-r--r--  kernel/module.c | 149
-rw-r--r--  kernel/module_signing.c | 249
-rw-r--r--  kernel/pid.c | 1
-rw-r--r--  kernel/pid_namespace.c | 21
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 2
-rw-r--r--  kernel/power/qos.c | 1
-rw-r--r--  kernel/printk.c | 1
-rw-r--r--  kernel/ptrace.c | 3
-rw-r--r--  kernel/rcutree.c | 21
-rw-r--r--  kernel/rcutree.h | 6
-rw-r--r--  kernel/resource.c | 50
-rw-r--r--  kernel/sched/core.c | 71
-rw-r--r--  kernel/signal.c | 3
-rw-r--r--  kernel/srcu.c | 4
-rw-r--r--  kernel/sys.c | 29
-rw-r--r--  kernel/sysctl.c | 15
-rw-r--r--  kernel/taskstats.c | 39
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/alarmtimer.c | 118
-rw-r--r--  kernel/time/clockevents.c | 24
-rw-r--r--  kernel/time/jiffies.c | 32
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 119
-rw-r--r--  kernel/timer.c | 10
-rw-r--r--  kernel/trace/trace.c | 11
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_functions.c | 15
-rw-r--r--  kernel/tsacct.c | 12
-rw-r--r--  kernel/user.c | 8
-rw-r--r--  kernel/user_namespace.c | 128
-rw-r--r--  kernel/workqueue.c | 1217
60 files changed, 2846 insertions(+), 1692 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 5404911eaee9..0dfeca4324ee 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
@@ -130,3 +131,79 @@ quiet_cmd_timeconst = TIMEC $@
 targets += timeconst.h
 $(obj)/timeconst.h: $(src)/timeconst.pl FORCE
 	$(call if_changed,timeconst)
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+#
+# Pull the signing certificate and any extra certificates into the kernel
+#
+extra_certificates:
+	touch $@
+
+kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
+
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+sign_key_with_hash :=
+ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
+sign_key_with_hash := -sha1
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
+sign_key_with_hash := -sha224
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
+sign_key_with_hash := -sha256
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
+sign_key_with_hash := -sha384
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
+sign_key_with_hash := -sha512
+endif
+ifeq ($(sign_key_with_hash),)
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+	@echo "###"
+	@echo "### Now generating an X.509 key pair to be used for signing modules."
+	@echo "###"
+	@echo "### If this takes a long time, you might wish to run rngd in the"
+	@echo "### background to keep the supply of entropy topped up.  It"
+	@echo "### needs to be run as root, and should use a hardware random"
+	@echo "### number generator if one is available, eg:"
+	@echo "###"
+	@echo "###     rngd -r /dev/hwrandom"
+	@echo "###"
+	openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
+		-x509 -config x509.genkey \
+		-outform DER -out signing_key.x509 \
+		-keyout signing_key.priv
+	@echo "###"
+	@echo "### Key pair generated."
+	@echo "###"
+
+x509.genkey:
+	@echo Generating X.509 key generation config
+	@echo  >x509.genkey "[ req ]"
+	@echo >>x509.genkey "default_bits = 4096"
+	@echo >>x509.genkey "distinguished_name = req_distinguished_name"
+	@echo >>x509.genkey "prompt = no"
+	@echo >>x509.genkey "string_mask = utf8only"
+	@echo >>x509.genkey "x509_extensions = myexts"
+	@echo >>x509.genkey
+	@echo >>x509.genkey "[ req_distinguished_name ]"
+	@echo >>x509.genkey "O = Magrathea"
+	@echo >>x509.genkey "CN = Glacier signing key"
+	@echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+	@echo >>x509.genkey
+	@echo >>x509.genkey "[ myexts ]"
+	@echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+	@echo >>x509.genkey "keyUsage=digitalSignature"
+	@echo >>x509.genkey "subjectKeyIdentifier=hash"
+	@echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 02e6167a53b0..051e071a06e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 	}
 }
 
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
 {
 	struct file *file;
 	struct vfsmount *mnt;
@@ -201,7 +201,7 @@ static int acct_on(char *name)
 	struct bsd_acct_struct *acct = NULL;
 
 	/* Difference from BSD - they don't do O_APPEND */
-	file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+	file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
@@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		return -EPERM;
 
 	if (name) {
-		char *tmp = getname(name);
+		struct filename *tmp = getname(name);
 		if (IS_ERR(tmp))
 			return (PTR_ERR(tmp));
 		error = acct_on(tmp);
@@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	do_div(elapsed, AHZ);
 	ac.ac_btime = get_seconds() - elapsed;
 	/* we really need to bite the bullet and change layout */
-	ac.ac_uid = orig_cred->uid;
-	ac.ac_gid = orig_cred->gid;
+	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
 #if ACCT_VERSION==2
 	ac.ac_ahz = AHZ;
 #endif
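
Note: the acct.c hunks above stop writing raw cred uids/gids into the on-disk accounting record and instead translate the kernel-internal kuid_t/kgid_t through the accounting file opener's user namespace. The following is a minimal sketch, not taken from the patch, of that conversion pattern; the helper names are hypothetical and init_user_ns is used purely for illustration.

/*
 * Sketch of kuid_t <-> numeric uid conversion as used by the acct/audit
 * changes in this series. Helper names are illustrative only.
 */
#include <linux/uidgid.h>
#include <linux/cred.h>

static uid_t example_uid_for_disk(void)
{
	kuid_t kuid = current_uid();	/* namespace-neutral kernel uid */

	/* Map back to a numeric uid as seen from init_user_ns; ids with no
	 * mapping come back as the overflow uid instead of failing. */
	return from_kuid_munged(&init_user_ns, kuid);
}

static int example_uid_from_user(uid_t val, kuid_t *out)
{
	kuid_t kuid = make_kuid(current_user_ns(), val);

	if (!uid_valid(kuid))		/* no mapping in this namespace */
		return -EINVAL;
	*out = kuid;
	return 0;
}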
diff --git a/kernel/audit.c b/kernel/audit.c
index ea3b7b6191c7..40414e9143db 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,6 +61,7 @@
61#include <linux/netlink.h> 61#include <linux/netlink.h>
62#include <linux/freezer.h> 62#include <linux/freezer.h>
63#include <linux/tty.h> 63#include <linux/tty.h>
64#include <linux/pid_namespace.h>
64 65
65#include "audit.h" 66#include "audit.h"
66 67
@@ -87,11 +88,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
87 88
88/* 89/*
89 * If audit records are to be written to the netlink socket, audit_pid 90 * If audit records are to be written to the netlink socket, audit_pid
90 * contains the pid of the auditd process and audit_nlk_pid contains 91 * contains the pid of the auditd process and audit_nlk_portid contains
91 * the pid to use to send netlink messages to that process. 92 * the portid to use to send netlink messages to that process.
92 */ 93 */
93int audit_pid; 94int audit_pid;
94static int audit_nlk_pid; 95static int audit_nlk_portid;
95 96
96/* If audit_rate_limit is non-zero, limit the rate of sending audit records 97/* If audit_rate_limit is non-zero, limit the rate of sending audit records
97 * to that number per second. This prevents DoS attacks, but results in 98 * to that number per second. This prevents DoS attacks, but results in
@@ -104,7 +105,7 @@ static int audit_backlog_wait_time = 60 * HZ;
104static int audit_backlog_wait_overflow = 0; 105static int audit_backlog_wait_overflow = 0;
105 106
106/* The identity of the user shutting down the audit system. */ 107/* The identity of the user shutting down the audit system. */
107uid_t audit_sig_uid = -1; 108kuid_t audit_sig_uid = INVALID_UID;
108pid_t audit_sig_pid = -1; 109pid_t audit_sig_pid = -1;
109u32 audit_sig_sid = 0; 110u32 audit_sig_sid = 0;
110 111
@@ -264,7 +265,7 @@ void audit_log_lost(const char *message)
264} 265}
265 266
266static int audit_log_config_change(char *function_name, int new, int old, 267static int audit_log_config_change(char *function_name, int new, int old,
267 uid_t loginuid, u32 sessionid, u32 sid, 268 kuid_t loginuid, u32 sessionid, u32 sid,
268 int allow_changes) 269 int allow_changes)
269{ 270{
270 struct audit_buffer *ab; 271 struct audit_buffer *ab;
@@ -272,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
272 273
273 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
274 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, 275 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
275 old, loginuid, sessionid); 276 old, from_kuid(&init_user_ns, loginuid), sessionid);
276 if (sid) { 277 if (sid) {
277 char *ctx = NULL; 278 char *ctx = NULL;
278 u32 len; 279 u32 len;
@@ -292,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
292} 293}
293 294
294static int audit_do_config_change(char *function_name, int *to_change, 295static int audit_do_config_change(char *function_name, int *to_change,
295 int new, uid_t loginuid, u32 sessionid, 296 int new, kuid_t loginuid, u32 sessionid,
296 u32 sid) 297 u32 sid)
297{ 298{
298 int allow_changes, rc = 0, old = *to_change; 299 int allow_changes, rc = 0, old = *to_change;
@@ -319,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change,
319 return rc; 320 return rc;
320} 321}
321 322
322static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, 323static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid,
323 u32 sid) 324 u32 sid)
324{ 325{
325 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, 326 return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
326 limit, loginuid, sessionid, sid); 327 limit, loginuid, sessionid, sid);
327} 328}
328 329
329static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, 330static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid,
330 u32 sid) 331 u32 sid)
331{ 332{
332 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, 333 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
333 limit, loginuid, sessionid, sid); 334 limit, loginuid, sessionid, sid);
334} 335}
335 336
336static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) 337static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid)
337{ 338{
338 int rc; 339 int rc;
339 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 340 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
@@ -348,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
348 return rc; 349 return rc;
349} 350}
350 351
351static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) 352static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid)
352{ 353{
353 if (state != AUDIT_FAIL_SILENT 354 if (state != AUDIT_FAIL_SILENT
354 && state != AUDIT_FAIL_PRINTK 355 && state != AUDIT_FAIL_PRINTK
@@ -401,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
401 int err; 402 int err;
402 /* take a reference in case we can't send it and we want to hold it */ 403 /* take a reference in case we can't send it and we want to hold it */
403 skb_get(skb); 404 skb_get(skb);
404 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 405 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
405 if (err < 0) { 406 if (err < 0) {
406 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 407 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
407 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 408 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
@@ -467,24 +468,6 @@ static int kauditd_thread(void *dummy)
467 return 0; 468 return 0;
468} 469}
469 470
470static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
471{
472 struct task_struct *tsk;
473 int err;
474
475 rcu_read_lock();
476 tsk = find_task_by_vpid(pid);
477 if (!tsk) {
478 rcu_read_unlock();
479 return -ESRCH;
480 }
481 get_task_struct(tsk);
482 rcu_read_unlock();
483 err = tty_audit_push_task(tsk, loginuid, sessionid);
484 put_task_struct(tsk);
485 return err;
486}
487
488int audit_send_list(void *_dest) 471int audit_send_list(void *_dest)
489{ 472{
490 struct audit_netlink_list *dest = _dest; 473 struct audit_netlink_list *dest = _dest;
@@ -588,6 +571,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
588{ 571{
589 int err = 0; 572 int err = 0;
590 573
574 /* Only support the initial namespaces for now. */
575 if ((current_user_ns() != &init_user_ns) ||
576 (task_active_pid_ns(current) != &init_pid_ns))
577 return -EPERM;
578
591 switch (msg_type) { 579 switch (msg_type) {
592 case AUDIT_GET: 580 case AUDIT_GET:
593 case AUDIT_LIST: 581 case AUDIT_LIST:
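
Note: the hunk above restricts audit netlink configuration to tasks in the initial user and pid namespaces. A minimal standalone sketch of that guard follows; example_ns_ok() is a hypothetical wrapper used only for illustration.

#include <linux/user_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/sched.h>

static int example_ns_ok(void)
{
	/* Only the initial namespaces may configure auditing for now. */
	if ((current_user_ns() != &init_user_ns) ||
	    (task_active_pid_ns(current) != &init_pid_ns))
		return -EPERM;
	return 0;
}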
@@ -619,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
619} 607}
620 608
621static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, 609static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
622 u32 pid, u32 uid, uid_t auid, u32 ses, 610 kuid_t auid, u32 ses, u32 sid)
623 u32 sid)
624{ 611{
625 int rc = 0; 612 int rc = 0;
626 char *ctx = NULL; 613 char *ctx = NULL;
@@ -633,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
633 620
634 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 621 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
635 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", 622 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
636 pid, uid, auid, ses); 623 task_tgid_vnr(current),
624 from_kuid(&init_user_ns, current_uid()),
625 from_kuid(&init_user_ns, auid), ses);
637 if (sid) { 626 if (sid) {
638 rc = security_secid_to_secctx(sid, &ctx, &len); 627 rc = security_secid_to_secctx(sid, &ctx, &len);
639 if (rc) 628 if (rc)
@@ -649,13 +638,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
649 638
650static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 639static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
651{ 640{
652 u32 uid, pid, seq, sid; 641 u32 seq, sid;
653 void *data; 642 void *data;
654 struct audit_status *status_get, status_set; 643 struct audit_status *status_get, status_set;
655 int err; 644 int err;
656 struct audit_buffer *ab; 645 struct audit_buffer *ab;
657 u16 msg_type = nlh->nlmsg_type; 646 u16 msg_type = nlh->nlmsg_type;
658 uid_t loginuid; /* loginuid of sender */ 647 kuid_t loginuid; /* loginuid of sender */
659 u32 sessionid; 648 u32 sessionid;
660 struct audit_sig_info *sig_data; 649 struct audit_sig_info *sig_data;
661 char *ctx = NULL; 650 char *ctx = NULL;
@@ -675,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
675 return err; 664 return err;
676 } 665 }
677 666
678 pid = NETLINK_CREDS(skb)->pid;
679 uid = NETLINK_CREDS(skb)->uid;
680 loginuid = audit_get_loginuid(current); 667 loginuid = audit_get_loginuid(current);
681 sessionid = audit_get_sessionid(current); 668 sessionid = audit_get_sessionid(current);
682 security_task_getsecid(current, &sid); 669 security_task_getsecid(current, &sid);
@@ -692,7 +679,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
692 status_set.backlog_limit = audit_backlog_limit; 679 status_set.backlog_limit = audit_backlog_limit;
693 status_set.lost = atomic_read(&audit_lost); 680 status_set.lost = atomic_read(&audit_lost);
694 status_set.backlog = skb_queue_len(&audit_skb_queue); 681 status_set.backlog = skb_queue_len(&audit_skb_queue);
695 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, 682 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
696 &status_set, sizeof(status_set)); 683 &status_set, sizeof(status_set));
697 break; 684 break;
698 case AUDIT_SET: 685 case AUDIT_SET:
@@ -720,7 +707,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
720 sessionid, sid, 1); 707 sessionid, sid, 1);
721 708
722 audit_pid = new_pid; 709 audit_pid = new_pid;
723 audit_nlk_pid = NETLINK_CB(skb).pid; 710 audit_nlk_portid = NETLINK_CB(skb).portid;
724 } 711 }
725 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { 712 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
726 err = audit_set_rate_limit(status_get->rate_limit, 713 err = audit_set_rate_limit(status_get->rate_limit,
@@ -738,16 +725,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
738 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 725 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
739 return 0; 726 return 0;
740 727
741 err = audit_filter_user(&NETLINK_CB(skb)); 728 err = audit_filter_user();
742 if (err == 1) { 729 if (err == 1) {
743 err = 0; 730 err = 0;
744 if (msg_type == AUDIT_USER_TTY) { 731 if (msg_type == AUDIT_USER_TTY) {
745 err = audit_prepare_user_tty(pid, loginuid, 732 err = tty_audit_push_task(current, loginuid,
746 sessionid); 733 sessionid);
747 if (err) 734 if (err)
748 break; 735 break;
749 } 736 }
750 audit_log_common_recv_msg(&ab, msg_type, pid, uid, 737 audit_log_common_recv_msg(&ab, msg_type,
751 loginuid, sessionid, sid); 738 loginuid, sessionid, sid);
752 739
753 if (msg_type != AUDIT_USER_TTY) 740 if (msg_type != AUDIT_USER_TTY)
@@ -763,7 +750,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
763 size--; 750 size--;
764 audit_log_n_untrustedstring(ab, data, size); 751 audit_log_n_untrustedstring(ab, data, size);
765 } 752 }
766 audit_set_pid(ab, pid); 753 audit_set_pid(ab, NETLINK_CB(skb).portid);
767 audit_log_end(ab); 754 audit_log_end(ab);
768 } 755 }
769 break; 756 break;
@@ -772,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
772 if (nlmsg_len(nlh) < sizeof(struct audit_rule)) 759 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
773 return -EINVAL; 760 return -EINVAL;
774 if (audit_enabled == AUDIT_LOCKED) { 761 if (audit_enabled == AUDIT_LOCKED) {
775 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 762 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
776 uid, loginuid, sessionid, sid); 763 loginuid, sessionid, sid);
777 764
778 audit_log_format(ab, " audit_enabled=%d res=0", 765 audit_log_format(ab, " audit_enabled=%d res=0",
779 audit_enabled); 766 audit_enabled);
@@ -782,8 +769,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
782 } 769 }
783 /* fallthrough */ 770 /* fallthrough */
784 case AUDIT_LIST: 771 case AUDIT_LIST:
785 err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, 772 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
786 uid, seq, data, nlmsg_len(nlh), 773 seq, data, nlmsg_len(nlh),
787 loginuid, sessionid, sid); 774 loginuid, sessionid, sid);
788 break; 775 break;
789 case AUDIT_ADD_RULE: 776 case AUDIT_ADD_RULE:
@@ -791,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
791 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 778 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
792 return -EINVAL; 779 return -EINVAL;
793 if (audit_enabled == AUDIT_LOCKED) { 780 if (audit_enabled == AUDIT_LOCKED) {
794 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 781 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
795 uid, loginuid, sessionid, sid); 782 loginuid, sessionid, sid);
796 783
797 audit_log_format(ab, " audit_enabled=%d res=0", 784 audit_log_format(ab, " audit_enabled=%d res=0",
798 audit_enabled); 785 audit_enabled);
@@ -801,15 +788,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
801 } 788 }
802 /* fallthrough */ 789 /* fallthrough */
803 case AUDIT_LIST_RULES: 790 case AUDIT_LIST_RULES:
804 err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, 791 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
805 uid, seq, data, nlmsg_len(nlh), 792 seq, data, nlmsg_len(nlh),
806 loginuid, sessionid, sid); 793 loginuid, sessionid, sid);
807 break; 794 break;
808 case AUDIT_TRIM: 795 case AUDIT_TRIM:
809 audit_trim_trees(); 796 audit_trim_trees();
810 797
811 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 798 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
812 uid, loginuid, sessionid, sid); 799 loginuid, sessionid, sid);
813 800
814 audit_log_format(ab, " op=trim res=1"); 801 audit_log_format(ab, " op=trim res=1");
815 audit_log_end(ab); 802 audit_log_end(ab);
@@ -840,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
840 /* OK, here comes... */ 827 /* OK, here comes... */
841 err = audit_tag_tree(old, new); 828 err = audit_tag_tree(old, new);
842 829
843 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 830 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
844 uid, loginuid, sessionid, sid); 831 loginuid, sessionid, sid);
845 832
846 audit_log_format(ab, " op=make_equiv old="); 833 audit_log_format(ab, " op=make_equiv old=");
847 audit_log_untrustedstring(ab, old); 834 audit_log_untrustedstring(ab, old);
@@ -866,53 +853,41 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
866 security_release_secctx(ctx, len); 853 security_release_secctx(ctx, len);
867 return -ENOMEM; 854 return -ENOMEM;
868 } 855 }
869 sig_data->uid = audit_sig_uid; 856 sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
870 sig_data->pid = audit_sig_pid; 857 sig_data->pid = audit_sig_pid;
871 if (audit_sig_sid) { 858 if (audit_sig_sid) {
872 memcpy(sig_data->ctx, ctx, len); 859 memcpy(sig_data->ctx, ctx, len);
873 security_release_secctx(ctx, len); 860 security_release_secctx(ctx, len);
874 } 861 }
875 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 862 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
876 0, 0, sig_data, sizeof(*sig_data) + len); 863 0, 0, sig_data, sizeof(*sig_data) + len);
877 kfree(sig_data); 864 kfree(sig_data);
878 break; 865 break;
879 case AUDIT_TTY_GET: { 866 case AUDIT_TTY_GET: {
880 struct audit_tty_status s; 867 struct audit_tty_status s;
881 struct task_struct *tsk; 868 struct task_struct *tsk = current;
882 unsigned long flags; 869
883 870 spin_lock_irq(&tsk->sighand->siglock);
884 rcu_read_lock(); 871 s.enabled = tsk->signal->audit_tty != 0;
885 tsk = find_task_by_vpid(pid); 872 spin_unlock_irq(&tsk->sighand->siglock);
886 if (tsk && lock_task_sighand(tsk, &flags)) { 873
887 s.enabled = tsk->signal->audit_tty != 0; 874 audit_send_reply(NETLINK_CB(skb).portid, seq,
888 unlock_task_sighand(tsk, &flags); 875 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
889 } else
890 err = -ESRCH;
891 rcu_read_unlock();
892
893 if (!err)
894 audit_send_reply(NETLINK_CB(skb).pid, seq,
895 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
896 break; 876 break;
897 } 877 }
898 case AUDIT_TTY_SET: { 878 case AUDIT_TTY_SET: {
899 struct audit_tty_status *s; 879 struct audit_tty_status *s;
900 struct task_struct *tsk; 880 struct task_struct *tsk = current;
901 unsigned long flags;
902 881
903 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 882 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
904 return -EINVAL; 883 return -EINVAL;
905 s = data; 884 s = data;
906 if (s->enabled != 0 && s->enabled != 1) 885 if (s->enabled != 0 && s->enabled != 1)
907 return -EINVAL; 886 return -EINVAL;
908 rcu_read_lock(); 887
909 tsk = find_task_by_vpid(pid); 888 spin_lock_irq(&tsk->sighand->siglock);
910 if (tsk && lock_task_sighand(tsk, &flags)) { 889 tsk->signal->audit_tty = s->enabled != 0;
911 tsk->signal->audit_tty = s->enabled != 0; 890 spin_unlock_irq(&tsk->sighand->siglock);
912 unlock_task_sighand(tsk, &flags);
913 } else
914 err = -ESRCH;
915 rcu_read_unlock();
916 break; 891 break;
917 } 892 }
918 default: 893 default:
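
Note: after the AUDIT_TTY_GET/AUDIT_TTY_SET rework above, both branches act on current rather than resolving an arbitrary pid, so the flag is read and written under the caller's own siglock. A minimal sketch of that pattern, with a hypothetical helper name:

#include <linux/sched.h>
#include <linux/spinlock.h>

static void example_set_audit_tty(int enabled)
{
	struct task_struct *tsk = current;

	spin_lock_irq(&tsk->sighand->siglock);
	tsk->signal->audit_tty = enabled != 0;
	spin_unlock_irq(&tsk->sighand->siglock);
}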
@@ -971,8 +946,7 @@ static int __init audit_init(void)
971 946
972 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 947 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
973 audit_default ? "enabled" : "disabled"); 948 audit_default ? "enabled" : "disabled");
974 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 949 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg);
975 THIS_MODULE, &cfg);
976 if (!audit_sock) 950 if (!audit_sock)
977 audit_panic("cannot initialize netlink socket"); 951 audit_panic("cannot initialize netlink socket");
978 else 952 else
@@ -1466,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link)
1466 1440
1467 ab = audit_log_start(current->audit_context, GFP_KERNEL, 1441 ab = audit_log_start(current->audit_context, GFP_KERNEL,
1468 AUDIT_ANOM_LINK); 1442 AUDIT_ANOM_LINK);
1443 if (!ab)
1444 return;
1469 audit_log_format(ab, "op=%s action=denied", operation); 1445 audit_log_format(ab, "op=%s action=denied", operation);
1470 audit_log_format(ab, " pid=%d comm=", current->pid); 1446 audit_log_format(ab, " pid=%d comm=", current->pid);
1471 audit_log_untrustedstring(ab, current->comm); 1447 audit_log_untrustedstring(ab, current->comm);
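
Note: the audit_log_link_denied() hunk adds the missing NULL check after audit_log_start(), which returns NULL when auditing is disabled or a buffer cannot be allocated. A minimal sketch of that defensive pattern, with an illustrative message and helper name:

#include <linux/audit.h>
#include <linux/sched.h>
#include <linux/gfp.h>

static void example_log_denied(const char *operation)
{
	struct audit_buffer *ab;

	ab = audit_log_start(current->audit_context, GFP_KERNEL,
			     AUDIT_ANOM_LINK);
	if (!ab)	/* auditing disabled or allocation failed */
		return;
	audit_log_format(ab, "op=%s action=denied", operation);
	audit_log_end(ab);
}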
diff --git a/kernel/audit.h b/kernel/audit.h
index 816766803371..d51cba868e1b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,10 +74,15 @@ static inline int audit_hash_ino(u32 ino)
 	return (ino & (AUDIT_INODE_BUCKETS-1));
 }
 
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
 extern int audit_match_class(int class, unsigned syscall);
 extern int audit_comparator(const u32 left, const u32 op, const u32 right);
-extern int audit_compare_dname_path(const char *dname, const char *path,
-				    int *dirlen);
+extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
+extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
 extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     const void *payload, int size);
@@ -144,7 +149,7 @@ extern void audit_kill_trees(struct list_head *);
 extern char *audit_unpack_string(void **, size_t *, size_t);
 
 extern pid_t audit_sig_pid;
-extern uid_t audit_sig_uid;
+extern kuid_t audit_sig_uid;
 extern u32 audit_sig_sid;
 
 #ifdef CONFIG_AUDITSYSCALL
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 3823281401b5..9a9ae6e3d290 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
 		struct audit_buffer *ab;
 		ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
 		audit_log_format(ab, "auid=%u ses=%u op=",
-				 audit_get_loginuid(current),
+				 from_kuid(&init_user_ns, audit_get_loginuid(current)),
 				 audit_get_sessionid(current));
 		audit_log_string(ab, op);
 		audit_log_format(ab, " path=");
@@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent,
 	/* Run all of the watches on this parent looking for the one that
 	 * matches the given dname */
 	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
-		if (audit_compare_dname_path(dname, owatch->path, NULL))
+		if (audit_compare_dname_path(dname, owatch->path,
+					     AUDIT_NAME_FULL))
 			continue;
 
 		/* If the update involves invalidating rules, do the inode-based
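
Note: audit_update_watch() now passes AUDIT_NAME_FULL so audit_compare_dname_path() computes the parent length itself. A small sketch of how that comparison is used; the path value is hypothetical and chosen only to illustrate the matching rule.

#include "audit.h"	/* audit_compare_dname_path(), AUDIT_NAME_FULL */

static int example_watch_match(const char *dname)
{
	/* "/var/log/wtmp" is an illustrative watch path; parent_len() yields
	 * 9 (the "/var/log/" prefix), so only the final component "wtmp" is
	 * compared against dname. Return value 0 from the helper means the
	 * names match. */
	return audit_compare_dname_path(dname, "/var/log/wtmp",
					AUDIT_NAME_FULL) == 0;
}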
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a6c3f1abd206..7f19f23d38a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
342 342
343 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 343 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
344 f->val = rule->values[i]; 344 f->val = rule->values[i];
345 f->uid = INVALID_UID;
346 f->gid = INVALID_GID;
345 347
346 err = -EINVAL; 348 err = -EINVAL;
347 if (f->op == Audit_bad) 349 if (f->op == Audit_bad)
@@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
350 switch(f->type) { 352 switch(f->type) {
351 default: 353 default:
352 goto exit_free; 354 goto exit_free;
353 case AUDIT_PID:
354 case AUDIT_UID: 355 case AUDIT_UID:
355 case AUDIT_EUID: 356 case AUDIT_EUID:
356 case AUDIT_SUID: 357 case AUDIT_SUID:
357 case AUDIT_FSUID: 358 case AUDIT_FSUID:
359 case AUDIT_LOGINUID:
360 /* bit ops not implemented for uid comparisons */
361 if (f->op == Audit_bitmask || f->op == Audit_bittest)
362 goto exit_free;
363
364 f->uid = make_kuid(current_user_ns(), f->val);
365 if (!uid_valid(f->uid))
366 goto exit_free;
367 break;
358 case AUDIT_GID: 368 case AUDIT_GID:
359 case AUDIT_EGID: 369 case AUDIT_EGID:
360 case AUDIT_SGID: 370 case AUDIT_SGID:
361 case AUDIT_FSGID: 371 case AUDIT_FSGID:
362 case AUDIT_LOGINUID: 372 /* bit ops not implemented for gid comparisons */
373 if (f->op == Audit_bitmask || f->op == Audit_bittest)
374 goto exit_free;
375
376 f->gid = make_kgid(current_user_ns(), f->val);
377 if (!gid_valid(f->gid))
378 goto exit_free;
379 break;
380 case AUDIT_PID:
363 case AUDIT_PERS: 381 case AUDIT_PERS:
364 case AUDIT_MSGTYPE: 382 case AUDIT_MSGTYPE:
365 case AUDIT_PPID: 383 case AUDIT_PPID:
@@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
437 455
438 f->type = data->fields[i]; 456 f->type = data->fields[i];
439 f->val = data->values[i]; 457 f->val = data->values[i];
458 f->uid = INVALID_UID;
459 f->gid = INVALID_GID;
440 f->lsm_str = NULL; 460 f->lsm_str = NULL;
441 f->lsm_rule = NULL; 461 f->lsm_rule = NULL;
442 switch(f->type) { 462 switch(f->type) {
443 case AUDIT_PID:
444 case AUDIT_UID: 463 case AUDIT_UID:
445 case AUDIT_EUID: 464 case AUDIT_EUID:
446 case AUDIT_SUID: 465 case AUDIT_SUID:
447 case AUDIT_FSUID: 466 case AUDIT_FSUID:
467 case AUDIT_LOGINUID:
468 case AUDIT_OBJ_UID:
469 /* bit ops not implemented for uid comparisons */
470 if (f->op == Audit_bitmask || f->op == Audit_bittest)
471 goto exit_free;
472
473 f->uid = make_kuid(current_user_ns(), f->val);
474 if (!uid_valid(f->uid))
475 goto exit_free;
476 break;
448 case AUDIT_GID: 477 case AUDIT_GID:
449 case AUDIT_EGID: 478 case AUDIT_EGID:
450 case AUDIT_SGID: 479 case AUDIT_SGID:
451 case AUDIT_FSGID: 480 case AUDIT_FSGID:
452 case AUDIT_LOGINUID: 481 case AUDIT_OBJ_GID:
482 /* bit ops not implemented for gid comparisons */
483 if (f->op == Audit_bitmask || f->op == Audit_bittest)
484 goto exit_free;
485
486 f->gid = make_kgid(current_user_ns(), f->val);
487 if (!gid_valid(f->gid))
488 goto exit_free;
489 break;
490 case AUDIT_PID:
453 case AUDIT_PERS: 491 case AUDIT_PERS:
454 case AUDIT_MSGTYPE: 492 case AUDIT_MSGTYPE:
455 case AUDIT_PPID: 493 case AUDIT_PPID:
@@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
461 case AUDIT_ARG1: 499 case AUDIT_ARG1:
462 case AUDIT_ARG2: 500 case AUDIT_ARG2:
463 case AUDIT_ARG3: 501 case AUDIT_ARG3:
464 case AUDIT_OBJ_UID:
465 case AUDIT_OBJ_GID:
466 break; 502 break;
467 case AUDIT_ARCH: 503 case AUDIT_ARCH:
468 entry->rule.arch_f = f; 504 entry->rule.arch_f = f;
@@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
707 if (strcmp(a->filterkey, b->filterkey)) 743 if (strcmp(a->filterkey, b->filterkey))
708 return 1; 744 return 1;
709 break; 745 break;
746 case AUDIT_UID:
747 case AUDIT_EUID:
748 case AUDIT_SUID:
749 case AUDIT_FSUID:
750 case AUDIT_LOGINUID:
751 case AUDIT_OBJ_UID:
752 if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
753 return 1;
754 break;
755 case AUDIT_GID:
756 case AUDIT_EGID:
757 case AUDIT_SGID:
758 case AUDIT_FSGID:
759 case AUDIT_OBJ_GID:
760 if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
761 return 1;
762 break;
710 default: 763 default:
711 if (a->fields[i].val != b->fields[i].val) 764 if (a->fields[i].val != b->fields[i].val)
712 return 1; 765 return 1;
@@ -1056,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1056} 1109}
1057 1110
1058/* Log rule additions and removals */ 1111/* Log rule additions and removals */
1059static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, 1112static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1060 char *action, struct audit_krule *rule, 1113 char *action, struct audit_krule *rule,
1061 int res) 1114 int res)
1062{ 1115{
@@ -1068,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1068 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1121 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1069 if (!ab) 1122 if (!ab)
1070 return; 1123 return;
1071 audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); 1124 audit_log_format(ab, "auid=%u ses=%u",
1125 from_kuid(&init_user_ns, loginuid), sessionid);
1072 if (sid) { 1126 if (sid) {
1073 char *ctx = NULL; 1127 char *ctx = NULL;
1074 u32 len; 1128 u32 len;
@@ -1098,8 +1152,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1098 * @sessionid: sessionid for netlink audit message 1152 * @sessionid: sessionid for netlink audit message
1099 * @sid: SE Linux Security ID of sender 1153 * @sid: SE Linux Security ID of sender
1100 */ 1154 */
1101int audit_receive_filter(int type, int pid, int uid, int seq, void *data, 1155int audit_receive_filter(int type, int pid, int seq, void *data,
1102 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) 1156 size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid)
1103{ 1157{
1104 struct task_struct *tsk; 1158 struct task_struct *tsk;
1105 struct audit_netlink_list *dest; 1159 struct audit_netlink_list *dest;
@@ -1198,46 +1252,110 @@ int audit_comparator(u32 left, u32 op, u32 right)
1198 } 1252 }
1199} 1253}
1200 1254
1201/* Compare given dentry name with last component in given path, 1255int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
1202 * return of 0 indicates a match. */
1203int audit_compare_dname_path(const char *dname, const char *path,
1204 int *dirlen)
1205{ 1256{
1206 int dlen, plen; 1257 switch (op) {
1207 const char *p; 1258 case Audit_equal:
1259 return uid_eq(left, right);
1260 case Audit_not_equal:
1261 return !uid_eq(left, right);
1262 case Audit_lt:
1263 return uid_lt(left, right);
1264 case Audit_le:
1265 return uid_lte(left, right);
1266 case Audit_gt:
1267 return uid_gt(left, right);
1268 case Audit_ge:
1269 return uid_gte(left, right);
1270 case Audit_bitmask:
1271 case Audit_bittest:
1272 default:
1273 BUG();
1274 return 0;
1275 }
1276}
1208 1277
1209 if (!dname || !path) 1278int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
1210 return 1; 1279{
1280 switch (op) {
1281 case Audit_equal:
1282 return gid_eq(left, right);
1283 case Audit_not_equal:
1284 return !gid_eq(left, right);
1285 case Audit_lt:
1286 return gid_lt(left, right);
1287 case Audit_le:
1288 return gid_lte(left, right);
1289 case Audit_gt:
1290 return gid_gt(left, right);
1291 case Audit_ge:
1292 return gid_gte(left, right);
1293 case Audit_bitmask:
1294 case Audit_bittest:
1295 default:
1296 BUG();
1297 return 0;
1298 }
1299}
1300
1301/**
1302 * parent_len - find the length of the parent portion of a pathname
1303 * @path: pathname of which to determine length
1304 */
1305int parent_len(const char *path)
1306{
1307 int plen;
1308 const char *p;
1211 1309
1212 dlen = strlen(dname);
1213 plen = strlen(path); 1310 plen = strlen(path);
1214 if (plen < dlen) 1311
1215 return 1; 1312 if (plen == 0)
1313 return plen;
1216 1314
1217 /* disregard trailing slashes */ 1315 /* disregard trailing slashes */
1218 p = path + plen - 1; 1316 p = path + plen - 1;
1219 while ((*p == '/') && (p > path)) 1317 while ((*p == '/') && (p > path))
1220 p--; 1318 p--;
1221 1319
1222 /* find last path component */ 1320 /* walk backward until we find the next slash or hit beginning */
1223 p = p - dlen + 1; 1321 while ((*p != '/') && (p > path))
1224 if (p < path) 1322 p--;
1323
1324 /* did we find a slash? Then increment to include it in path */
1325 if (*p == '/')
1326 p++;
1327
1328 return p - path;
1329}
1330
1331/**
1332 * audit_compare_dname_path - compare given dentry name with last component in
1333 * given path. Return of 0 indicates a match.
1334 * @dname: dentry name that we're comparing
1335 * @path: full pathname that we're comparing
1336 * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
1337 * here indicates that we must compute this value.
1338 */
1339int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
1340{
1341 int dlen, pathlen;
1342 const char *p;
1343
1344 dlen = strlen(dname);
1345 pathlen = strlen(path);
1346 if (pathlen < dlen)
1225 return 1; 1347 return 1;
1226 else if (p > path) {
1227 if (*--p != '/')
1228 return 1;
1229 else
1230 p++;
1231 }
1232 1348
1233 /* return length of path's directory component */ 1349 parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
1234 if (dirlen) 1350 if (pathlen - parentlen != dlen)
1235 *dirlen = p - path; 1351 return 1;
1352
1353 p = path + parentlen;
1354
1236 return strncmp(p, dname, dlen); 1355 return strncmp(p, dname, dlen);
1237} 1356}
1238 1357
1239static int audit_filter_user_rules(struct netlink_skb_parms *cb, 1358static int audit_filter_user_rules(struct audit_krule *rule,
1240 struct audit_krule *rule,
1241 enum audit_state *state) 1359 enum audit_state *state)
1242{ 1360{
1243 int i; 1361 int i;
@@ -1249,17 +1367,17 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1249 1367
1250 switch (f->type) { 1368 switch (f->type) {
1251 case AUDIT_PID: 1369 case AUDIT_PID:
1252 result = audit_comparator(cb->creds.pid, f->op, f->val); 1370 result = audit_comparator(task_pid_vnr(current), f->op, f->val);
1253 break; 1371 break;
1254 case AUDIT_UID: 1372 case AUDIT_UID:
1255 result = audit_comparator(cb->creds.uid, f->op, f->val); 1373 result = audit_uid_comparator(current_uid(), f->op, f->uid);
1256 break; 1374 break;
1257 case AUDIT_GID: 1375 case AUDIT_GID:
1258 result = audit_comparator(cb->creds.gid, f->op, f->val); 1376 result = audit_gid_comparator(current_gid(), f->op, f->gid);
1259 break; 1377 break;
1260 case AUDIT_LOGINUID: 1378 case AUDIT_LOGINUID:
1261 result = audit_comparator(audit_get_loginuid(current), 1379 result = audit_uid_comparator(audit_get_loginuid(current),
1262 f->op, f->val); 1380 f->op, f->uid);
1263 break; 1381 break;
1264 case AUDIT_SUBJ_USER: 1382 case AUDIT_SUBJ_USER:
1265 case AUDIT_SUBJ_ROLE: 1383 case AUDIT_SUBJ_ROLE:
@@ -1287,7 +1405,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1287 return 1; 1405 return 1;
1288} 1406}
1289 1407
1290int audit_filter_user(struct netlink_skb_parms *cb) 1408int audit_filter_user(void)
1291{ 1409{
1292 enum audit_state state = AUDIT_DISABLED; 1410 enum audit_state state = AUDIT_DISABLED;
1293 struct audit_entry *e; 1411 struct audit_entry *e;
@@ -1295,7 +1413,7 @@ int audit_filter_user(struct netlink_skb_parms *cb)
1295 1413
1296 rcu_read_lock(); 1414 rcu_read_lock();
1297 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { 1415 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
1298 if (audit_filter_user_rules(cb, &e->rule, &state)) { 1416 if (audit_filter_user_rules(&e->rule, &state)) {
1299 if (state == AUDIT_DISABLED) 1417 if (state == AUDIT_DISABLED)
1300 ret = 0; 1418 ret = 0;
1301 break; 1419 break;
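
Note: the auditfilter.c changes above split the old audit_compare_dname_path() into parent_len(), which finds the length of the parent-directory portion of a path, and a comparison that only looks at the final component. A short sketch, with hypothetical path values, of what parent_len() returns:

#include "audit.h"	/* parent_len() */

static void example_parent_len(void)
{
	int a = parent_len("/etc/passwd");	/* 5: the "/etc/" prefix */
	int b = parent_len("/etc/rc.d/");	/* 5: trailing '/' ignored */
	int c = parent_len("passwd");		/* 0: no parent component */

	(void)a; (void)b; (void)c;
}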
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4b96415527b8..2f186ed80c40 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -81,9 +81,6 @@
81 * a name dynamically and also add those to the list anchored by names_list. */ 81 * a name dynamically and also add those to the list anchored by names_list. */
82#define AUDIT_NAMES 5 82#define AUDIT_NAMES 5
83 83
84/* Indicates that audit should log the full pathname. */
85#define AUDIT_NAME_FULL -1
86
87/* no execve audit message should be longer than this (userspace limits) */ 84/* no execve audit message should be longer than this (userspace limits) */
88#define MAX_EXECVE_AUDIT_LEN 7500 85#define MAX_EXECVE_AUDIT_LEN 7500
89 86
@@ -106,27 +103,29 @@ struct audit_cap_data {
106 * we don't let putname() free it (instead we free all of the saved 103 * we don't let putname() free it (instead we free all of the saved
107 * pointers at syscall exit time). 104 * pointers at syscall exit time).
108 * 105 *
109 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 106 * Further, in fs/namei.c:path_lookup() we store the inode and device.
107 */
110struct audit_names { 108struct audit_names {
111 struct list_head list; /* audit_context->names_list */ 109 struct list_head list; /* audit_context->names_list */
112 const char *name; 110 struct filename *name;
113 unsigned long ino; 111 unsigned long ino;
114 dev_t dev; 112 dev_t dev;
115 umode_t mode; 113 umode_t mode;
116 uid_t uid; 114 kuid_t uid;
117 gid_t gid; 115 kgid_t gid;
118 dev_t rdev; 116 dev_t rdev;
119 u32 osid; 117 u32 osid;
120 struct audit_cap_data fcap; 118 struct audit_cap_data fcap;
121 unsigned int fcap_ver; 119 unsigned int fcap_ver;
122 int name_len; /* number of name's characters to log */ 120 int name_len; /* number of name's characters to log */
123 bool name_put; /* call __putname() for this name */ 121 unsigned char type; /* record type */
122 bool name_put; /* call __putname() for this name */
124 /* 123 /*
125 * This was an allocated audit_names and not from the array of 124 * This was an allocated audit_names and not from the array of
126 * names allocated in the task audit context. Thus this name 125 * names allocated in the task audit context. Thus this name
127 * should be freed on syscall exit 126 * should be freed on syscall exit
128 */ 127 */
129 bool should_free; 128 bool should_free;
130}; 129};
131 130
132struct audit_aux_data { 131struct audit_aux_data {
@@ -149,8 +148,8 @@ struct audit_aux_data_execve {
149struct audit_aux_data_pids { 148struct audit_aux_data_pids {
150 struct audit_aux_data d; 149 struct audit_aux_data d;
151 pid_t target_pid[AUDIT_AUX_PIDS]; 150 pid_t target_pid[AUDIT_AUX_PIDS];
152 uid_t target_auid[AUDIT_AUX_PIDS]; 151 kuid_t target_auid[AUDIT_AUX_PIDS];
153 uid_t target_uid[AUDIT_AUX_PIDS]; 152 kuid_t target_uid[AUDIT_AUX_PIDS];
154 unsigned int target_sessionid[AUDIT_AUX_PIDS]; 153 unsigned int target_sessionid[AUDIT_AUX_PIDS];
155 u32 target_sid[AUDIT_AUX_PIDS]; 154 u32 target_sid[AUDIT_AUX_PIDS];
156 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; 155 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -208,14 +207,14 @@ struct audit_context {
208 size_t sockaddr_len; 207 size_t sockaddr_len;
209 /* Save things to print about task_struct */ 208 /* Save things to print about task_struct */
210 pid_t pid, ppid; 209 pid_t pid, ppid;
211 uid_t uid, euid, suid, fsuid; 210 kuid_t uid, euid, suid, fsuid;
212 gid_t gid, egid, sgid, fsgid; 211 kgid_t gid, egid, sgid, fsgid;
213 unsigned long personality; 212 unsigned long personality;
214 int arch; 213 int arch;
215 214
216 pid_t target_pid; 215 pid_t target_pid;
217 uid_t target_auid; 216 kuid_t target_auid;
218 uid_t target_uid; 217 kuid_t target_uid;
219 unsigned int target_sessionid; 218 unsigned int target_sessionid;
220 u32 target_sid; 219 u32 target_sid;
221 char target_comm[TASK_COMM_LEN]; 220 char target_comm[TASK_COMM_LEN];
@@ -231,8 +230,8 @@ struct audit_context {
231 long args[6]; 230 long args[6];
232 } socketcall; 231 } socketcall;
233 struct { 232 struct {
234 uid_t uid; 233 kuid_t uid;
235 gid_t gid; 234 kgid_t gid;
236 umode_t mode; 235 umode_t mode;
237 u32 osid; 236 u32 osid;
238 int has_perm; 237 int has_perm;
@@ -464,37 +463,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
464 return 0; 463 return 0;
465} 464}
466 465
467static int audit_compare_id(uid_t uid1, 466static int audit_compare_uid(kuid_t uid,
468 struct audit_names *name, 467 struct audit_names *name,
469 unsigned long name_offset, 468 struct audit_field *f,
470 struct audit_field *f, 469 struct audit_context *ctx)
471 struct audit_context *ctx)
472{ 470{
473 struct audit_names *n; 471 struct audit_names *n;
474 unsigned long addr;
475 uid_t uid2;
476 int rc; 472 int rc;
477 473
478 BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
479
480 if (name) { 474 if (name) {
481 addr = (unsigned long)name; 475 rc = audit_uid_comparator(uid, f->op, name->uid);
482 addr += name_offset;
483
484 uid2 = *(uid_t *)addr;
485 rc = audit_comparator(uid1, f->op, uid2);
486 if (rc) 476 if (rc)
487 return rc; 477 return rc;
488 } 478 }
489 479
490 if (ctx) { 480 if (ctx) {
491 list_for_each_entry(n, &ctx->names_list, list) { 481 list_for_each_entry(n, &ctx->names_list, list) {
492 addr = (unsigned long)n; 482 rc = audit_uid_comparator(uid, f->op, n->uid);
493 addr += name_offset; 483 if (rc)
494 484 return rc;
495 uid2 = *(uid_t *)addr; 485 }
486 }
487 return 0;
488}
496 489
497 rc = audit_comparator(uid1, f->op, uid2); 490static int audit_compare_gid(kgid_t gid,
491 struct audit_names *name,
492 struct audit_field *f,
493 struct audit_context *ctx)
494{
495 struct audit_names *n;
496 int rc;
497
498 if (name) {
499 rc = audit_gid_comparator(gid, f->op, name->gid);
500 if (rc)
501 return rc;
502 }
503
504 if (ctx) {
505 list_for_each_entry(n, &ctx->names_list, list) {
506 rc = audit_gid_comparator(gid, f->op, n->gid);
498 if (rc) 507 if (rc)
499 return rc; 508 return rc;
500 } 509 }
@@ -511,80 +520,62 @@ static int audit_field_compare(struct task_struct *tsk,
511 switch (f->val) { 520 switch (f->val) {
512 /* process to file object comparisons */ 521 /* process to file object comparisons */
513 case AUDIT_COMPARE_UID_TO_OBJ_UID: 522 case AUDIT_COMPARE_UID_TO_OBJ_UID:
514 return audit_compare_id(cred->uid, 523 return audit_compare_uid(cred->uid, name, f, ctx);
515 name, offsetof(struct audit_names, uid),
516 f, ctx);
517 case AUDIT_COMPARE_GID_TO_OBJ_GID: 524 case AUDIT_COMPARE_GID_TO_OBJ_GID:
518 return audit_compare_id(cred->gid, 525 return audit_compare_gid(cred->gid, name, f, ctx);
519 name, offsetof(struct audit_names, gid),
520 f, ctx);
521 case AUDIT_COMPARE_EUID_TO_OBJ_UID: 526 case AUDIT_COMPARE_EUID_TO_OBJ_UID:
522 return audit_compare_id(cred->euid, 527 return audit_compare_uid(cred->euid, name, f, ctx);
523 name, offsetof(struct audit_names, uid),
524 f, ctx);
525 case AUDIT_COMPARE_EGID_TO_OBJ_GID: 528 case AUDIT_COMPARE_EGID_TO_OBJ_GID:
526 return audit_compare_id(cred->egid, 529 return audit_compare_gid(cred->egid, name, f, ctx);
527 name, offsetof(struct audit_names, gid),
528 f, ctx);
529 case AUDIT_COMPARE_AUID_TO_OBJ_UID: 530 case AUDIT_COMPARE_AUID_TO_OBJ_UID:
530 return audit_compare_id(tsk->loginuid, 531 return audit_compare_uid(tsk->loginuid, name, f, ctx);
531 name, offsetof(struct audit_names, uid),
532 f, ctx);
533 case AUDIT_COMPARE_SUID_TO_OBJ_UID: 532 case AUDIT_COMPARE_SUID_TO_OBJ_UID:
534 return audit_compare_id(cred->suid, 533 return audit_compare_uid(cred->suid, name, f, ctx);
535 name, offsetof(struct audit_names, uid),
536 f, ctx);
537 case AUDIT_COMPARE_SGID_TO_OBJ_GID: 534 case AUDIT_COMPARE_SGID_TO_OBJ_GID:
538 return audit_compare_id(cred->sgid, 535 return audit_compare_gid(cred->sgid, name, f, ctx);
539 name, offsetof(struct audit_names, gid),
540 f, ctx);
541 case AUDIT_COMPARE_FSUID_TO_OBJ_UID: 536 case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
542 return audit_compare_id(cred->fsuid, 537 return audit_compare_uid(cred->fsuid, name, f, ctx);
543 name, offsetof(struct audit_names, uid),
544 f, ctx);
545 case AUDIT_COMPARE_FSGID_TO_OBJ_GID: 538 case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
546 return audit_compare_id(cred->fsgid, 539 return audit_compare_gid(cred->fsgid, name, f, ctx);
547 name, offsetof(struct audit_names, gid),
548 f, ctx);
549 /* uid comparisons */ 540 /* uid comparisons */
550 case AUDIT_COMPARE_UID_TO_AUID: 541 case AUDIT_COMPARE_UID_TO_AUID:
551 return audit_comparator(cred->uid, f->op, tsk->loginuid); 542 return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
552 case AUDIT_COMPARE_UID_TO_EUID: 543 case AUDIT_COMPARE_UID_TO_EUID:
553 return audit_comparator(cred->uid, f->op, cred->euid); 544 return audit_uid_comparator(cred->uid, f->op, cred->euid);
554 case AUDIT_COMPARE_UID_TO_SUID: 545 case AUDIT_COMPARE_UID_TO_SUID:
555 return audit_comparator(cred->uid, f->op, cred->suid); 546 return audit_uid_comparator(cred->uid, f->op, cred->suid);
556 case AUDIT_COMPARE_UID_TO_FSUID: 547 case AUDIT_COMPARE_UID_TO_FSUID:
557 return audit_comparator(cred->uid, f->op, cred->fsuid); 548 return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
558 /* auid comparisons */ 549 /* auid comparisons */
559 case AUDIT_COMPARE_AUID_TO_EUID: 550 case AUDIT_COMPARE_AUID_TO_EUID:
560 return audit_comparator(tsk->loginuid, f->op, cred->euid); 551 return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
561 case AUDIT_COMPARE_AUID_TO_SUID: 552 case AUDIT_COMPARE_AUID_TO_SUID:
562 return audit_comparator(tsk->loginuid, f->op, cred->suid); 553 return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
563 case AUDIT_COMPARE_AUID_TO_FSUID: 554 case AUDIT_COMPARE_AUID_TO_FSUID:
564 return audit_comparator(tsk->loginuid, f->op, cred->fsuid); 555 return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
565 /* euid comparisons */ 556 /* euid comparisons */
566 case AUDIT_COMPARE_EUID_TO_SUID: 557 case AUDIT_COMPARE_EUID_TO_SUID:
567 return audit_comparator(cred->euid, f->op, cred->suid); 558 return audit_uid_comparator(cred->euid, f->op, cred->suid);
568 case AUDIT_COMPARE_EUID_TO_FSUID: 559 case AUDIT_COMPARE_EUID_TO_FSUID:
569 return audit_comparator(cred->euid, f->op, cred->fsuid); 560 return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
570 /* suid comparisons */ 561 /* suid comparisons */
571 case AUDIT_COMPARE_SUID_TO_FSUID: 562 case AUDIT_COMPARE_SUID_TO_FSUID:
572 return audit_comparator(cred->suid, f->op, cred->fsuid); 563 return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
573 /* gid comparisons */ 564 /* gid comparisons */
574 case AUDIT_COMPARE_GID_TO_EGID: 565 case AUDIT_COMPARE_GID_TO_EGID:
575 return audit_comparator(cred->gid, f->op, cred->egid); 566 return audit_gid_comparator(cred->gid, f->op, cred->egid);
576 case AUDIT_COMPARE_GID_TO_SGID: 567 case AUDIT_COMPARE_GID_TO_SGID:
577 return audit_comparator(cred->gid, f->op, cred->sgid); 568 return audit_gid_comparator(cred->gid, f->op, cred->sgid);
578 case AUDIT_COMPARE_GID_TO_FSGID: 569 case AUDIT_COMPARE_GID_TO_FSGID:
579 return audit_comparator(cred->gid, f->op, cred->fsgid); 570 return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
580 /* egid comparisons */ 571 /* egid comparisons */
581 case AUDIT_COMPARE_EGID_TO_SGID: 572 case AUDIT_COMPARE_EGID_TO_SGID:
582 return audit_comparator(cred->egid, f->op, cred->sgid); 573 return audit_gid_comparator(cred->egid, f->op, cred->sgid);
583 case AUDIT_COMPARE_EGID_TO_FSGID: 574 case AUDIT_COMPARE_EGID_TO_FSGID:
584 return audit_comparator(cred->egid, f->op, cred->fsgid); 575 return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
585 /* sgid comparison */ 576 /* sgid comparison */
586 case AUDIT_COMPARE_SGID_TO_FSGID: 577 case AUDIT_COMPARE_SGID_TO_FSGID:
587 return audit_comparator(cred->sgid, f->op, cred->fsgid); 578 return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
588 default: 579 default:
589 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); 580 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
590 return 0; 581 return 0;
@@ -630,28 +621,28 @@ static int audit_filter_rules(struct task_struct *tsk,
630 } 621 }
631 break; 622 break;
632 case AUDIT_UID: 623 case AUDIT_UID:
633 result = audit_comparator(cred->uid, f->op, f->val); 624 result = audit_uid_comparator(cred->uid, f->op, f->uid);
634 break; 625 break;
635 case AUDIT_EUID: 626 case AUDIT_EUID:
636 result = audit_comparator(cred->euid, f->op, f->val); 627 result = audit_uid_comparator(cred->euid, f->op, f->uid);
637 break; 628 break;
638 case AUDIT_SUID: 629 case AUDIT_SUID:
639 result = audit_comparator(cred->suid, f->op, f->val); 630 result = audit_uid_comparator(cred->suid, f->op, f->uid);
640 break; 631 break;
641 case AUDIT_FSUID: 632 case AUDIT_FSUID:
642 result = audit_comparator(cred->fsuid, f->op, f->val); 633 result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
643 break; 634 break;
644 case AUDIT_GID: 635 case AUDIT_GID:
645 result = audit_comparator(cred->gid, f->op, f->val); 636 result = audit_gid_comparator(cred->gid, f->op, f->gid);
646 break; 637 break;
647 case AUDIT_EGID: 638 case AUDIT_EGID:
648 result = audit_comparator(cred->egid, f->op, f->val); 639 result = audit_gid_comparator(cred->egid, f->op, f->gid);
649 break; 640 break;
650 case AUDIT_SGID: 641 case AUDIT_SGID:
651 result = audit_comparator(cred->sgid, f->op, f->val); 642 result = audit_gid_comparator(cred->sgid, f->op, f->gid);
652 break; 643 break;
653 case AUDIT_FSGID: 644 case AUDIT_FSGID:
654 result = audit_comparator(cred->fsgid, f->op, f->val); 645 result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
655 break; 646 break;
656 case AUDIT_PERS: 647 case AUDIT_PERS:
657 result = audit_comparator(tsk->personality, f->op, f->val); 648 result = audit_comparator(tsk->personality, f->op, f->val);
@@ -717,10 +708,10 @@ static int audit_filter_rules(struct task_struct *tsk,
717 break; 708 break;
718 case AUDIT_OBJ_UID: 709 case AUDIT_OBJ_UID:
719 if (name) { 710 if (name) {
720 result = audit_comparator(name->uid, f->op, f->val); 711 result = audit_uid_comparator(name->uid, f->op, f->uid);
721 } else if (ctx) { 712 } else if (ctx) {
722 list_for_each_entry(n, &ctx->names_list, list) { 713 list_for_each_entry(n, &ctx->names_list, list) {
723 if (audit_comparator(n->uid, f->op, f->val)) { 714 if (audit_uid_comparator(n->uid, f->op, f->uid)) {
724 ++result; 715 ++result;
725 break; 716 break;
726 } 717 }
@@ -729,10 +720,10 @@ static int audit_filter_rules(struct task_struct *tsk,
729 break; 720 break;
730 case AUDIT_OBJ_GID: 721 case AUDIT_OBJ_GID:
731 if (name) { 722 if (name) {
732 result = audit_comparator(name->gid, f->op, f->val); 723 result = audit_gid_comparator(name->gid, f->op, f->gid);
733 } else if (ctx) { 724 } else if (ctx) {
734 list_for_each_entry(n, &ctx->names_list, list) { 725 list_for_each_entry(n, &ctx->names_list, list) {
735 if (audit_comparator(n->gid, f->op, f->val)) { 726 if (audit_gid_comparator(n->gid, f->op, f->gid)) {
736 ++result; 727 ++result;
737 break; 728 break;
738 } 729 }
@@ -750,7 +741,7 @@ static int audit_filter_rules(struct task_struct *tsk,
750 case AUDIT_LOGINUID: 741 case AUDIT_LOGINUID:
751 result = 0; 742 result = 0;
752 if (ctx) 743 if (ctx)
753 result = audit_comparator(tsk->loginuid, f->op, f->val); 744 result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
754 break; 745 break;
755 case AUDIT_SUBJ_USER: 746 case AUDIT_SUBJ_USER:
756 case AUDIT_SUBJ_ROLE: 747 case AUDIT_SUBJ_ROLE:
@@ -1006,7 +997,7 @@ static inline void audit_free_names(struct audit_context *context)
1006 context->ino_count); 997 context->ino_count);
1007 list_for_each_entry(n, &context->names_list, list) { 998 list_for_each_entry(n, &context->names_list, list) {
1008 printk(KERN_ERR "names[%d] = %p = %s\n", i, 999 printk(KERN_ERR "names[%d] = %p = %s\n", i,
1009 n->name, n->name ?: "(null)"); 1000 n->name, n->name->name ?: "(null)");
1010 } 1001 }
1011 dump_stack(); 1002 dump_stack();
1012 return; 1003 return;
@@ -1154,13 +1145,43 @@ error_path:
1154 1145
1155EXPORT_SYMBOL(audit_log_task_context); 1146EXPORT_SYMBOL(audit_log_task_context);
1156 1147
1157static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 1148void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1158{ 1149{
1150 const struct cred *cred;
1159 char name[sizeof(tsk->comm)]; 1151 char name[sizeof(tsk->comm)];
1160 struct mm_struct *mm = tsk->mm; 1152 struct mm_struct *mm = tsk->mm;
1161 struct vm_area_struct *vma; 1153 char *tty;
1154
1155 if (!ab)
1156 return;
1162 1157
1163 /* tsk == current */ 1158 /* tsk == current */
1159 cred = current_cred();
1160
1161 spin_lock_irq(&tsk->sighand->siglock);
1162 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1163 tty = tsk->signal->tty->name;
1164 else
1165 tty = "(none)";
1166 spin_unlock_irq(&tsk->sighand->siglock);
1167
1168
1169 audit_log_format(ab,
1170 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1171 " euid=%u suid=%u fsuid=%u"
1172 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
1173 sys_getppid(),
1174 tsk->pid,
1175 from_kuid(&init_user_ns, tsk->loginuid),
1176 from_kuid(&init_user_ns, cred->uid),
1177 from_kgid(&init_user_ns, cred->gid),
1178 from_kuid(&init_user_ns, cred->euid),
1179 from_kuid(&init_user_ns, cred->suid),
1180 from_kuid(&init_user_ns, cred->fsuid),
1181 from_kgid(&init_user_ns, cred->egid),
1182 from_kgid(&init_user_ns, cred->sgid),
1183 from_kgid(&init_user_ns, cred->fsgid),
1184 tsk->sessionid, tty);
1164 1185
1165 get_task_comm(name, tsk); 1186 get_task_comm(name, tsk);
1166 audit_log_format(ab, " comm="); 1187 audit_log_format(ab, " comm=");
@@ -1168,23 +1189,17 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
1168 1189
1169 if (mm) { 1190 if (mm) {
1170 down_read(&mm->mmap_sem); 1191 down_read(&mm->mmap_sem);
1171 vma = mm->mmap; 1192 if (mm->exe_file)
1172 while (vma) { 1193 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1173 if ((vma->vm_flags & VM_EXECUTABLE) &&
1174 vma->vm_file) {
1175 audit_log_d_path(ab, " exe=",
1176 &vma->vm_file->f_path);
1177 break;
1178 }
1179 vma = vma->vm_next;
1180 }
1181 up_read(&mm->mmap_sem); 1194 up_read(&mm->mmap_sem);
1182 } 1195 }
1183 audit_log_task_context(ab); 1196 audit_log_task_context(ab);
1184} 1197}
1185 1198
1199EXPORT_SYMBOL(audit_log_task_info);
1200
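
audit_log_task_info() now gathers the credentials itself and converts each kuid_t/kgid_t back to a number with from_kuid()/from_kgid() against init_user_ns before logging. A toy user-space model of that translation, with an invented userns_model structure standing in for the kernel's uid_gid_map (the extent layout and names here are assumptions for illustration only):

#include <stdio.h>

typedef struct { unsigned int val; } xkuid_t;           /* stand-in for kuid_t */

struct uid_extent { unsigned int first, lower_first, count; };
struct userns_model { struct uid_extent map[4]; int nr; };

/* translate a kernel-wide id into the number this namespace would see */
static unsigned int model_from_kuid(const struct userns_model *ns, xkuid_t kuid)
{
	for (int i = 0; i < ns->nr; i++) {
		const struct uid_extent *e = &ns->map[i];

		if (kuid.val >= e->lower_first &&
		    kuid.val < e->lower_first + e->count)
			return e->first + (kuid.val - e->lower_first);
	}
	return (unsigned int)-1;                /* unmapped: overflow uid */
}

int main(void)
{
	/* one extent: in-namespace uids 0..999 backed by host uids 100000.. */
	struct userns_model container = { .map = { { 0, 100000, 1000 } }, .nr = 1 };
	xkuid_t task_uid = { 100042 };

	printf("uid inside ns: %u\n", model_from_kuid(&container, task_uid)); /* 42 */
	return 0;
}

Converting through from_kuid(&init_user_ns, ...) keeps audit records in the initial namespace's numbering even when the task runs inside a user namespace with a shifted mapping.
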
1186static int audit_log_pid_context(struct audit_context *context, pid_t pid, 1201static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1187 uid_t auid, uid_t uid, unsigned int sessionid, 1202 kuid_t auid, kuid_t uid, unsigned int sessionid,
1188 u32 sid, char *comm) 1203 u32 sid, char *comm)
1189{ 1204{
1190 struct audit_buffer *ab; 1205 struct audit_buffer *ab;
@@ -1196,8 +1211,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1196 if (!ab) 1211 if (!ab)
1197 return rc; 1212 return rc;
1198 1213
1199 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, 1214 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
1200 uid, sessionid); 1215 from_kuid(&init_user_ns, auid),
1216 from_kuid(&init_user_ns, uid), sessionid);
1201 if (security_secid_to_secctx(sid, &ctx, &len)) { 1217 if (security_secid_to_secctx(sid, &ctx, &len)) {
1202 audit_log_format(ab, " obj=(none)"); 1218 audit_log_format(ab, " obj=(none)");
1203 rc = 1; 1219 rc = 1;
@@ -1447,7 +1463,9 @@ static void show_special(struct audit_context *context, int *call_panic)
1447 u32 osid = context->ipc.osid; 1463 u32 osid = context->ipc.osid;
1448 1464
1449 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", 1465 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
1450 context->ipc.uid, context->ipc.gid, context->ipc.mode); 1466 from_kuid(&init_user_ns, context->ipc.uid),
1467 from_kgid(&init_user_ns, context->ipc.gid),
1468 context->ipc.mode);
1451 if (osid) { 1469 if (osid) {
1452 char *ctx = NULL; 1470 char *ctx = NULL;
1453 u32 len; 1471 u32 len;
@@ -1536,7 +1554,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1536 case AUDIT_NAME_FULL: 1554 case AUDIT_NAME_FULL:
1537 /* log the full path */ 1555 /* log the full path */
1538 audit_log_format(ab, " name="); 1556 audit_log_format(ab, " name=");
1539 audit_log_untrustedstring(ab, n->name); 1557 audit_log_untrustedstring(ab, n->name->name);
1540 break; 1558 break;
1541 case 0: 1559 case 0:
1542 /* name was specified as a relative path and the 1560 /* name was specified as a relative path and the
@@ -1546,7 +1564,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1546 default: 1564 default:
1547 /* log the name's directory component */ 1565 /* log the name's directory component */
1548 audit_log_format(ab, " name="); 1566 audit_log_format(ab, " name=");
1549 audit_log_n_untrustedstring(ab, n->name, 1567 audit_log_n_untrustedstring(ab, n->name->name,
1550 n->name_len); 1568 n->name_len);
1551 } 1569 }
1552 } else 1570 } else
@@ -1560,8 +1578,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1560 MAJOR(n->dev), 1578 MAJOR(n->dev),
1561 MINOR(n->dev), 1579 MINOR(n->dev),
1562 n->mode, 1580 n->mode,
1563 n->uid, 1581 from_kuid(&init_user_ns, n->uid),
1564 n->gid, 1582 from_kgid(&init_user_ns, n->gid),
1565 MAJOR(n->rdev), 1583 MAJOR(n->rdev),
1566 MINOR(n->rdev)); 1584 MINOR(n->rdev));
1567 } 1585 }
@@ -1585,26 +1603,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1585 1603
1586static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1604static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1587{ 1605{
1588 const struct cred *cred;
1589 int i, call_panic = 0; 1606 int i, call_panic = 0;
1590 struct audit_buffer *ab; 1607 struct audit_buffer *ab;
1591 struct audit_aux_data *aux; 1608 struct audit_aux_data *aux;
1592 const char *tty;
1593 struct audit_names *n; 1609 struct audit_names *n;
1594 1610
1595 /* tsk == current */ 1611 /* tsk == current */
1596 context->pid = tsk->pid;
1597 if (!context->ppid)
1598 context->ppid = sys_getppid();
1599 cred = current_cred();
1600 context->uid = cred->uid;
1601 context->gid = cred->gid;
1602 context->euid = cred->euid;
1603 context->suid = cred->suid;
1604 context->fsuid = cred->fsuid;
1605 context->egid = cred->egid;
1606 context->sgid = cred->sgid;
1607 context->fsgid = cred->fsgid;
1608 context->personality = tsk->personality; 1612 context->personality = tsk->personality;
1609 1613
1610 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); 1614 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1619,32 +1623,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1619 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 1623 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
1620 context->return_code); 1624 context->return_code);
1621 1625
1622 spin_lock_irq(&tsk->sighand->siglock);
1623 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1624 tty = tsk->signal->tty->name;
1625 else
1626 tty = "(none)";
1627 spin_unlock_irq(&tsk->sighand->siglock);
1628
1629 audit_log_format(ab, 1626 audit_log_format(ab,
1630 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 1627 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
1631 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 1628 context->argv[0],
1632 " euid=%u suid=%u fsuid=%u" 1629 context->argv[1],
1633 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", 1630 context->argv[2],
1634 context->argv[0], 1631 context->argv[3],
1635 context->argv[1], 1632 context->name_count);
1636 context->argv[2],
1637 context->argv[3],
1638 context->name_count,
1639 context->ppid,
1640 context->pid,
1641 tsk->loginuid,
1642 context->uid,
1643 context->gid,
1644 context->euid, context->suid, context->fsuid,
1645 context->egid, context->sgid, context->fsgid, tty,
1646 tsk->sessionid);
1647
1648 1633
1649 audit_log_task_info(ab, tsk); 1634 audit_log_task_info(ab, tsk);
1650 audit_log_key(ab, context->filterkey); 1635 audit_log_key(ab, context->filterkey);
@@ -2009,7 +1994,8 @@ retry:
2009#endif 1994#endif
2010} 1995}
2011 1996
2012static struct audit_names *audit_alloc_name(struct audit_context *context) 1997static struct audit_names *audit_alloc_name(struct audit_context *context,
1998 unsigned char type)
2013{ 1999{
2014 struct audit_names *aname; 2000 struct audit_names *aname;
2015 2001
@@ -2024,6 +2010,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
2024 } 2010 }
2025 2011
2026 aname->ino = (unsigned long)-1; 2012 aname->ino = (unsigned long)-1;
2013 aname->type = type;
2027 list_add_tail(&aname->list, &context->names_list); 2014 list_add_tail(&aname->list, &context->names_list);
2028 2015
2029 context->name_count++; 2016 context->name_count++;
@@ -2034,13 +2021,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
2034} 2021}
2035 2022
2036/** 2023/**
2024 * audit_reusename - fill out filename with info from existing entry
2025 * @uptr: userland ptr to pathname
2026 *
2027 * Search the audit_names list for the current audit context. If there is an
2028 * existing entry with a matching "uptr" then return the filename
2029 * associated with that audit_name. If not, return NULL.
2030 */
2031struct filename *
2032__audit_reusename(const __user char *uptr)
2033{
2034 struct audit_context *context = current->audit_context;
2035 struct audit_names *n;
2036
2037 list_for_each_entry(n, &context->names_list, list) {
2038 if (!n->name)
2039 continue;
2040 if (n->name->uptr == uptr)
2041 return n->name;
2042 }
2043 return NULL;
2044}
2045
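
__audit_reusename() lets getname() hand back an already-recorded struct filename when the same userland pointer is looked up twice within one syscall, instead of copying the path in again. A self-contained user-space sketch of the caching idea, with made-up filename_rec/get_name() names; like the loop above, the lookup keys on the saved userland pointer, not on the string contents:

#include <stdio.h>
#include <stdlib.h>

struct filename_rec {
	const char *uptr;               /* userland pointer we were built from */
	char name[64];                  /* copied-in pathname */
	struct filename_rec *next;
};

static struct filename_rec *names_list;

static struct filename_rec *reuse_name(const char *uptr)
{
	for (struct filename_rec *n = names_list; n; n = n->next)
		if (n->uptr == uptr)    /* pointer identity, not a strcmp() */
			return n;
	return NULL;
}

static struct filename_rec *get_name(const char *uptr)
{
	struct filename_rec *n = reuse_name(uptr);

	if (n)
		return n;               /* second lookup of the same arg: cached */
	n = calloc(1, sizeof(*n));
	if (!n)
		return NULL;
	n->uptr = uptr;
	snprintf(n->name, sizeof(n->name), "%s", uptr); /* the "copy_from_user" */
	n->next = names_list;
	names_list = n;
	return n;
}

int main(void)
{
	const char *user_path = "/tmp/demo";
	struct filename_rec *a = get_name(user_path);
	struct filename_rec *b = get_name(user_path);

	printf("reused: %s\n", a == b ? "yes" : "no");  /* prints "yes" */
	return 0;
}
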
2046/**
2037 * audit_getname - add a name to the list 2047 * audit_getname - add a name to the list
2038 * @name: name to add 2048 * @name: name to add
2039 * 2049 *
2040 * Add a name to the list of audit names for this context. 2050 * Add a name to the list of audit names for this context.
2041 * Called from fs/namei.c:getname(). 2051 * Called from fs/namei.c:getname().
2042 */ 2052 */
2043void __audit_getname(const char *name) 2053void __audit_getname(struct filename *name)
2044{ 2054{
2045 struct audit_context *context = current->audit_context; 2055 struct audit_context *context = current->audit_context;
2046 struct audit_names *n; 2056 struct audit_names *n;
@@ -2054,13 +2064,19 @@ void __audit_getname(const char *name)
2054 return; 2064 return;
2055 } 2065 }
2056 2066
2057 n = audit_alloc_name(context); 2067#if AUDIT_DEBUG
2068 /* The filename _must_ have a populated ->name */
2069 BUG_ON(!name->name);
2070#endif
2071
2072 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
2058 if (!n) 2073 if (!n)
2059 return; 2074 return;
2060 2075
2061 n->name = name; 2076 n->name = name;
2062 n->name_len = AUDIT_NAME_FULL; 2077 n->name_len = AUDIT_NAME_FULL;
2063 n->name_put = true; 2078 n->name_put = true;
2079 name->aname = n;
2064 2080
2065 if (!context->pwd.dentry) 2081 if (!context->pwd.dentry)
2066 get_fs_pwd(current->fs, &context->pwd); 2082 get_fs_pwd(current->fs, &context->pwd);
@@ -2073,7 +2089,7 @@ void __audit_getname(const char *name)
2073 * then we delay the putname until syscall exit. 2089 * then we delay the putname until syscall exit.
2074 * Called from include/linux/fs.h:putname(). 2090 * Called from include/linux/fs.h:putname().
2075 */ 2091 */
2076void audit_putname(const char *name) 2092void audit_putname(struct filename *name)
2077{ 2093{
2078 struct audit_context *context = current->audit_context; 2094 struct audit_context *context = current->audit_context;
2079 2095
@@ -2088,7 +2104,7 @@ void audit_putname(const char *name)
2088 2104
2089 list_for_each_entry(n, &context->names_list, list) 2105 list_for_each_entry(n, &context->names_list, list)
2090 printk(KERN_ERR "name[%d] = %p = %s\n", i, 2106 printk(KERN_ERR "name[%d] = %p = %s\n", i,
2091 n->name, n->name ?: "(null)"); 2107 n->name, n->name->name ?: "(null)");
2092 } 2108 }
2093#endif 2109#endif
2094 __putname(name); 2110 __putname(name);
@@ -2102,8 +2118,8 @@ void audit_putname(const char *name)
2102 " put_count=%d\n", 2118 " put_count=%d\n",
2103 __FILE__, __LINE__, 2119 __FILE__, __LINE__,
2104 context->serial, context->major, 2120 context->serial, context->major,
2105 context->in_syscall, name, context->name_count, 2121 context->in_syscall, name->name,
2106 context->put_count); 2122 context->name_count, context->put_count);
2107 dump_stack(); 2123 dump_stack();
2108 } 2124 }
2109 } 2125 }
@@ -2146,13 +2162,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
2146} 2162}
2147 2163
2148/** 2164/**
2149 * audit_inode - store the inode and device from a lookup 2165 * __audit_inode - store the inode and device from a lookup
2150 * @name: name being audited 2166 * @name: name being audited
2151 * @dentry: dentry being audited 2167 * @dentry: dentry being audited
2152 * 2168 * @parent: does this dentry represent the parent?
2153 * Called from fs/namei.c:path_lookup().
2154 */ 2169 */
2155void __audit_inode(const char *name, const struct dentry *dentry) 2170void __audit_inode(struct filename *name, const struct dentry *dentry,
2171 unsigned int parent)
2156{ 2172{
2157 struct audit_context *context = current->audit_context; 2173 struct audit_context *context = current->audit_context;
2158 const struct inode *inode = dentry->d_inode; 2174 const struct inode *inode = dentry->d_inode;
@@ -2161,24 +2177,69 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2161 if (!context->in_syscall) 2177 if (!context->in_syscall)
2162 return; 2178 return;
2163 2179
2180 if (!name)
2181 goto out_alloc;
2182
2183#if AUDIT_DEBUG
2184 /* The struct filename _must_ have a populated ->name */
2185 BUG_ON(!name->name);
2186#endif
2187 /*
2188 * If we have a pointer to an audit_names entry already, then we can
2189 * just use it directly if the type is correct.
2190 */
2191 n = name->aname;
2192 if (n) {
2193 if (parent) {
2194 if (n->type == AUDIT_TYPE_PARENT ||
2195 n->type == AUDIT_TYPE_UNKNOWN)
2196 goto out;
2197 } else {
2198 if (n->type != AUDIT_TYPE_PARENT)
2199 goto out;
2200 }
2201 }
2202
2164 list_for_each_entry_reverse(n, &context->names_list, list) { 2203 list_for_each_entry_reverse(n, &context->names_list, list) {
2165 if (n->name && (n->name == name)) 2204 /* does the name pointer match? */
2166 goto out; 2205 if (!n->name || n->name->name != name->name)
2206 continue;
2207
2208 /* match the correct record type */
2209 if (parent) {
2210 if (n->type == AUDIT_TYPE_PARENT ||
2211 n->type == AUDIT_TYPE_UNKNOWN)
2212 goto out;
2213 } else {
2214 if (n->type != AUDIT_TYPE_PARENT)
2215 goto out;
2216 }
2167 } 2217 }
2168 2218
2169 /* unable to find the name from a previous getname() */ 2219out_alloc:
2170 n = audit_alloc_name(context); 2220 /* unable to find the name from a previous getname(). Allocate a new
2221 * anonymous entry.
2222 */
2223 n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
2171 if (!n) 2224 if (!n)
2172 return; 2225 return;
2173out: 2226out:
2227 if (parent) {
2228 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
2229 n->type = AUDIT_TYPE_PARENT;
2230 } else {
2231 n->name_len = AUDIT_NAME_FULL;
2232 n->type = AUDIT_TYPE_NORMAL;
2233 }
2174 handle_path(dentry); 2234 handle_path(dentry);
2175 audit_copy_inode(n, dentry, inode); 2235 audit_copy_inode(n, dentry, inode);
2176} 2236}
2177 2237
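
When __audit_inode() is told the dentry is a parent, it records only the directory component of the saved name via parent_len(). A rough user-space sketch of what such a helper computes, under the assumption that it returns the length of the leading directory part with trailing slashes on the final component ignored; this mirrors the idea, not the kernel's exact implementation:

#include <stdio.h>
#include <string.h>

static size_t parent_component_len(const char *path)
{
	size_t len = strlen(path);

	/* ignore trailing slashes on the final component */
	while (len > 1 && path[len - 1] == '/')
		len--;
	/* strip the final component itself */
	while (len > 0 && path[len - 1] != '/')
		len--;
	return len;     /* 0 means "no parent directory in the string" */
}

int main(void)
{
	printf("%zu\n", parent_component_len("/etc/passwd"));   /* 5 -> "/etc/" */
	printf("%zu\n", parent_component_len("dir/file/"));     /* 4 -> "dir/"  */
	printf("%zu\n", parent_component_len("file"));          /* 0           */
	return 0;
}
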
2178/** 2238/**
2179 * audit_inode_child - collect inode info for created/removed objects 2239 * __audit_inode_child - collect inode info for created/removed objects
2180 * @dentry: dentry being audited
2181 * @parent: inode of dentry parent 2240 * @parent: inode of dentry parent
2241 * @dentry: dentry being audited
2242 * @type: AUDIT_TYPE_* value that we're looking for
2182 * 2243 *
2183 * For syscalls that create or remove filesystem objects, audit_inode 2244 * For syscalls that create or remove filesystem objects, audit_inode
2184 * can only collect information for the filesystem object's parent. 2245 * can only collect information for the filesystem object's parent.
@@ -2188,15 +2249,14 @@ out:
2188 * must be hooked prior, in order to capture the target inode during 2249 * must be hooked prior, in order to capture the target inode during
2189 * unsuccessful attempts. 2250 * unsuccessful attempts.
2190 */ 2251 */
2191void __audit_inode_child(const struct dentry *dentry, 2252void __audit_inode_child(const struct inode *parent,
2192 const struct inode *parent) 2253 const struct dentry *dentry,
2254 const unsigned char type)
2193{ 2255{
2194 struct audit_context *context = current->audit_context; 2256 struct audit_context *context = current->audit_context;
2195 const char *found_parent = NULL, *found_child = NULL;
2196 const struct inode *inode = dentry->d_inode; 2257 const struct inode *inode = dentry->d_inode;
2197 const char *dname = dentry->d_name.name; 2258 const char *dname = dentry->d_name.name;
2198 struct audit_names *n; 2259 struct audit_names *n, *found_parent = NULL, *found_child = NULL;
2199 int dirlen = 0;
2200 2260
2201 if (!context->in_syscall) 2261 if (!context->in_syscall)
2202 return; 2262 return;
@@ -2204,62 +2264,65 @@ void __audit_inode_child(const struct dentry *dentry,
2204 if (inode) 2264 if (inode)
2205 handle_one(inode); 2265 handle_one(inode);
2206 2266
2207 /* parent is more likely, look for it first */ 2267 /* look for a parent entry first */
2208 list_for_each_entry(n, &context->names_list, list) { 2268 list_for_each_entry(n, &context->names_list, list) {
2209 if (!n->name) 2269 if (!n->name || n->type != AUDIT_TYPE_PARENT)
2210 continue; 2270 continue;
2211 2271
2212 if (n->ino == parent->i_ino && 2272 if (n->ino == parent->i_ino &&
2213 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2273 !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
2214 n->name_len = dirlen; /* update parent data in place */ 2274 found_parent = n;
2215 found_parent = n->name; 2275 break;
2216 goto add_names;
2217 } 2276 }
2218 } 2277 }
2219 2278
2220 /* no matching parent, look for matching child */ 2279 /* is there a matching child entry? */
2221 list_for_each_entry(n, &context->names_list, list) { 2280 list_for_each_entry(n, &context->names_list, list) {
2222 if (!n->name) 2281 /* can only match entries that have a name */
2282 if (!n->name || n->type != type)
2223 continue; 2283 continue;
2224 2284
2225 /* strcmp() is the more likely scenario */ 2285 /* if we found a parent, make sure this one is a child of it */
2226 if (!strcmp(dname, n->name) || 2286 if (found_parent && (n->name != found_parent->name))
2227 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2287 continue;
2228 if (inode) 2288
2229 audit_copy_inode(n, NULL, inode); 2289 if (!strcmp(dname, n->name->name) ||
2230 else 2290 !audit_compare_dname_path(dname, n->name->name,
2231 n->ino = (unsigned long)-1; 2291 found_parent ?
2232 found_child = n->name; 2292 found_parent->name_len :
2233 goto add_names; 2293 AUDIT_NAME_FULL)) {
2294 found_child = n;
2295 break;
2234 } 2296 }
2235 } 2297 }
2236 2298
2237add_names:
2238 if (!found_parent) { 2299 if (!found_parent) {
2239 n = audit_alloc_name(context); 2300 /* create a new, "anonymous" parent record */
2301 n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
2240 if (!n) 2302 if (!n)
2241 return; 2303 return;
2242 audit_copy_inode(n, NULL, parent); 2304 audit_copy_inode(n, NULL, parent);
2243 } 2305 }
2244 2306
2245 if (!found_child) { 2307 if (!found_child) {
2246 n = audit_alloc_name(context); 2308 found_child = audit_alloc_name(context, type);
2247 if (!n) 2309 if (!found_child)
2248 return; 2310 return;
2249 2311
2250 /* Re-use the name belonging to the slot for a matching parent 2312 /* Re-use the name belonging to the slot for a matching parent
2251 * directory. All names for this context are relinquished in 2313 * directory. All names for this context are relinquished in
2252 * audit_free_names() */ 2314 * audit_free_names() */
2253 if (found_parent) { 2315 if (found_parent) {
2254 n->name = found_parent; 2316 found_child->name = found_parent->name;
2255 n->name_len = AUDIT_NAME_FULL; 2317 found_child->name_len = AUDIT_NAME_FULL;
2256 /* don't call __putname() */ 2318 /* don't call __putname() */
2257 n->name_put = false; 2319 found_child->name_put = false;
2258 } 2320 }
2259
2260 if (inode)
2261 audit_copy_inode(n, NULL, inode);
2262 } 2321 }
2322 if (inode)
2323 audit_copy_inode(found_child, dentry, inode);
2324 else
2325 found_child->ino = (unsigned long)-1;
2263} 2326}
2264EXPORT_SYMBOL_GPL(__audit_inode_child); 2327EXPORT_SYMBOL_GPL(__audit_inode_child);
2265 2328
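
The child lookup in __audit_inode_child() matches a recorded pathname against the dentry name either by a plain strcmp() or, when a parent length is known, by comparing only the final component. A simplified stand-in for that audit_compare_dname_path()-style test; the helper below is invented for illustration and returns 0 on a match, following the strcmp() convention:

#include <stdio.h>
#include <string.h>

static int dname_matches_path(const char *dname, const char *path,
			      size_t parentlen)
{
	const char *last;

	if (parentlen) {
		last = path + parentlen;    /* caller knows where the parent ends */
	} else {
		const char *slash = strrchr(path, '/');

		last = slash ? slash + 1 : path;
	}
	return strcmp(dname, last);
}

int main(void)
{
	printf("%d\n", dname_matches_path("passwd", "/etc/passwd", 5)); /* 0: match */
	printf("%d\n", dname_matches_path("shadow", "/etc/passwd", 0)); /* non-zero */
	return 0;
}
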
@@ -2299,14 +2362,14 @@ static atomic_t session_id = ATOMIC_INIT(0);
2299 * 2362 *
2300 * Called (set) from fs/proc/base.c::proc_loginuid_write(). 2363 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2301 */ 2364 */
2302int audit_set_loginuid(uid_t loginuid) 2365int audit_set_loginuid(kuid_t loginuid)
2303{ 2366{
2304 struct task_struct *task = current; 2367 struct task_struct *task = current;
2305 struct audit_context *context = task->audit_context; 2368 struct audit_context *context = task->audit_context;
2306 unsigned int sessionid; 2369 unsigned int sessionid;
2307 2370
2308#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE 2371#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2309 if (task->loginuid != -1) 2372 if (uid_valid(task->loginuid))
2310 return -EPERM; 2373 return -EPERM;
2311#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ 2374#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2312 if (!capable(CAP_AUDIT_CONTROL)) 2375 if (!capable(CAP_AUDIT_CONTROL))
@@ -2322,8 +2385,10 @@ int audit_set_loginuid(uid_t loginuid)
2322 audit_log_format(ab, "login pid=%d uid=%u " 2385 audit_log_format(ab, "login pid=%d uid=%u "
2323 "old auid=%u new auid=%u" 2386 "old auid=%u new auid=%u"
2324 " old ses=%u new ses=%u", 2387 " old ses=%u new ses=%u",
2325 task->pid, task_uid(task), 2388 task->pid,
2326 task->loginuid, loginuid, 2389 from_kuid(&init_user_ns, task_uid(task)),
2390 from_kuid(&init_user_ns, task->loginuid),
2391 from_kuid(&init_user_ns, loginuid),
2327 task->sessionid, sessionid); 2392 task->sessionid, sessionid);
2328 audit_log_end(ab); 2393 audit_log_end(ab);
2329 } 2394 }
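
With CONFIG_AUDIT_LOGINUID_IMMUTABLE, the guard above changes from a "!= -1" test to uid_valid(), since -1 is no longer a value the opaque kuid_t can be compared against directly. A minimal illustration of the resulting "set once" rule, using a simplified xuid_t stand-in and the usual (unsigned int)-1 sentinel (these are assumptions for the sketch, not the kernel types):

#include <errno.h>
#include <stdio.h>

typedef struct { unsigned int val; } xuid_t;            /* stand-in for kuid_t */

static int xuid_valid(xuid_t u) { return u.val != (unsigned int)-1; }

static xuid_t loginuid = { (unsigned int)-1 };          /* "not yet set" */

static int set_loginuid_immutable(xuid_t new)
{
	if (xuid_valid(loginuid))
		return -EPERM;          /* already set once: refuse */
	loginuid = new;
	return 0;
}

int main(void)
{
	printf("%d\n", set_loginuid_immutable((xuid_t){ 1000 }));   /* 0 */
	printf("%d\n", set_loginuid_immutable((xuid_t){ 1001 }));   /* -1 (EPERM) */
	return 0;
}
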
@@ -2546,12 +2611,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
2546 struct audit_aux_data_pids *axp; 2611 struct audit_aux_data_pids *axp;
2547 struct task_struct *tsk = current; 2612 struct task_struct *tsk = current;
2548 struct audit_context *ctx = tsk->audit_context; 2613 struct audit_context *ctx = tsk->audit_context;
2549 uid_t uid = current_uid(), t_uid = task_uid(t); 2614 kuid_t uid = current_uid(), t_uid = task_uid(t);
2550 2615
2551 if (audit_pid && t->tgid == audit_pid) { 2616 if (audit_pid && t->tgid == audit_pid) {
2552 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2617 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
2553 audit_sig_pid = tsk->pid; 2618 audit_sig_pid = tsk->pid;
2554 if (tsk->loginuid != -1) 2619 if (uid_valid(tsk->loginuid))
2555 audit_sig_uid = tsk->loginuid; 2620 audit_sig_uid = tsk->loginuid;
2556 else 2621 else
2557 audit_sig_uid = uid; 2622 audit_sig_uid = uid;
@@ -2672,8 +2737,8 @@ void __audit_mmap_fd(int fd, int flags)
2672 2737
2673static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) 2738static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2674{ 2739{
2675 uid_t auid, uid; 2740 kuid_t auid, uid;
2676 gid_t gid; 2741 kgid_t gid;
2677 unsigned int sessionid; 2742 unsigned int sessionid;
2678 2743
2679 auid = audit_get_loginuid(current); 2744 auid = audit_get_loginuid(current);
@@ -2681,7 +2746,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2681 current_uid_gid(&uid, &gid); 2746 current_uid_gid(&uid, &gid);
2682 2747
2683 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2748 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2684 auid, uid, gid, sessionid); 2749 from_kuid(&init_user_ns, auid),
2750 from_kuid(&init_user_ns, uid),
2751 from_kgid(&init_user_ns, gid),
2752 sessionid);
2685 audit_log_task_context(ab); 2753 audit_log_task_context(ab);
2686 audit_log_format(ab, " pid=%d comm=", current->pid); 2754 audit_log_format(ab, " pid=%d comm=", current->pid);
2687 audit_log_untrustedstring(ab, current->comm); 2755 audit_log_untrustedstring(ab, current->comm);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 79818507e444..13774b3b39aa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
88 88
89/* 89/*
90 * Generate an array of cgroup subsystem pointers. At boot time, this is 90 * Generate an array of cgroup subsystem pointers. At boot time, this is
 91 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are 91 * populated with the built-in subsystems, and modular subsystems are
92 * registered after that. The mutable section of this array is protected by 92 * registered after that. The mutable section of this array is protected by
93 * cgroup_mutex. 93 * cgroup_mutex.
94 */ 94 */
95#define SUBSYS(_x) &_x ## _subsys, 95#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
96#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
96static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 97static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
97#include <linux/cgroup_subsys.h> 98#include <linux/cgroup_subsys.h>
98}; 99};
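
The new SUBSYS() definition places each built-in subsystem at its _subsys_id slot with a designated initializer, so slots reserved for modular subsystems simply stay NULL until those modules register. A compilable illustration of the pattern, with invented subsystem names standing in for the real controllers:

#include <stdio.h>

struct subsys { const char *name; };

static struct subsys cpu_like = { "cpu-like" };
static struct subsys mem_like = { "mem-like" };

enum { cpu_like_id, mem_like_id, modular_id, SUBSYS_COUNT };

#define SUBSYS(x) [x##_id] = &x,
static struct subsys *subsys[SUBSYS_COUNT] = {
	SUBSYS(cpu_like)
	SUBSYS(mem_like)
	/* modular_id intentionally left NULL until registration time */
};
#undef SUBSYS

int main(void)
{
	for (int i = 0; i < SUBSYS_COUNT; i++)
		printf("slot %d: %s\n", i,
		       subsys[i] ? subsys[i]->name : "(not loaded)");
	return 0;
}

With positional initializers the array order had to match the enum exactly; the designated form makes each slot explicit and tolerates gaps for the not-yet-loaded modular controllers.
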
@@ -111,13 +112,13 @@ struct cgroupfs_root {
111 * The bitmask of subsystems intended to be attached to this 112 * The bitmask of subsystems intended to be attached to this
112 * hierarchy 113 * hierarchy
113 */ 114 */
114 unsigned long subsys_bits; 115 unsigned long subsys_mask;
115 116
116 /* Unique id for this hierarchy. */ 117 /* Unique id for this hierarchy. */
117 int hierarchy_id; 118 int hierarchy_id;
118 119
119 /* The bitmask of subsystems currently attached to this hierarchy */ 120 /* The bitmask of subsystems currently attached to this hierarchy */
120 unsigned long actual_subsys_bits; 121 unsigned long actual_subsys_mask;
121 122
122 /* A list running through the attached subsystems */ 123 /* A list running through the attached subsystems */
123 struct list_head subsys_list; 124 struct list_head subsys_list;
@@ -276,7 +277,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
276 277
277/* bits in struct cgroupfs_root flags field */ 278/* bits in struct cgroupfs_root flags field */
278enum { 279enum {
279 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 280 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
281 ROOT_XATTR, /* supports extended attributes */
280}; 282};
281 283
282static int cgroup_is_releasable(const struct cgroup *cgrp) 284static int cgroup_is_releasable(const struct cgroup *cgrp)
@@ -556,7 +558,7 @@ static struct css_set *find_existing_css_set(
556 * won't change, so no need for locking. 558 * won't change, so no need for locking.
557 */ 559 */
558 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 560 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
559 if (root->subsys_bits & (1UL << i)) { 561 if (root->subsys_mask & (1UL << i)) {
560 /* Subsystem is in this hierarchy. So we want 562 /* Subsystem is in this hierarchy. So we want
561 * the subsystem state from the new 563 * the subsystem state from the new
562 * cgroup */ 564 * cgroup */
@@ -824,7 +826,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
824static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 826static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
825static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); 827static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
826static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 828static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
827static int cgroup_populate_dir(struct cgroup *cgrp); 829static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
830 unsigned long subsys_mask);
828static const struct inode_operations cgroup_dir_inode_operations; 831static const struct inode_operations cgroup_dir_inode_operations;
829static const struct file_operations proc_cgroupstats_operations; 832static const struct file_operations proc_cgroupstats_operations;
830 833
@@ -912,15 +915,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
912 */ 915 */
913 BUG_ON(!list_empty(&cgrp->pidlists)); 916 BUG_ON(!list_empty(&cgrp->pidlists));
914 917
918 simple_xattrs_free(&cgrp->xattrs);
919
915 kfree_rcu(cgrp, rcu_head); 920 kfree_rcu(cgrp, rcu_head);
916 } else { 921 } else {
917 struct cfent *cfe = __d_cfe(dentry); 922 struct cfent *cfe = __d_cfe(dentry);
918 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 923 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
924 struct cftype *cft = cfe->type;
919 925
920 WARN_ONCE(!list_empty(&cfe->node) && 926 WARN_ONCE(!list_empty(&cfe->node) &&
921 cgrp != &cgrp->root->top_cgroup, 927 cgrp != &cgrp->root->top_cgroup,
922 "cfe still linked for %s\n", cfe->type->name); 928 "cfe still linked for %s\n", cfe->type->name);
923 kfree(cfe); 929 kfree(cfe);
930 simple_xattrs_free(&cft->xattrs);
924 } 931 }
925 iput(inode); 932 iput(inode);
926} 933}
@@ -963,12 +970,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
963 return -ENOENT; 970 return -ENOENT;
964} 971}
965 972
966static void cgroup_clear_directory(struct dentry *dir) 973/**
974 * cgroup_clear_directory - selective removal of base and subsystem files
975 * @dir: directory containing the files
976 * @base_files: true if the base files should be removed
977 * @subsys_mask: mask of the subsystem ids whose files should be removed
978 */
979static void cgroup_clear_directory(struct dentry *dir, bool base_files,
980 unsigned long subsys_mask)
967{ 981{
968 struct cgroup *cgrp = __d_cgrp(dir); 982 struct cgroup *cgrp = __d_cgrp(dir);
983 struct cgroup_subsys *ss;
969 984
970 while (!list_empty(&cgrp->files)) 985 for_each_subsys(cgrp->root, ss) {
971 cgroup_rm_file(cgrp, NULL); 986 struct cftype_set *set;
987 if (!test_bit(ss->subsys_id, &subsys_mask))
988 continue;
989 list_for_each_entry(set, &ss->cftsets, node)
990 cgroup_rm_file(cgrp, set->cfts);
991 }
992 if (base_files) {
993 while (!list_empty(&cgrp->files))
994 cgroup_rm_file(cgrp, NULL);
995 }
972} 996}
973 997
974/* 998/*
@@ -977,8 +1001,9 @@ static void cgroup_clear_directory(struct dentry *dir)
977static void cgroup_d_remove_dir(struct dentry *dentry) 1001static void cgroup_d_remove_dir(struct dentry *dentry)
978{ 1002{
979 struct dentry *parent; 1003 struct dentry *parent;
1004 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
980 1005
981 cgroup_clear_directory(dentry); 1006 cgroup_clear_directory(dentry, true, root->subsys_mask);
982 1007
983 parent = dentry->d_parent; 1008 parent = dentry->d_parent;
984 spin_lock(&parent->d_lock); 1009 spin_lock(&parent->d_lock);
@@ -1022,22 +1047,22 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1022 * returns an error, no reference counts are touched. 1047 * returns an error, no reference counts are touched.
1023 */ 1048 */
1024static int rebind_subsystems(struct cgroupfs_root *root, 1049static int rebind_subsystems(struct cgroupfs_root *root,
1025 unsigned long final_bits) 1050 unsigned long final_subsys_mask)
1026{ 1051{
1027 unsigned long added_bits, removed_bits; 1052 unsigned long added_mask, removed_mask;
1028 struct cgroup *cgrp = &root->top_cgroup; 1053 struct cgroup *cgrp = &root->top_cgroup;
1029 int i; 1054 int i;
1030 1055
1031 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1056 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1032 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1057 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1033 1058
1034 removed_bits = root->actual_subsys_bits & ~final_bits; 1059 removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
1035 added_bits = final_bits & ~root->actual_subsys_bits; 1060 added_mask = final_subsys_mask & ~root->actual_subsys_mask;
1036 /* Check that any added subsystems are currently free */ 1061 /* Check that any added subsystems are currently free */
1037 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1062 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1038 unsigned long bit = 1UL << i; 1063 unsigned long bit = 1UL << i;
1039 struct cgroup_subsys *ss = subsys[i]; 1064 struct cgroup_subsys *ss = subsys[i];
1040 if (!(bit & added_bits)) 1065 if (!(bit & added_mask))
1041 continue; 1066 continue;
1042 /* 1067 /*
1043 * Nobody should tell us to do a subsys that doesn't exist: 1068 * Nobody should tell us to do a subsys that doesn't exist:
@@ -1062,7 +1087,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1062 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1087 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1063 struct cgroup_subsys *ss = subsys[i]; 1088 struct cgroup_subsys *ss = subsys[i];
1064 unsigned long bit = 1UL << i; 1089 unsigned long bit = 1UL << i;
1065 if (bit & added_bits) { 1090 if (bit & added_mask) {
1066 /* We're binding this subsystem to this hierarchy */ 1091 /* We're binding this subsystem to this hierarchy */
1067 BUG_ON(ss == NULL); 1092 BUG_ON(ss == NULL);
1068 BUG_ON(cgrp->subsys[i]); 1093 BUG_ON(cgrp->subsys[i]);
@@ -1075,7 +1100,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1075 if (ss->bind) 1100 if (ss->bind)
1076 ss->bind(cgrp); 1101 ss->bind(cgrp);
1077 /* refcount was already taken, and we're keeping it */ 1102 /* refcount was already taken, and we're keeping it */
1078 } else if (bit & removed_bits) { 1103 } else if (bit & removed_mask) {
1079 /* We're removing this subsystem */ 1104 /* We're removing this subsystem */
1080 BUG_ON(ss == NULL); 1105 BUG_ON(ss == NULL);
1081 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1106 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
@@ -1088,7 +1113,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1088 list_move(&ss->sibling, &rootnode.subsys_list); 1113 list_move(&ss->sibling, &rootnode.subsys_list);
1089 /* subsystem is now free - drop reference on module */ 1114 /* subsystem is now free - drop reference on module */
1090 module_put(ss->module); 1115 module_put(ss->module);
1091 } else if (bit & final_bits) { 1116 } else if (bit & final_subsys_mask) {
1092 /* Subsystem state should already exist */ 1117 /* Subsystem state should already exist */
1093 BUG_ON(ss == NULL); 1118 BUG_ON(ss == NULL);
1094 BUG_ON(!cgrp->subsys[i]); 1119 BUG_ON(!cgrp->subsys[i]);
@@ -1105,7 +1130,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1105 BUG_ON(cgrp->subsys[i]); 1130 BUG_ON(cgrp->subsys[i]);
1106 } 1131 }
1107 } 1132 }
1108 root->subsys_bits = root->actual_subsys_bits = final_bits; 1133 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1109 synchronize_rcu(); 1134 synchronize_rcu();
1110 1135
1111 return 0; 1136 return 0;
@@ -1121,6 +1146,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1121 seq_printf(seq, ",%s", ss->name); 1146 seq_printf(seq, ",%s", ss->name);
1122 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1147 if (test_bit(ROOT_NOPREFIX, &root->flags))
1123 seq_puts(seq, ",noprefix"); 1148 seq_puts(seq, ",noprefix");
1149 if (test_bit(ROOT_XATTR, &root->flags))
1150 seq_puts(seq, ",xattr");
1124 if (strlen(root->release_agent_path)) 1151 if (strlen(root->release_agent_path))
1125 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1152 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1126 if (clone_children(&root->top_cgroup)) 1153 if (clone_children(&root->top_cgroup))
@@ -1132,7 +1159,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1132} 1159}
1133 1160
1134struct cgroup_sb_opts { 1161struct cgroup_sb_opts {
1135 unsigned long subsys_bits; 1162 unsigned long subsys_mask;
1136 unsigned long flags; 1163 unsigned long flags;
1137 char *release_agent; 1164 char *release_agent;
1138 bool clone_children; 1165 bool clone_children;
@@ -1189,6 +1216,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1189 opts->clone_children = true; 1216 opts->clone_children = true;
1190 continue; 1217 continue;
1191 } 1218 }
1219 if (!strcmp(token, "xattr")) {
1220 set_bit(ROOT_XATTR, &opts->flags);
1221 continue;
1222 }
1192 if (!strncmp(token, "release_agent=", 14)) { 1223 if (!strncmp(token, "release_agent=", 14)) {
1193 /* Specifying two release agents is forbidden */ 1224 /* Specifying two release agents is forbidden */
1194 if (opts->release_agent) 1225 if (opts->release_agent)
@@ -1237,7 +1268,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1237 /* Mutually exclusive option 'all' + subsystem name */ 1268 /* Mutually exclusive option 'all' + subsystem name */
1238 if (all_ss) 1269 if (all_ss)
1239 return -EINVAL; 1270 return -EINVAL;
1240 set_bit(i, &opts->subsys_bits); 1271 set_bit(i, &opts->subsys_mask);
1241 one_ss = true; 1272 one_ss = true;
1242 1273
1243 break; 1274 break;
@@ -1258,7 +1289,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1258 continue; 1289 continue;
1259 if (ss->disabled) 1290 if (ss->disabled)
1260 continue; 1291 continue;
1261 set_bit(i, &opts->subsys_bits); 1292 set_bit(i, &opts->subsys_mask);
1262 } 1293 }
1263 } 1294 }
1264 1295
@@ -1270,19 +1301,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1270 * the cpuset subsystem. 1301 * the cpuset subsystem.
1271 */ 1302 */
1272 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1303 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
1273 (opts->subsys_bits & mask)) 1304 (opts->subsys_mask & mask))
1274 return -EINVAL; 1305 return -EINVAL;
1275 1306
1276 1307
1277 /* Can't specify "none" and some subsystems */ 1308 /* Can't specify "none" and some subsystems */
1278 if (opts->subsys_bits && opts->none) 1309 if (opts->subsys_mask && opts->none)
1279 return -EINVAL; 1310 return -EINVAL;
1280 1311
1281 /* 1312 /*
1282 * We either have to specify by name or by subsystems. (So all 1313 * We either have to specify by name or by subsystems. (So all
1283 * empty hierarchies must have a name). 1314 * empty hierarchies must have a name).
1284 */ 1315 */
1285 if (!opts->subsys_bits && !opts->name) 1316 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1317 return -EINVAL;
1287 1318
1288 /* 1319 /*
@@ -1291,10 +1322,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1291 * take duplicate reference counts on a subsystem that's already used, 1322 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case. 1323 * but rebind_subsystems handles this case.
1293 */ 1324 */
1294 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1325 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1295 unsigned long bit = 1UL << i; 1326 unsigned long bit = 1UL << i;
1296 1327
1297 if (!(bit & opts->subsys_bits)) 1328 if (!(bit & opts->subsys_mask))
1298 continue; 1329 continue;
1299 if (!try_module_get(subsys[i]->module)) { 1330 if (!try_module_get(subsys[i]->module)) {
1300 module_pin_failed = true; 1331 module_pin_failed = true;
@@ -1307,11 +1338,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1307 * raced with a module_delete call, and to the user this is 1338 * raced with a module_delete call, and to the user this is
1308 * essentially a "subsystem doesn't exist" case. 1339 * essentially a "subsystem doesn't exist" case.
1309 */ 1340 */
1310 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { 1341 for (i--; i >= 0; i--) {
1311 /* drop refcounts only on the ones we took */ 1342 /* drop refcounts only on the ones we took */
1312 unsigned long bit = 1UL << i; 1343 unsigned long bit = 1UL << i;
1313 1344
1314 if (!(bit & opts->subsys_bits)) 1345 if (!(bit & opts->subsys_mask))
1315 continue; 1346 continue;
1316 module_put(subsys[i]->module); 1347 module_put(subsys[i]->module);
1317 } 1348 }
@@ -1321,13 +1352,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1321 return 0; 1352 return 0;
1322} 1353}
1323 1354
1324static void drop_parsed_module_refcounts(unsigned long subsys_bits) 1355static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1325{ 1356{
1326 int i; 1357 int i;
1327 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1358 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1328 unsigned long bit = 1UL << i; 1359 unsigned long bit = 1UL << i;
1329 1360
1330 if (!(bit & subsys_bits)) 1361 if (!(bit & subsys_mask))
1331 continue; 1362 continue;
1332 module_put(subsys[i]->module); 1363 module_put(subsys[i]->module);
1333 } 1364 }
@@ -1339,6 +1370,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1339 struct cgroupfs_root *root = sb->s_fs_info; 1370 struct cgroupfs_root *root = sb->s_fs_info;
1340 struct cgroup *cgrp = &root->top_cgroup; 1371 struct cgroup *cgrp = &root->top_cgroup;
1341 struct cgroup_sb_opts opts; 1372 struct cgroup_sb_opts opts;
1373 unsigned long added_mask, removed_mask;
1342 1374
1343 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1375 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1344 mutex_lock(&cgroup_mutex); 1376 mutex_lock(&cgroup_mutex);
@@ -1350,27 +1382,31 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1350 goto out_unlock; 1382 goto out_unlock;
1351 1383
1352 /* See feature-removal-schedule.txt */ 1384 /* See feature-removal-schedule.txt */
1353 if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) 1385 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1354 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1386 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1355 task_tgid_nr(current), current->comm); 1387 task_tgid_nr(current), current->comm);
1356 1388
1389 added_mask = opts.subsys_mask & ~root->subsys_mask;
1390 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1391
1357 /* Don't allow flags or name to change at remount */ 1392 /* Don't allow flags or name to change at remount */
1358 if (opts.flags != root->flags || 1393 if (opts.flags != root->flags ||
1359 (opts.name && strcmp(opts.name, root->name))) { 1394 (opts.name && strcmp(opts.name, root->name))) {
1360 ret = -EINVAL; 1395 ret = -EINVAL;
1361 drop_parsed_module_refcounts(opts.subsys_bits); 1396 drop_parsed_module_refcounts(opts.subsys_mask);
1362 goto out_unlock; 1397 goto out_unlock;
1363 } 1398 }
1364 1399
1365 ret = rebind_subsystems(root, opts.subsys_bits); 1400 ret = rebind_subsystems(root, opts.subsys_mask);
1366 if (ret) { 1401 if (ret) {
1367 drop_parsed_module_refcounts(opts.subsys_bits); 1402 drop_parsed_module_refcounts(opts.subsys_mask);
1368 goto out_unlock; 1403 goto out_unlock;
1369 } 1404 }
1370 1405
1371 /* clear out any existing files and repopulate subsystem files */ 1406 /* clear out any existing files and repopulate subsystem files */
1372 cgroup_clear_directory(cgrp->dentry); 1407 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1373 cgroup_populate_dir(cgrp); 1408 /* re-populate subsystem files */
1409 cgroup_populate_dir(cgrp, false, added_mask);
1374 1410
1375 if (opts.release_agent) 1411 if (opts.release_agent)
1376 strcpy(root->release_agent_path, opts.release_agent); 1412 strcpy(root->release_agent_path, opts.release_agent);
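
The remount path now derives two masks, the subsystems being added (requested but not currently attached) and the ones being removed (attached but no longer requested), and then clears and repopulates only those files rather than wiping the whole directory. The mask arithmetic in isolation, with example bit patterns chosen arbitrarily:

#include <stdio.h>

int main(void)
{
	unsigned long current_mask = 0x05;      /* subsystems 0 and 2 attached */
	unsigned long requested    = 0x06;      /* remount asks for 1 and 2    */

	unsigned long added_mask   = requested    & ~current_mask;     /* 0x02 */
	unsigned long removed_mask = current_mask & ~requested;        /* 0x01 */

	printf("add %#lx, remove %#lx\n", added_mask, removed_mask);
	return 0;
}
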
@@ -1401,6 +1437,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1401 mutex_init(&cgrp->pidlist_mutex); 1437 mutex_init(&cgrp->pidlist_mutex);
1402 INIT_LIST_HEAD(&cgrp->event_list); 1438 INIT_LIST_HEAD(&cgrp->event_list);
1403 spin_lock_init(&cgrp->event_list_lock); 1439 spin_lock_init(&cgrp->event_list_lock);
1440 simple_xattrs_init(&cgrp->xattrs);
1404} 1441}
1405 1442
1406static void init_cgroup_root(struct cgroupfs_root *root) 1443static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1455,8 +1492,8 @@ static int cgroup_test_super(struct super_block *sb, void *data)
1455 * If we asked for subsystems (or explicitly for no 1492 * If we asked for subsystems (or explicitly for no
1456 * subsystems) then they must match 1493 * subsystems) then they must match
1457 */ 1494 */
1458 if ((opts->subsys_bits || opts->none) 1495 if ((opts->subsys_mask || opts->none)
1459 && (opts->subsys_bits != root->subsys_bits)) 1496 && (opts->subsys_mask != root->subsys_mask))
1460 return 0; 1497 return 0;
1461 1498
1462 return 1; 1499 return 1;
@@ -1466,7 +1503,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1466{ 1503{
1467 struct cgroupfs_root *root; 1504 struct cgroupfs_root *root;
1468 1505
1469 if (!opts->subsys_bits && !opts->none) 1506 if (!opts->subsys_mask && !opts->none)
1470 return NULL; 1507 return NULL;
1471 1508
1472 root = kzalloc(sizeof(*root), GFP_KERNEL); 1509 root = kzalloc(sizeof(*root), GFP_KERNEL);
@@ -1479,7 +1516,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1479 } 1516 }
1480 init_cgroup_root(root); 1517 init_cgroup_root(root);
1481 1518
1482 root->subsys_bits = opts->subsys_bits; 1519 root->subsys_mask = opts->subsys_mask;
1483 root->flags = opts->flags; 1520 root->flags = opts->flags;
1484 if (opts->release_agent) 1521 if (opts->release_agent)
1485 strcpy(root->release_agent_path, opts->release_agent); 1522 strcpy(root->release_agent_path, opts->release_agent);
@@ -1511,7 +1548,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1511 if (!opts->new_root) 1548 if (!opts->new_root)
1512 return -EINVAL; 1549 return -EINVAL;
1513 1550
1514 BUG_ON(!opts->subsys_bits && !opts->none); 1551 BUG_ON(!opts->subsys_mask && !opts->none);
1515 1552
1516 ret = set_anon_super(sb, NULL); 1553 ret = set_anon_super(sb, NULL);
1517 if (ret) 1554 if (ret)
@@ -1629,7 +1666,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1629 if (ret) 1666 if (ret)
1630 goto unlock_drop; 1667 goto unlock_drop;
1631 1668
1632 ret = rebind_subsystems(root, root->subsys_bits); 1669 ret = rebind_subsystems(root, root->subsys_mask);
1633 if (ret == -EBUSY) { 1670 if (ret == -EBUSY) {
1634 free_cg_links(&tmp_cg_links); 1671 free_cg_links(&tmp_cg_links);
1635 goto unlock_drop; 1672 goto unlock_drop;
@@ -1669,7 +1706,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1669 BUG_ON(root->number_of_cgroups != 1); 1706 BUG_ON(root->number_of_cgroups != 1);
1670 1707
1671 cred = override_creds(&init_cred); 1708 cred = override_creds(&init_cred);
1672 cgroup_populate_dir(root_cgrp); 1709 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1673 revert_creds(cred); 1710 revert_creds(cred);
1674 mutex_unlock(&cgroup_root_mutex); 1711 mutex_unlock(&cgroup_root_mutex);
1675 mutex_unlock(&cgroup_mutex); 1712 mutex_unlock(&cgroup_mutex);
@@ -1681,7 +1718,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1681 */ 1718 */
1682 cgroup_drop_root(opts.new_root); 1719 cgroup_drop_root(opts.new_root);
1683 /* no subsys rebinding, so refcounts don't change */ 1720 /* no subsys rebinding, so refcounts don't change */
1684 drop_parsed_module_refcounts(opts.subsys_bits); 1721 drop_parsed_module_refcounts(opts.subsys_mask);
1685 } 1722 }
1686 1723
1687 kfree(opts.release_agent); 1724 kfree(opts.release_agent);
@@ -1695,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1695 drop_new_super: 1732 drop_new_super:
1696 deactivate_locked_super(sb); 1733 deactivate_locked_super(sb);
1697 drop_modules: 1734 drop_modules:
1698 drop_parsed_module_refcounts(opts.subsys_bits); 1735 drop_parsed_module_refcounts(opts.subsys_mask);
1699 out_err: 1736 out_err:
1700 kfree(opts.release_agent); 1737 kfree(opts.release_agent);
1701 kfree(opts.name); 1738 kfree(opts.name);
@@ -1745,6 +1782,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1745 mutex_unlock(&cgroup_root_mutex); 1782 mutex_unlock(&cgroup_root_mutex);
1746 mutex_unlock(&cgroup_mutex); 1783 mutex_unlock(&cgroup_mutex);
1747 1784
1785 simple_xattrs_free(&cgrp->xattrs);
1786
1748 kill_litter_super(sb); 1787 kill_litter_super(sb);
1749 cgroup_drop_root(root); 1788 cgroup_drop_root(root);
1750} 1789}
@@ -2551,6 +2590,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2551 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2590 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2552} 2591}
2553 2592
2593static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2594{
2595 if (S_ISDIR(dentry->d_inode->i_mode))
2596 return &__d_cgrp(dentry)->xattrs;
2597 else
2598 return &__d_cft(dentry)->xattrs;
2599}
2600
2601static inline int xattr_enabled(struct dentry *dentry)
2602{
2603 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2604 return test_bit(ROOT_XATTR, &root->flags);
2605}
2606
2607static bool is_valid_xattr(const char *name)
2608{
2609 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2610 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2611 return true;
2612 return false;
2613}
2614
2615static int cgroup_setxattr(struct dentry *dentry, const char *name,
2616 const void *val, size_t size, int flags)
2617{
2618 if (!xattr_enabled(dentry))
2619 return -EOPNOTSUPP;
2620 if (!is_valid_xattr(name))
2621 return -EINVAL;
2622 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2623}
2624
2625static int cgroup_removexattr(struct dentry *dentry, const char *name)
2626{
2627 if (!xattr_enabled(dentry))
2628 return -EOPNOTSUPP;
2629 if (!is_valid_xattr(name))
2630 return -EINVAL;
2631 return simple_xattr_remove(__d_xattrs(dentry), name);
2632}
2633
2634static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2635 void *buf, size_t size)
2636{
2637 if (!xattr_enabled(dentry))
2638 return -EOPNOTSUPP;
2639 if (!is_valid_xattr(name))
2640 return -EINVAL;
2641 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2642}
2643
2644static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2645{
2646 if (!xattr_enabled(dentry))
2647 return -EOPNOTSUPP;
2648 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2649}
2650
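
With these handlers, a hierarchy mounted with the "xattr" option accepts only trusted.* and security.* attributes; other names return -EINVAL, and without the mount option every call fails with -EOPNOTSUPP. A hedged user-space example of what this enables, assuming a hypothetical mount point, a kernel carrying this patch, and the CAP_SYS_ADMIN privilege that trusted.* attributes require:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/demo/tasks";         /* hypothetical */
	const char *val  = "backup-tier";
	char buf[64];
	ssize_t n;

	if (setxattr(path, "trusted.owner", val, strlen(val), 0) != 0) {
		perror("setxattr");     /* EOPNOTSUPP unless mounted with ",xattr" */
		return 1;
	}

	n = getxattr(path, "trusted.owner", buf, sizeof(buf) - 1);
	if (n >= 0) {
		buf[n] = '\0';
		printf("trusted.owner = %s\n", buf);
	}
	return 0;
}
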
2554static const struct file_operations cgroup_file_operations = { 2651static const struct file_operations cgroup_file_operations = {
2555 .read = cgroup_file_read, 2652 .read = cgroup_file_read,
2556 .write = cgroup_file_write, 2653 .write = cgroup_file_write,
@@ -2559,11 +2656,22 @@ static const struct file_operations cgroup_file_operations = {
2559 .release = cgroup_file_release, 2656 .release = cgroup_file_release,
2560}; 2657};
2561 2658
2659static const struct inode_operations cgroup_file_inode_operations = {
2660 .setxattr = cgroup_setxattr,
2661 .getxattr = cgroup_getxattr,
2662 .listxattr = cgroup_listxattr,
2663 .removexattr = cgroup_removexattr,
2664};
2665
2562static const struct inode_operations cgroup_dir_inode_operations = { 2666static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = cgroup_lookup, 2667 .lookup = cgroup_lookup,
2564 .mkdir = cgroup_mkdir, 2668 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir, 2669 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename, 2670 .rename = cgroup_rename,
2671 .setxattr = cgroup_setxattr,
2672 .getxattr = cgroup_getxattr,
2673 .listxattr = cgroup_listxattr,
2674 .removexattr = cgroup_removexattr,
2567}; 2675};
2568 2676
2569static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2677static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -2611,6 +2719,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2611 } else if (S_ISREG(mode)) { 2719 } else if (S_ISREG(mode)) {
2612 inode->i_size = 0; 2720 inode->i_size = 0;
2613 inode->i_fop = &cgroup_file_operations; 2721 inode->i_fop = &cgroup_file_operations;
2722 inode->i_op = &cgroup_file_inode_operations;
2614 } 2723 }
2615 d_instantiate(dentry, inode); 2724 d_instantiate(dentry, inode);
2616 dget(dentry); /* Extra count - pin the dentry in core */ 2725 dget(dentry); /* Extra count - pin the dentry in core */
@@ -2671,7 +2780,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2671} 2780}
2672 2781
2673static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2782static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2674 const struct cftype *cft) 2783 struct cftype *cft)
2675{ 2784{
2676 struct dentry *dir = cgrp->dentry; 2785 struct dentry *dir = cgrp->dentry;
2677 struct cgroup *parent = __d_cgrp(dir); 2786 struct cgroup *parent = __d_cgrp(dir);
@@ -2681,6 +2790,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2681 umode_t mode; 2790 umode_t mode;
2682 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2791 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2683 2792
2793 simple_xattrs_init(&cft->xattrs);
2794
2684 /* does @cft->flags tell us to skip creation on @cgrp? */ 2795 /* does @cft->flags tell us to skip creation on @cgrp? */
2685 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2796 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2686 return 0; 2797 return 0;
@@ -2721,9 +2832,9 @@ out:
2721} 2832}
2722 2833
2723static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2834static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2724 const struct cftype cfts[], bool is_add) 2835 struct cftype cfts[], bool is_add)
2725{ 2836{
2726 const struct cftype *cft; 2837 struct cftype *cft;
2727 int err, ret = 0; 2838 int err, ret = 0;
2728 2839
2729 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2840 for (cft = cfts; cft->name[0] != '\0'; cft++) {
@@ -2757,7 +2868,7 @@ static void cgroup_cfts_prepare(void)
2757} 2868}
2758 2869
2759static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2870static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2760 const struct cftype *cfts, bool is_add) 2871 struct cftype *cfts, bool is_add)
2761 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2872 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2762{ 2873{
2763 LIST_HEAD(pending); 2874 LIST_HEAD(pending);
@@ -2808,7 +2919,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2808 * function currently returns 0 as long as @cfts registration is successful 2919 * function currently returns 0 as long as @cfts registration is successful
2809 * even if some file creation attempts on existing cgroups fail. 2920 * even if some file creation attempts on existing cgroups fail.
2810 */ 2921 */
2811int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) 2922int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2812{ 2923{
2813 struct cftype_set *set; 2924 struct cftype_set *set;
2814 2925
@@ -2838,7 +2949,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2838 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2949 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2839 * registered with @ss. 2950 * registered with @ss.
2840 */ 2951 */
2841int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) 2952int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2842{ 2953{
2843 struct cftype_set *set; 2954 struct cftype_set *set;
2844 2955
@@ -3843,18 +3954,29 @@ static struct cftype files[] = {
3843 { } /* terminate */ 3954 { } /* terminate */
3844}; 3955};
3845 3956
3846static int cgroup_populate_dir(struct cgroup *cgrp) 3957/**
 3958 * cgroup_populate_dir - selectively create files in a directory
3959 * @cgrp: target cgroup
3960 * @base_files: true if the base files should be added
3961 * @subsys_mask: mask of the subsystem ids whose files should be added
3962 */
3963static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3964 unsigned long subsys_mask)
3847{ 3965{
3848 int err; 3966 int err;
3849 struct cgroup_subsys *ss; 3967 struct cgroup_subsys *ss;
3850 3968
3851 err = cgroup_addrm_files(cgrp, NULL, files, true); 3969 if (base_files) {
3852 if (err < 0) 3970 err = cgroup_addrm_files(cgrp, NULL, files, true);
3853 return err; 3971 if (err < 0)
3972 return err;
3973 }
3854 3974
3855 /* process cftsets of each subsystem */ 3975 /* process cftsets of each subsystem */
3856 for_each_subsys(cgrp->root, ss) { 3976 for_each_subsys(cgrp->root, ss) {
3857 struct cftype_set *set; 3977 struct cftype_set *set;
3978 if (!test_bit(ss->subsys_id, &subsys_mask))
3979 continue;
3858 3980
3859 list_for_each_entry(set, &ss->cftsets, node) 3981 list_for_each_entry(set, &ss->cftsets, node)
3860 cgroup_addrm_files(cgrp, ss, set->cfts, true); 3982 cgroup_addrm_files(cgrp, ss, set->cfts, true);
@@ -3954,8 +4076,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3954 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 4076 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3955 4077
3956 for_each_subsys(root, ss) { 4078 for_each_subsys(root, ss) {
3957 struct cgroup_subsys_state *css = ss->create(cgrp); 4079 struct cgroup_subsys_state *css;
3958 4080
4081 css = ss->create(cgrp);
3959 if (IS_ERR(css)) { 4082 if (IS_ERR(css)) {
3960 err = PTR_ERR(css); 4083 err = PTR_ERR(css);
3961 goto err_destroy; 4084 goto err_destroy;
@@ -3969,6 +4092,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3969 /* At error, ->destroy() callback has to free assigned ID. */ 4092 /* At error, ->destroy() callback has to free assigned ID. */
3970 if (clone_children(parent) && ss->post_clone) 4093 if (clone_children(parent) && ss->post_clone)
3971 ss->post_clone(cgrp); 4094 ss->post_clone(cgrp);
4095
4096 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4097 parent->parent) {
4098 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4099 current->comm, current->pid, ss->name);
4100 if (!strcmp(ss->name, "memory"))
4101 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4102 ss->warned_broken_hierarchy = true;
4103 }
3972 } 4104 }
3973 4105
3974 list_add(&cgrp->sibling, &cgrp->parent->children); 4106 list_add(&cgrp->sibling, &cgrp->parent->children);
@@ -3988,7 +4120,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3988 4120
3989 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 4121 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
3990 4122
3991 err = cgroup_populate_dir(cgrp); 4123 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
3992 /* If err < 0, we have a half-filled directory - oh well ;) */ 4124 /* If err < 0, we have a half-filled directory - oh well ;) */
3993 4125
3994 mutex_unlock(&cgroup_mutex); 4126 mutex_unlock(&cgroup_mutex);
@@ -4321,8 +4453,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4321 * since cgroup_init_subsys will have already taken care of it. 4453 * since cgroup_init_subsys will have already taken care of it.
4322 */ 4454 */
4323 if (ss->module == NULL) { 4455 if (ss->module == NULL) {
4324 /* a few sanity checks */ 4456 /* a sanity check */
4325 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
4326 BUG_ON(subsys[ss->subsys_id] != ss); 4457 BUG_ON(subsys[ss->subsys_id] != ss);
4327 return 0; 4458 return 0;
4328 } 4459 }
@@ -4330,24 +4461,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4330 /* init base cftset */ 4461 /* init base cftset */
4331 cgroup_init_cftsets(ss); 4462 cgroup_init_cftsets(ss);
4332 4463
4333 /*
4334 * need to register a subsys id before anything else - for example,
4335 * init_cgroup_css needs it.
4336 */
4337 mutex_lock(&cgroup_mutex); 4464 mutex_lock(&cgroup_mutex);
4338 /* find the first empty slot in the array */ 4465 subsys[ss->subsys_id] = ss;
4339 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
4340 if (subsys[i] == NULL)
4341 break;
4342 }
4343 if (i == CGROUP_SUBSYS_COUNT) {
4344 /* maximum number of subsystems already registered! */
4345 mutex_unlock(&cgroup_mutex);
4346 return -EBUSY;
4347 }
4348 /* assign ourselves the subsys_id */
4349 ss->subsys_id = i;
4350 subsys[i] = ss;
4351 4466
4352 /* 4467 /*
4353 * no ss->create seems to need anything important in the ss struct, so 4468 * no ss->create seems to need anything important in the ss struct, so
@@ -4356,7 +4471,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4356 css = ss->create(dummytop); 4471 css = ss->create(dummytop);
4357 if (IS_ERR(css)) { 4472 if (IS_ERR(css)) {
4358 /* failure case - need to deassign the subsys[] slot. */ 4473 /* failure case - need to deassign the subsys[] slot. */
4359 subsys[i] = NULL; 4474 subsys[ss->subsys_id] = NULL;
4360 mutex_unlock(&cgroup_mutex); 4475 mutex_unlock(&cgroup_mutex);
4361 return PTR_ERR(css); 4476 return PTR_ERR(css);
4362 } 4477 }
@@ -4372,7 +4487,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4372 if (ret) { 4487 if (ret) {
4373 dummytop->subsys[ss->subsys_id] = NULL; 4488 dummytop->subsys[ss->subsys_id] = NULL;
4374 ss->destroy(dummytop); 4489 ss->destroy(dummytop);
4375 subsys[i] = NULL; 4490 subsys[ss->subsys_id] = NULL;
4376 mutex_unlock(&cgroup_mutex); 4491 mutex_unlock(&cgroup_mutex);
4377 return ret; 4492 return ret;
4378 } 4493 }
@@ -4439,7 +4554,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4439 4554
4440 mutex_lock(&cgroup_mutex); 4555 mutex_lock(&cgroup_mutex);
4441 /* deassign the subsys_id */ 4556 /* deassign the subsys_id */
4442 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
4443 subsys[ss->subsys_id] = NULL; 4557 subsys[ss->subsys_id] = NULL;
4444 4558
4445 /* remove subsystem from rootnode's list of subsystems */ 4559 /* remove subsystem from rootnode's list of subsystems */
@@ -4502,10 +4616,13 @@ int __init cgroup_init_early(void)
4502 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 4616 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4503 INIT_HLIST_HEAD(&css_set_table[i]); 4617 INIT_HLIST_HEAD(&css_set_table[i]);
4504 4618
4505 /* at bootup time, we don't worry about modular subsystems */ 4619 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4506 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4507 struct cgroup_subsys *ss = subsys[i]; 4620 struct cgroup_subsys *ss = subsys[i];
4508 4621
4622 /* at bootup time, we don't worry about modular subsystems */
4623 if (!ss || ss->module)
4624 continue;
4625
4509 BUG_ON(!ss->name); 4626 BUG_ON(!ss->name);
4510 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4627 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4511 BUG_ON(!ss->create); 4628 BUG_ON(!ss->create);
@@ -4538,9 +4655,12 @@ int __init cgroup_init(void)
4538 if (err) 4655 if (err)
4539 return err; 4656 return err;
4540 4657
4541 /* at bootup time, we don't worry about modular subsystems */ 4658 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4542 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4543 struct cgroup_subsys *ss = subsys[i]; 4659 struct cgroup_subsys *ss = subsys[i];
4660
4661 /* at bootup time, we don't worry about modular subsystems */
4662 if (!ss || ss->module)
4663 continue;
4544 if (!ss->early_init) 4664 if (!ss->early_init)
4545 cgroup_init_subsys(ss); 4665 cgroup_init_subsys(ss);
4546 if (ss->use_id) 4666 if (ss->use_id)
@@ -4735,13 +4855,16 @@ void cgroup_fork_callbacks(struct task_struct *child)
4735{ 4855{
4736 if (need_forkexit_callback) { 4856 if (need_forkexit_callback) {
4737 int i; 4857 int i;
4738 /* 4858 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4739 * forkexit callbacks are only supported for builtin
4740 * subsystems, and the builtin section of the subsys array is
4741 * immutable, so we don't need to lock the subsys array here.
4742 */
4743 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4744 struct cgroup_subsys *ss = subsys[i]; 4859 struct cgroup_subsys *ss = subsys[i];
4860
4861 /*
4862 * forkexit callbacks are only supported for
4863 * builtin subsystems.
4864 */
4865 if (!ss || ss->module)
4866 continue;
4867
4745 if (ss->fork) 4868 if (ss->fork)
4746 ss->fork(child); 4869 ss->fork(child);
4747 } 4870 }
@@ -4846,12 +4969,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4846 tsk->cgroups = &init_css_set; 4969 tsk->cgroups = &init_css_set;
4847 4970
4848 if (run_callbacks && need_forkexit_callback) { 4971 if (run_callbacks && need_forkexit_callback) {
4849 /* 4972 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4850 * modular subsystems can't use callbacks, so no need to lock
4851 * the subsys array
4852 */
4853 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4854 struct cgroup_subsys *ss = subsys[i]; 4973 struct cgroup_subsys *ss = subsys[i];
4974
4975 /* modular subsystems can't use callbacks */
4976 if (!ss || ss->module)
4977 continue;
4978
4855 if (ss->exit) { 4979 if (ss->exit) {
4856 struct cgroup *old_cgrp = 4980 struct cgroup *old_cgrp =
4857 rcu_dereference_raw(cg->subsys[i])->cgroup; 4981 rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5037,13 +5161,17 @@ static int __init cgroup_disable(char *str)
5037 while ((token = strsep(&str, ",")) != NULL) { 5161 while ((token = strsep(&str, ",")) != NULL) {
5038 if (!*token) 5162 if (!*token)
5039 continue; 5163 continue;
5040 /* 5164 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5041 * cgroup_disable, being at boot time, can't know about module
5042 * subsystems, so we don't worry about them.
5043 */
5044 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5045 struct cgroup_subsys *ss = subsys[i]; 5165 struct cgroup_subsys *ss = subsys[i];
5046 5166
5167 /*
5168 * cgroup_disable, being at boot time, can't
5169 * know about module subsystems, so we don't
5170 * worry about them.
5171 */
5172 if (!ss || ss->module)
5173 continue;
5174
5047 if (!strcmp(token, ss->name)) { 5175 if (!strcmp(token, ss->name)) {
5048 ss->disabled = 1; 5176 ss->disabled = 1;
5049 printk(KERN_INFO "Disabling %s control group" 5177 printk(KERN_INFO "Disabling %s control group"
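Note on the cgroup.c hunks above: they drop the CGROUP_BUILTIN_SUBSYS_COUNT bound and instead walk the whole subsys[] array, skipping empty slots and modular subsystems with "if (!ss || ss->module) continue;", which also lets cgroup_load_subsys() index the table directly by ss->subsys_id instead of hunting for a free slot. A minimal userspace sketch of that "iterate the full table, skip unregistered or modular entries" pattern follows; the types and names here are illustrative only, not the kernel's.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define SUBSYS_COUNT 8

struct subsys {
        const char *name;
        bool module;            /* true if provided by a loadable module */
};

/* Sparse registration table: NULL slots are unregistered subsystems. */
static struct subsys *subsys_table[SUBSYS_COUNT];

static void for_each_builtin(void (*fn)(struct subsys *))
{
        for (int i = 0; i < SUBSYS_COUNT; i++) {
                struct subsys *ss = subsys_table[i];

                /* Mirror the kernel pattern: skip holes and modular entries. */
                if (!ss || ss->module)
                        continue;
                fn(ss);
        }
}

static void show(struct subsys *ss)
{
        printf("builtin subsystem: %s\n", ss->name);
}

int main(void)
{
        static struct subsys cpu = { "cpu", false };
        static struct subsys net = { "net_prio", true };

        subsys_table[0] = &cpu;
        subsys_table[3] = &net;         /* modular: skipped by the loop */

        for_each_builtin(show);
        return 0;
}

Keeping the array fixed-size and indexed by subsys_id is what makes the "find the first empty slot" search removed in the hunk unnecessary.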
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 3649fc6b3eaa..b1724ce98981 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = {
373 .can_attach = freezer_can_attach, 373 .can_attach = freezer_can_attach,
374 .fork = freezer_fork, 374 .fork = freezer_fork,
375 .base_cftypes = files, 375 .base_cftypes = files,
376
377 /*
378 * freezer subsys doesn't handle hierarchy at all. Frozen state
379 * should be inherited through the hierarchy - if a parent is
380 * frozen, all its children should be frozen. Fix it and remove
381 * the following.
382 */
383 .broken_hierarchy = true,
376}; 384};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f560598807c1..42bd331ee0ab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,6 +80,10 @@ void put_online_cpus(void)
80 if (cpu_hotplug.active_writer == current) 80 if (cpu_hotplug.active_writer == current)
81 return; 81 return;
82 mutex_lock(&cpu_hotplug.lock); 82 mutex_lock(&cpu_hotplug.lock);
83
84 if (WARN_ON(!cpu_hotplug.refcount))
85 cpu_hotplug.refcount++; /* try to fix things up */
86
83 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) 87 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
84 wake_up_process(cpu_hotplug.active_writer); 88 wake_up_process(cpu_hotplug.active_writer);
85 mutex_unlock(&cpu_hotplug.lock); 89 mutex_unlock(&cpu_hotplug.lock);
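The put_online_cpus() change guards against a refcount underflow: if the count is already zero, a warning fires and the count is bumped before the decrement so it can never go negative. A hedged userspace sketch of that defensive "fix up, then decrement" pattern, with a plain mutex and fprintf standing in for the hotplug lock and WARN_ON:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount;

/* Illustrative only: mirrors the "warn, repair, then decrement" pattern. */
static void put_ref(void)
{
        pthread_mutex_lock(&lock);

        if (!refcount) {
                /* Unbalanced put: warn and repair so we never go negative. */
                fprintf(stderr, "warning: refcount underflow, fixing up\n");
                refcount++;
        }

        if (!--refcount)
                printf("last reference dropped\n");

        pthread_mutex_unlock(&lock);
}

int main(void)
{
        refcount = 1;
        put_ref();      /* balanced put */
        put_ref();      /* unbalanced put: triggers the warning path */
        return 0;
}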
diff --git a/kernel/cred.c b/kernel/cred.c
index de728ac50d82..48cea3da6d05 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -799,9 +799,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
799 atomic_read(&cred->usage), 799 atomic_read(&cred->usage),
800 read_cred_subscribers(cred)); 800 read_cred_subscribers(cred));
801 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", 801 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
802 cred->uid, cred->euid, cred->suid, cred->fsuid); 802 from_kuid_munged(&init_user_ns, cred->uid),
803 from_kuid_munged(&init_user_ns, cred->euid),
804 from_kuid_munged(&init_user_ns, cred->suid),
805 from_kuid_munged(&init_user_ns, cred->fsuid));
803 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", 806 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
804 cred->gid, cred->egid, cred->sgid, cred->fsgid); 807 from_kgid_munged(&init_user_ns, cred->gid),
808 from_kgid_munged(&init_user_ns, cred->egid),
809 from_kgid_munged(&init_user_ns, cred->sgid),
810 from_kgid_munged(&init_user_ns, cred->fsgid));
805#ifdef CONFIG_SECURITY 811#ifdef CONFIG_SECURITY
806 printk(KERN_ERR "CRED: ->security is %p\n", cred->security); 812 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
807 if ((unsigned long) cred->security >= PAGE_SIZE && 813 if ((unsigned long) cred->security >= PAGE_SIZE &&
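The cred.c hunk stops printing raw kuid_t/kgid_t values and converts them into the initial user namespace with from_kuid_munged()/from_kgid_munged() first. A rough userspace sketch of the underlying idea, translating an internal id into a namespace-relative value and falling back to an overflow id when no mapping exists; the mapping structure and function name here are hypothetical, not the kernel API:

#include <stdio.h>

#define OVERFLOW_UID 65534      /* reported when an id has no mapping */

/* Hypothetical mapping: namespace uid = internal uid - base, if in range. */
struct user_ns_sketch {
        unsigned int base;      /* first internal id owned by this namespace */
        unsigned int count;     /* number of ids mapped */
};

static unsigned int from_kuid_munged_sketch(const struct user_ns_sketch *ns,
                                            unsigned int internal_uid)
{
        if (internal_uid >= ns->base && internal_uid < ns->base + ns->count)
                return internal_uid - ns->base;
        return OVERFLOW_UID;    /* no mapping: report the overflow id */
}

int main(void)
{
        struct user_ns_sketch init_ns = { .base = 0, .count = 100000 };

        printf("uid 1000    -> %u\n", from_kuid_munged_sketch(&init_ns, 1000));
        printf("uid 4000000 -> %u\n", from_kuid_munged_sketch(&init_ns, 4000000));
        return 0;
}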
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0557f24c6bca..9a61738cefc8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
672{ 672{
673 struct kgdb_state kgdb_var; 673 struct kgdb_state kgdb_var;
674 struct kgdb_state *ks = &kgdb_var; 674 struct kgdb_state *ks = &kgdb_var;
675 int ret = 0;
676
677 if (arch_kgdb_ops.enable_nmi)
678 arch_kgdb_ops.enable_nmi(0);
675 679
676 ks->cpu = raw_smp_processor_id(); 680 ks->cpu = raw_smp_processor_id();
677 ks->ex_vector = evector; 681 ks->ex_vector = evector;
@@ -681,13 +685,33 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
681 ks->linux_regs = regs; 685 ks->linux_regs = regs;
682 686
683 if (kgdb_reenter_check(ks)) 687 if (kgdb_reenter_check(ks))
684 return 0; /* Ouch, double exception ! */ 688 goto out; /* Ouch, double exception ! */
685 if (kgdb_info[ks->cpu].enter_kgdb != 0) 689 if (kgdb_info[ks->cpu].enter_kgdb != 0)
686 return 0; 690 goto out;
687 691
688 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); 692 ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
693out:
694 if (arch_kgdb_ops.enable_nmi)
695 arch_kgdb_ops.enable_nmi(1);
696 return ret;
689} 697}
690 698
699/*
700 * GDB places a breakpoint at this function to know dynamically
701 * loaded objects. It's not defined static so that only one instance with this
702 * name exists in the kernel.
703 */
704
705static int module_event(struct notifier_block *self, unsigned long val,
706 void *data)
707{
708 return 0;
709}
710
711static struct notifier_block dbg_module_load_nb = {
712 .notifier_call = module_event,
713};
714
691int kgdb_nmicallback(int cpu, void *regs) 715int kgdb_nmicallback(int cpu, void *regs)
692{ 716{
693#ifdef CONFIG_SMP 717#ifdef CONFIG_SMP
@@ -816,6 +840,7 @@ static void kgdb_register_callbacks(void)
816 kgdb_arch_init(); 840 kgdb_arch_init();
817 if (!dbg_is_early) 841 if (!dbg_is_early)
818 kgdb_arch_late(); 842 kgdb_arch_late();
843 register_module_notifier(&dbg_module_load_nb);
819 register_reboot_notifier(&dbg_reboot_notifier); 844 register_reboot_notifier(&dbg_reboot_notifier);
820 atomic_notifier_chain_register(&panic_notifier_list, 845 atomic_notifier_chain_register(&panic_notifier_list,
821 &kgdb_panic_event_nb); 846 &kgdb_panic_event_nb);
@@ -839,6 +864,7 @@ static void kgdb_unregister_callbacks(void)
839 if (kgdb_io_module_registered) { 864 if (kgdb_io_module_registered) {
840 kgdb_io_module_registered = 0; 865 kgdb_io_module_registered = 0;
841 unregister_reboot_notifier(&dbg_reboot_notifier); 866 unregister_reboot_notifier(&dbg_reboot_notifier);
867 unregister_module_notifier(&dbg_module_load_nb);
842 atomic_notifier_chain_unregister(&panic_notifier_list, 868 atomic_notifier_chain_unregister(&panic_notifier_list,
843 &kgdb_panic_event_nb); 869 &kgdb_panic_event_nb);
844 kgdb_arch_exit(); 870 kgdb_arch_exit();
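kgdb_handle_exception() now disables NMI entry on the way in and re-enables it on every exit path by funnelling the early returns through a single out: label. A small userspace sketch of that single-exit cleanup pattern (names and the bool state are illustrative, not the arch_kgdb_ops hook):

#include <stdbool.h>
#include <stdio.h>

static bool nmi_enabled = true;

static void enable_nmi(bool on)
{
        nmi_enabled = on;
        printf("nmi %s\n", on ? "enabled" : "disabled");
}

/* All early exits funnel through "out" so disable/enable stays balanced. */
static int handle_exception(bool reentered, bool already_in_debugger)
{
        int ret = 0;

        enable_nmi(false);

        if (reentered)
                goto out;       /* double exception */
        if (already_in_debugger)
                goto out;

        ret = 1;                /* pretend we entered the debugger core */
out:
        enable_nmi(true);
        return ret;
}

int main(void)
{
        printf("ret=%d\n", handle_exception(true, false));
        printf("ret=%d\n", handle_exception(false, false));
        return 0;
}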
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 07c9bbb94a0b..b03e0e814e43 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv)
129 } 129 }
130 /* Now the inactive tasks */ 130 /* Now the inactive tasks */
131 kdb_do_each_thread(g, p) { 131 kdb_do_each_thread(g, p) {
132 if (KDB_FLAG(CMD_INTERRUPT))
133 return 0;
132 if (task_curr(p)) 134 if (task_curr(p))
133 continue; 135 continue;
134 if (kdb_bt1(p, mask, argcount, btaprompt)) 136 if (kdb_bt1(p, mask, argcount, btaprompt))
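The kdb_bt change adds an interrupt check inside the walk over inactive tasks so a long backtrace-all can be cut short. A tiny sketch of polling a "command interrupted" flag inside a long loop; here SIGINT stands in for the keyboard path that sets KDB_FLAG(CMD_INTERRUPT):

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t cmd_interrupt;

static void on_int(int sig)
{
        (void)sig;
        cmd_interrupt = 1;      /* set from the interrupt path */
}

int main(void)
{
        signal(SIGINT, on_int);

        for (long i = 0; i < 100000000L; i++) {
                if (cmd_interrupt) {    /* bail out early, like the kdb check */
                        printf("interrupted at %ld\n", i);
                        return 0;
                }
                /* ... per-task work would go here ... */
        }
        printf("completed\n");
        return 0;
}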
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 0a69d2adc4f3..14ff4849262c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap)
552{ 552{
553 int diag; 553 int diag;
554 int linecount; 554 int linecount;
555 int colcount;
555 int logging, saved_loglevel = 0; 556 int logging, saved_loglevel = 0;
556 int saved_trap_printk; 557 int saved_trap_printk;
557 int got_printf_lock = 0; 558 int got_printf_lock = 0;
@@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap)
584 if (diag || linecount <= 1) 585 if (diag || linecount <= 1)
585 linecount = 24; 586 linecount = 24;
586 587
588 diag = kdbgetintenv("COLUMNS", &colcount);
589 if (diag || colcount <= 1)
590 colcount = 80;
591
587 diag = kdbgetintenv("LOGGING", &logging); 592 diag = kdbgetintenv("LOGGING", &logging);
588 if (diag) 593 if (diag)
589 logging = 0; 594 logging = 0;
@@ -690,7 +695,7 @@ kdb_printit:
690 gdbstub_msg_write(kdb_buffer, retlen); 695 gdbstub_msg_write(kdb_buffer, retlen);
691 } else { 696 } else {
692 if (dbg_io_ops && !dbg_io_ops->is_console) { 697 if (dbg_io_ops && !dbg_io_ops->is_console) {
693 len = strlen(kdb_buffer); 698 len = retlen;
694 cp = kdb_buffer; 699 cp = kdb_buffer;
695 while (len--) { 700 while (len--) {
696 dbg_io_ops->write_char(*cp); 701 dbg_io_ops->write_char(*cp);
@@ -709,11 +714,29 @@ kdb_printit:
709 printk(KERN_INFO "%s", kdb_buffer); 714 printk(KERN_INFO "%s", kdb_buffer);
710 } 715 }
711 716
712 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) 717 if (KDB_STATE(PAGER)) {
713 kdb_nextline++; 718 /*
719 * Check printed string to decide how to bump the
720 * kdb_nextline to control when the more prompt should
721 * show up.
722 */
723 int got = 0;
724 len = retlen;
725 while (len--) {
726 if (kdb_buffer[len] == '\n') {
727 kdb_nextline++;
728 got = 0;
729 } else if (kdb_buffer[len] == '\r') {
730 got = 0;
731 } else {
732 got++;
733 }
734 }
735 kdb_nextline += got / (colcount + 1);
736 }
714 737
715 /* check for having reached the LINES number of printed lines */ 738 /* check for having reached the LINES number of printed lines */
716 if (kdb_nextline == linecount) { 739 if (kdb_nextline >= linecount) {
717 char buf1[16] = ""; 740 char buf1[16] = "";
718 741
719 /* Watch out for recursion here. Any routine that calls 742 /* Watch out for recursion here. Any routine that calls
@@ -765,7 +788,7 @@ kdb_printit:
765 kdb_grepping_flag = 0; 788 kdb_grepping_flag = 0;
766 kdb_printf("\n"); 789 kdb_printf("\n");
767 } else if (buf1[0] == ' ') { 790 } else if (buf1[0] == ' ') {
768 kdb_printf("\n"); 791 kdb_printf("\r");
769 suspend_grep = 1; /* for this recursion */ 792 suspend_grep = 1; /* for this recursion */
770 } else if (buf1[0] == '\n') { 793 } else if (buf1[0] == '\n') {
771 kdb_nextline = linecount - 1; 794 kdb_nextline = linecount - 1;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 31df1706b9a9..4d5f8d5612f3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -21,6 +21,7 @@
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/utsname.h> 22#include <linux/utsname.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/atomic.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
26#include <linux/init.h> 27#include <linux/init.h>
@@ -2100,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv)
2100 } 2101 }
2101 if (!lines--) 2102 if (!lines--)
2102 break; 2103 break;
2104 if (KDB_FLAG(CMD_INTERRUPT))
2105 return 0;
2103 2106
2104 kdb_printf("%.*s\n", (int)len - 1, buf); 2107 kdb_printf("%.*s\n", (int)len - 1, buf);
2105 } 2108 }
@@ -2107,6 +2110,32 @@ static int kdb_dmesg(int argc, const char **argv)
2107 return 0; 2110 return 0;
2108} 2111}
2109#endif /* CONFIG_PRINTK */ 2112#endif /* CONFIG_PRINTK */
2113
2114/* Make sure we balance enable/disable calls, must disable first. */
2115static atomic_t kdb_nmi_disabled;
2116
2117static int kdb_disable_nmi(int argc, const char *argv[])
2118{
2119 if (atomic_read(&kdb_nmi_disabled))
2120 return 0;
2121 atomic_set(&kdb_nmi_disabled, 1);
2122 arch_kgdb_ops.enable_nmi(0);
2123 return 0;
2124}
2125
2126static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
2127{
2128 if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
2129 return -EINVAL;
2130 arch_kgdb_ops.enable_nmi(1);
2131 return 0;
2132}
2133
2134static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
2135 .set = kdb_param_enable_nmi,
2136};
2137module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
2138
2110/* 2139/*
2111 * kdb_cpu - This function implements the 'cpu' command. 2140 * kdb_cpu - This function implements the 'cpu' command.
2112 * cpu [<cpunum>] 2141 * cpu [<cpunum>]
@@ -2851,6 +2880,10 @@ static void __init kdb_inittab(void)
2851 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2880 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
2852 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2881 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2853#endif 2882#endif
2883 if (arch_kgdb_ops.enable_nmi) {
2884 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
2885 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
2886 }
2854 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2887 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2855 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2888 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2856 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2889 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fd15593c7f54..dbccf83c134d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -471,14 +471,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
471{ 471{
472 struct perf_cgroup *cgrp; 472 struct perf_cgroup *cgrp;
473 struct cgroup_subsys_state *css; 473 struct cgroup_subsys_state *css;
474 struct file *file; 474 struct fd f = fdget(fd);
475 int ret = 0, fput_needed; 475 int ret = 0;
476 476
477 file = fget_light(fd, &fput_needed); 477 if (!f.file)
478 if (!file)
479 return -EBADF; 478 return -EBADF;
480 479
481 css = cgroup_css_from_dir(file, perf_subsys_id); 480 css = cgroup_css_from_dir(f.file, perf_subsys_id);
482 if (IS_ERR(css)) { 481 if (IS_ERR(css)) {
483 ret = PTR_ERR(css); 482 ret = PTR_ERR(css);
484 goto out; 483 goto out;
@@ -504,7 +503,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
504 ret = -EINVAL; 503 ret = -EINVAL;
505 } 504 }
506out: 505out:
507 fput_light(file, fput_needed); 506 fdput(f);
508 return ret; 507 return ret;
509} 508}
510 509
@@ -3237,21 +3236,18 @@ unlock:
3237 3236
3238static const struct file_operations perf_fops; 3237static const struct file_operations perf_fops;
3239 3238
3240static struct file *perf_fget_light(int fd, int *fput_needed) 3239static inline int perf_fget_light(int fd, struct fd *p)
3241{ 3240{
3242 struct file *file; 3241 struct fd f = fdget(fd);
3243 3242 if (!f.file)
3244 file = fget_light(fd, fput_needed); 3243 return -EBADF;
3245 if (!file)
3246 return ERR_PTR(-EBADF);
3247 3244
3248 if (file->f_op != &perf_fops) { 3245 if (f.file->f_op != &perf_fops) {
3249 fput_light(file, *fput_needed); 3246 fdput(f);
3250 *fput_needed = 0; 3247 return -EBADF;
3251 return ERR_PTR(-EBADF);
3252 } 3248 }
3253 3249 *p = f;
3254 return file; 3250 return 0;
3255} 3251}
3256 3252
3257static int perf_event_set_output(struct perf_event *event, 3253static int perf_event_set_output(struct perf_event *event,
@@ -3283,22 +3279,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3283 3279
3284 case PERF_EVENT_IOC_SET_OUTPUT: 3280 case PERF_EVENT_IOC_SET_OUTPUT:
3285 { 3281 {
3286 struct file *output_file = NULL;
3287 struct perf_event *output_event = NULL;
3288 int fput_needed = 0;
3289 int ret; 3282 int ret;
3290
3291 if (arg != -1) { 3283 if (arg != -1) {
3292 output_file = perf_fget_light(arg, &fput_needed); 3284 struct perf_event *output_event;
3293 if (IS_ERR(output_file)) 3285 struct fd output;
3294 return PTR_ERR(output_file); 3286 ret = perf_fget_light(arg, &output);
3295 output_event = output_file->private_data; 3287 if (ret)
3288 return ret;
3289 output_event = output.file->private_data;
3290 ret = perf_event_set_output(event, output_event);
3291 fdput(output);
3292 } else {
3293 ret = perf_event_set_output(event, NULL);
3296 } 3294 }
3297
3298 ret = perf_event_set_output(event, output_event);
3299 if (output_event)
3300 fput_light(output_file, fput_needed);
3301
3302 return ret; 3295 return ret;
3303 } 3296 }
3304 3297
@@ -3681,7 +3674,7 @@ unlock:
3681 atomic_inc(&event->mmap_count); 3674 atomic_inc(&event->mmap_count);
3682 mutex_unlock(&event->mmap_mutex); 3675 mutex_unlock(&event->mmap_mutex);
3683 3676
3684 vma->vm_flags |= VM_RESERVED; 3677 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3685 vma->vm_ops = &perf_mmap_vmops; 3678 vma->vm_ops = &perf_mmap_vmops;
3686 3679
3687 return ret; 3680 return ret;
@@ -6446,12 +6439,11 @@ SYSCALL_DEFINE5(perf_event_open,
6446 struct perf_event_attr attr; 6439 struct perf_event_attr attr;
6447 struct perf_event_context *ctx; 6440 struct perf_event_context *ctx;
6448 struct file *event_file = NULL; 6441 struct file *event_file = NULL;
6449 struct file *group_file = NULL; 6442 struct fd group = {NULL, 0};
6450 struct task_struct *task = NULL; 6443 struct task_struct *task = NULL;
6451 struct pmu *pmu; 6444 struct pmu *pmu;
6452 int event_fd; 6445 int event_fd;
6453 int move_group = 0; 6446 int move_group = 0;
6454 int fput_needed = 0;
6455 int err; 6447 int err;
6456 6448
6457 /* for future expandability... */ 6449 /* for future expandability... */
@@ -6481,17 +6473,15 @@ SYSCALL_DEFINE5(perf_event_open,
6481 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 6473 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6482 return -EINVAL; 6474 return -EINVAL;
6483 6475
6484 event_fd = get_unused_fd_flags(O_RDWR); 6476 event_fd = get_unused_fd();
6485 if (event_fd < 0) 6477 if (event_fd < 0)
6486 return event_fd; 6478 return event_fd;
6487 6479
6488 if (group_fd != -1) { 6480 if (group_fd != -1) {
6489 group_file = perf_fget_light(group_fd, &fput_needed); 6481 err = perf_fget_light(group_fd, &group);
6490 if (IS_ERR(group_file)) { 6482 if (err)
6491 err = PTR_ERR(group_file);
6492 goto err_fd; 6483 goto err_fd;
6493 } 6484 group_leader = group.file->private_data;
6494 group_leader = group_file->private_data;
6495 if (flags & PERF_FLAG_FD_OUTPUT) 6485 if (flags & PERF_FLAG_FD_OUTPUT)
6496 output_event = group_leader; 6486 output_event = group_leader;
6497 if (flags & PERF_FLAG_FD_NO_GROUP) 6487 if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6667,7 +6657,7 @@ SYSCALL_DEFINE5(perf_event_open,
6667 * of the group leader will find the pointer to itself in 6657 * of the group leader will find the pointer to itself in
6668 * perf_group_detach(). 6658 * perf_group_detach().
6669 */ 6659 */
6670 fput_light(group_file, fput_needed); 6660 fdput(group);
6671 fd_install(event_fd, event_file); 6661 fd_install(event_fd, event_file);
6672 return event_fd; 6662 return event_fd;
6673 6663
@@ -6681,7 +6671,7 @@ err_task:
6681 if (task) 6671 if (task)
6682 put_task_struct(task); 6672 put_task_struct(task);
6683err_group_fd: 6673err_group_fd:
6684 fput_light(group_file, fput_needed); 6674 fdput(group);
6685err_fd: 6675err_fd:
6686 put_unused_fd(event_fd); 6676 put_unused_fd(event_fd);
6687 return err; 6677 return err;
@@ -7506,5 +7496,12 @@ struct cgroup_subsys perf_subsys = {
7506 .destroy = perf_cgroup_destroy, 7496 .destroy = perf_cgroup_destroy,
7507 .exit = perf_cgroup_exit, 7497 .exit = perf_cgroup_exit,
7508 .attach = perf_cgroup_attach, 7498 .attach = perf_cgroup_attach,
7499
7500 /*
7501 * perf_event cgroup doesn't handle nesting correctly.
7502 * ctx->nr_cgroups adjustments should be propagated through the
7503 * cgroup hierarchy. Fix it and remove the following.
7504 */
7505 .broken_hierarchy = true,
7509}; 7506};
7510#endif /* CONFIG_CGROUP_PERF */ 7507#endif /* CONFIG_CGROUP_PERF */
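The perf changes convert fget_light()/fput_light() plus a separate fput_needed flag into the struct fd / fdget() / fdput() API, so the "do we need to drop a reference" bit travels inside the handle itself. A userspace sketch of that idea with a hypothetical handle type; the real struct fd packs the flag into the low bits rather than a separate field:

#include <stdbool.h>
#include <stdio.h>

struct file { int refcount; };

/* Hypothetical handle: the file pointer plus "must we put it" in one object. */
struct fd_sketch {
        struct file *file;
        bool need_put;
};

static struct fd_sketch fdget_sketch(struct file *f, bool shared_table)
{
        struct fd_sketch out = { .file = f, .need_put = false };

        /* Only take (and later drop) a reference when the table is shared. */
        if (f && shared_table) {
                f->refcount++;
                out.need_put = true;
        }
        return out;
}

static void fdput_sketch(struct fd_sketch fd)
{
        if (fd.need_put)
                fd.file->refcount--;
}

int main(void)
{
        struct file f = { .refcount = 1 };
        struct fd_sketch fd = fdget_sketch(&f, true);

        if (!fd.file)
                return 1;
        /* ... use fd.file ... */
        fdput_sketch(fd);

        printf("refcount back to %d\n", f.refcount);
        return 0;
}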
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 912ef48d28ab..5cc4e7e42e68 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -78,15 +78,23 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
78 */ 78 */
79static atomic_t uprobe_events = ATOMIC_INIT(0); 79static atomic_t uprobe_events = ATOMIC_INIT(0);
80 80
81/* Have a copy of original instruction */
82#define UPROBE_COPY_INSN 0
 83/* Don't run handlers while the first register / last unregister is in progress */
84#define UPROBE_RUN_HANDLER 1
85/* Can skip singlestep */
86#define UPROBE_SKIP_SSTEP 2
87
81struct uprobe { 88struct uprobe {
82 struct rb_node rb_node; /* node in the rb tree */ 89 struct rb_node rb_node; /* node in the rb tree */
83 atomic_t ref; 90 atomic_t ref;
84 struct rw_semaphore consumer_rwsem; 91 struct rw_semaphore consumer_rwsem;
92 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
85 struct list_head pending_list; 93 struct list_head pending_list;
86 struct uprobe_consumer *consumers; 94 struct uprobe_consumer *consumers;
87 struct inode *inode; /* Also hold a ref to inode */ 95 struct inode *inode; /* Also hold a ref to inode */
88 loff_t offset; 96 loff_t offset;
89 int flags; 97 unsigned long flags;
90 struct arch_uprobe arch; 98 struct arch_uprobe arch;
91}; 99};
92 100
@@ -100,17 +108,12 @@ struct uprobe {
100 */ 108 */
101static bool valid_vma(struct vm_area_struct *vma, bool is_register) 109static bool valid_vma(struct vm_area_struct *vma, bool is_register)
102{ 110{
103 if (!vma->vm_file) 111 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
104 return false;
105
106 if (!is_register)
107 return true;
108 112
109 if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) 113 if (is_register)
110 == (VM_READ|VM_EXEC)) 114 flags |= VM_WRITE;
111 return true;
112 115
113 return false; 116 return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
114} 117}
115 118
116static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) 119static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
@@ -141,10 +144,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
141 spinlock_t *ptl; 144 spinlock_t *ptl;
142 pte_t *ptep; 145 pte_t *ptep;
143 int err; 146 int err;
147 /* For mmu_notifiers */
148 const unsigned long mmun_start = addr;
149 const unsigned long mmun_end = addr + PAGE_SIZE;
144 150
145 /* For try_to_free_swap() and munlock_vma_page() below */ 151 /* For try_to_free_swap() and munlock_vma_page() below */
146 lock_page(page); 152 lock_page(page);
147 153
154 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
148 err = -EAGAIN; 155 err = -EAGAIN;
149 ptep = page_check_address(page, mm, addr, &ptl, 0); 156 ptep = page_check_address(page, mm, addr, &ptl, 0);
150 if (!ptep) 157 if (!ptep)
@@ -173,6 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
173 180
174 err = 0; 181 err = 0;
175 unlock: 182 unlock:
183 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
176 unlock_page(page); 184 unlock_page(page);
177 return err; 185 return err;
178} 186}
@@ -188,19 +196,44 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
188 return *insn == UPROBE_SWBP_INSN; 196 return *insn == UPROBE_SWBP_INSN;
189} 197}
190 198
199static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
200{
201 void *kaddr = kmap_atomic(page);
202 memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
203 kunmap_atomic(kaddr);
204}
205
206static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
207{
208 uprobe_opcode_t old_opcode;
209 bool is_swbp;
210
211 copy_opcode(page, vaddr, &old_opcode);
212 is_swbp = is_swbp_insn(&old_opcode);
213
214 if (is_swbp_insn(new_opcode)) {
215 if (is_swbp) /* register: already installed? */
216 return 0;
217 } else {
218 if (!is_swbp) /* unregister: was it changed by us? */
219 return 0;
220 }
221
222 return 1;
223}
224
191/* 225/*
192 * NOTE: 226 * NOTE:
193 * Expect the breakpoint instruction to be the smallest size instruction for 227 * Expect the breakpoint instruction to be the smallest size instruction for
194 * the architecture. If an arch has variable length instruction and the 228 * the architecture. If an arch has variable length instruction and the
195 * breakpoint instruction is not of the smallest length instruction 229 * breakpoint instruction is not of the smallest length instruction
196 * supported by that architecture then we need to modify read_opcode / 230 * supported by that architecture then we need to modify is_swbp_at_addr and
197 * write_opcode accordingly. This would never be a problem for archs that 231 * write_opcode accordingly. This would never be a problem for archs that
198 * have fixed length instructions. 232 * have fixed length instructions.
199 */ 233 */
200 234
201/* 235/*
202 * write_opcode - write the opcode at a given virtual address. 236 * write_opcode - write the opcode at a given virtual address.
203 * @auprobe: arch breakpointing information.
204 * @mm: the probed process address space. 237 * @mm: the probed process address space.
205 * @vaddr: the virtual address to store the opcode. 238 * @vaddr: the virtual address to store the opcode.
206 * @opcode: opcode to be written at @vaddr. 239 * @opcode: opcode to be written at @vaddr.
@@ -211,8 +244,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
211 * For mm @mm, write the opcode at @vaddr. 244 * For mm @mm, write the opcode at @vaddr.
212 * Return 0 (success) or a negative errno. 245 * Return 0 (success) or a negative errno.
213 */ 246 */
214static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, 247static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
215 unsigned long vaddr, uprobe_opcode_t opcode) 248 uprobe_opcode_t opcode)
216{ 249{
217 struct page *old_page, *new_page; 250 struct page *old_page, *new_page;
218 void *vaddr_old, *vaddr_new; 251 void *vaddr_old, *vaddr_new;
@@ -221,10 +254,14 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
221 254
222retry: 255retry:
223 /* Read the page with vaddr into memory */ 256 /* Read the page with vaddr into memory */
224 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 257 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
225 if (ret <= 0) 258 if (ret <= 0)
226 return ret; 259 return ret;
227 260
261 ret = verify_opcode(old_page, vaddr, &opcode);
262 if (ret <= 0)
263 goto put_old;
264
228 ret = -ENOMEM; 265 ret = -ENOMEM;
229 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 266 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
230 if (!new_page) 267 if (!new_page)
@@ -259,63 +296,6 @@ put_old:
259} 296}
260 297
261/** 298/**
262 * read_opcode - read the opcode at a given virtual address.
263 * @mm: the probed process address space.
264 * @vaddr: the virtual address to read the opcode.
265 * @opcode: location to store the read opcode.
266 *
267 * Called with mm->mmap_sem held (for read and with a reference to
268 * mm.
269 *
270 * For mm @mm, read the opcode at @vaddr and store it in @opcode.
271 * Return 0 (success) or a negative errno.
272 */
273static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
274{
275 struct page *page;
276 void *vaddr_new;
277 int ret;
278
279 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
280 if (ret <= 0)
281 return ret;
282
283 vaddr_new = kmap_atomic(page);
284 vaddr &= ~PAGE_MASK;
285 memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
286 kunmap_atomic(vaddr_new);
287
288 put_page(page);
289
290 return 0;
291}
292
293static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
294{
295 uprobe_opcode_t opcode;
296 int result;
297
298 if (current->mm == mm) {
299 pagefault_disable();
300 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
301 sizeof(opcode));
302 pagefault_enable();
303
304 if (likely(result == 0))
305 goto out;
306 }
307
308 result = read_opcode(mm, vaddr, &opcode);
309 if (result)
310 return result;
311out:
312 if (is_swbp_insn(&opcode))
313 return 1;
314
315 return 0;
316}
317
318/**
319 * set_swbp - store breakpoint at a given address. 299 * set_swbp - store breakpoint at a given address.
320 * @auprobe: arch specific probepoint information. 300 * @auprobe: arch specific probepoint information.
321 * @mm: the probed process address space. 301 * @mm: the probed process address space.
@@ -326,18 +306,7 @@ out:
326 */ 306 */
327int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 307int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
328{ 308{
329 int result; 309 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
330 /*
331 * See the comment near uprobes_hash().
332 */
333 result = is_swbp_at_addr(mm, vaddr);
334 if (result == 1)
335 return 0;
336
337 if (result)
338 return result;
339
340 return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
341} 310}
342 311
343/** 312/**
@@ -352,16 +321,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
352int __weak 321int __weak
353set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 322set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
354{ 323{
355 int result; 324 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
356
357 result = is_swbp_at_addr(mm, vaddr);
358 if (!result)
359 return -EINVAL;
360
361 if (result != 1)
362 return result;
363
364 return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
365} 325}
366 326
367static int match_uprobe(struct uprobe *l, struct uprobe *r) 327static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -468,7 +428,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
468 spin_unlock(&uprobes_treelock); 428 spin_unlock(&uprobes_treelock);
469 429
470 /* For now assume that the instruction need not be single-stepped */ 430 /* For now assume that the instruction need not be single-stepped */
471 uprobe->flags |= UPROBE_SKIP_SSTEP; 431 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
472 432
473 return u; 433 return u;
474} 434}
@@ -490,6 +450,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
490 uprobe->inode = igrab(inode); 450 uprobe->inode = igrab(inode);
491 uprobe->offset = offset; 451 uprobe->offset = offset;
492 init_rwsem(&uprobe->consumer_rwsem); 452 init_rwsem(&uprobe->consumer_rwsem);
453 mutex_init(&uprobe->copy_mutex);
493 454
494 /* add to uprobes_tree, sorted on inode:offset */ 455 /* add to uprobes_tree, sorted on inode:offset */
495 cur_uprobe = insert_uprobe(uprobe); 456 cur_uprobe = insert_uprobe(uprobe);
@@ -510,7 +471,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
510{ 471{
511 struct uprobe_consumer *uc; 472 struct uprobe_consumer *uc;
512 473
513 if (!(uprobe->flags & UPROBE_RUN_HANDLER)) 474 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
514 return; 475 return;
515 476
516 down_read(&uprobe->consumer_rwsem); 477 down_read(&uprobe->consumer_rwsem);
@@ -616,29 +577,43 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
616 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); 577 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
617} 578}
618 579
619/* 580static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
620 * How mm->uprobes_state.count gets updated 581 struct mm_struct *mm, unsigned long vaddr)
621 * uprobe_mmap() increments the count if 582{
622 * - it successfully adds a breakpoint. 583 int ret = 0;
623 * - it cannot add a breakpoint, but sees that there is a underlying 584
624 * breakpoint (via a is_swbp_at_addr()). 585 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
625 * 586 return ret;
626 * uprobe_munmap() decrements the count if 587
627 * - it sees a underlying breakpoint, (via is_swbp_at_addr) 588 mutex_lock(&uprobe->copy_mutex);
628 * (Subsequent uprobe_unregister wouldnt find the breakpoint 589 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
629 * unless a uprobe_mmap kicks in, since the old vma would be 590 goto out;
630 * dropped just after uprobe_munmap.) 591
631 * 592 ret = copy_insn(uprobe, file);
632 * uprobe_register increments the count if: 593 if (ret)
633 * - it successfully adds a breakpoint. 594 goto out;
634 * 595
635 * uprobe_unregister decrements the count if: 596 ret = -ENOTSUPP;
636 * - it sees a underlying breakpoint and removes successfully. 597 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
637 * (via is_swbp_at_addr) 598 goto out;
638 * (Subsequent uprobe_munmap wouldnt find the breakpoint 599
639 * since there is no underlying breakpoint after the 600 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
640 * breakpoint removal.) 601 if (ret)
641 */ 602 goto out;
603
604 /* write_opcode() assumes we don't cross page boundary */
605 BUG_ON((uprobe->offset & ~PAGE_MASK) +
606 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
607
608 smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
609 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
610
611 out:
612 mutex_unlock(&uprobe->copy_mutex);
613
614 return ret;
615}
616
642static int 617static int
643install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, 618install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
644 struct vm_area_struct *vma, unsigned long vaddr) 619 struct vm_area_struct *vma, unsigned long vaddr)
@@ -656,24 +631,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
656 if (!uprobe->consumers) 631 if (!uprobe->consumers)
657 return 0; 632 return 0;
658 633
659 if (!(uprobe->flags & UPROBE_COPY_INSN)) { 634 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
660 ret = copy_insn(uprobe, vma->vm_file); 635 if (ret)
661 if (ret) 636 return ret;
662 return ret;
663
664 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
665 return -ENOTSUPP;
666
667 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
668 if (ret)
669 return ret;
670
671 /* write_opcode() assumes we don't cross page boundary */
672 BUG_ON((uprobe->offset & ~PAGE_MASK) +
673 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
674
675 uprobe->flags |= UPROBE_COPY_INSN;
676 }
677 637
678 /* 638 /*
679 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), 639 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
@@ -692,15 +652,15 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
692 return ret; 652 return ret;
693} 653}
694 654
695static void 655static int
696remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) 656remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
697{ 657{
698 /* can happen if uprobe_register() fails */ 658 /* can happen if uprobe_register() fails */
699 if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) 659 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
700 return; 660 return 0;
701 661
702 set_bit(MMF_RECALC_UPROBES, &mm->flags); 662 set_bit(MMF_RECALC_UPROBES, &mm->flags);
703 set_orig_insn(&uprobe->arch, mm, vaddr); 663 return set_orig_insn(&uprobe->arch, mm, vaddr);
704} 664}
705 665
706/* 666/*
@@ -735,7 +695,6 @@ static struct map_info *
735build_map_info(struct address_space *mapping, loff_t offset, bool is_register) 695build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
736{ 696{
737 unsigned long pgoff = offset >> PAGE_SHIFT; 697 unsigned long pgoff = offset >> PAGE_SHIFT;
738 struct prio_tree_iter iter;
739 struct vm_area_struct *vma; 698 struct vm_area_struct *vma;
740 struct map_info *curr = NULL; 699 struct map_info *curr = NULL;
741 struct map_info *prev = NULL; 700 struct map_info *prev = NULL;
@@ -744,7 +703,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
744 703
745 again: 704 again:
746 mutex_lock(&mapping->i_mmap_mutex); 705 mutex_lock(&mapping->i_mmap_mutex);
747 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 706 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
748 if (!valid_vma(vma, is_register)) 707 if (!valid_vma(vma, is_register))
749 continue; 708 continue;
750 709
@@ -816,7 +775,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
816 struct mm_struct *mm = info->mm; 775 struct mm_struct *mm = info->mm;
817 struct vm_area_struct *vma; 776 struct vm_area_struct *vma;
818 777
819 if (err) 778 if (err && is_register)
820 goto free; 779 goto free;
821 780
822 down_write(&mm->mmap_sem); 781 down_write(&mm->mmap_sem);
@@ -832,7 +791,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
832 if (is_register) 791 if (is_register)
833 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 792 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
834 else 793 else
835 remove_breakpoint(uprobe, mm, info->vaddr); 794 err |= remove_breakpoint(uprobe, mm, info->vaddr);
836 795
837 unlock: 796 unlock:
838 up_write(&mm->mmap_sem); 797 up_write(&mm->mmap_sem);
@@ -889,13 +848,15 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
889 mutex_lock(uprobes_hash(inode)); 848 mutex_lock(uprobes_hash(inode));
890 uprobe = alloc_uprobe(inode, offset); 849 uprobe = alloc_uprobe(inode, offset);
891 850
892 if (uprobe && !consumer_add(uprobe, uc)) { 851 if (!uprobe) {
852 ret = -ENOMEM;
853 } else if (!consumer_add(uprobe, uc)) {
893 ret = __uprobe_register(uprobe); 854 ret = __uprobe_register(uprobe);
894 if (ret) { 855 if (ret) {
895 uprobe->consumers = NULL; 856 uprobe->consumers = NULL;
896 __uprobe_unregister(uprobe); 857 __uprobe_unregister(uprobe);
897 } else { 858 } else {
898 uprobe->flags |= UPROBE_RUN_HANDLER; 859 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
899 } 860 }
900 } 861 }
901 862
@@ -928,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
928 if (consumer_del(uprobe, uc)) { 889 if (consumer_del(uprobe, uc)) {
929 if (!uprobe->consumers) { 890 if (!uprobe->consumers) {
930 __uprobe_unregister(uprobe); 891 __uprobe_unregister(uprobe);
931 uprobe->flags &= ~UPROBE_RUN_HANDLER; 892 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
932 } 893 }
933 } 894 }
934 895
@@ -1389,10 +1350,11 @@ bool uprobe_deny_signal(void)
1389 */ 1350 */
1390static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) 1351static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1391{ 1352{
1392 if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) 1353 if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1393 return true; 1354 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1394 1355 return true;
1395 uprobe->flags &= ~UPROBE_SKIP_SSTEP; 1356 clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1357 }
1396 return false; 1358 return false;
1397} 1359}
1398 1360
@@ -1415,6 +1377,30 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1415 clear_bit(MMF_HAS_UPROBES, &mm->flags); 1377 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1416} 1378}
1417 1379
1380static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
1381{
1382 struct page *page;
1383 uprobe_opcode_t opcode;
1384 int result;
1385
1386 pagefault_disable();
1387 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
1388 sizeof(opcode));
1389 pagefault_enable();
1390
1391 if (likely(result == 0))
1392 goto out;
1393
1394 result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
1395 if (result < 0)
1396 return result;
1397
1398 copy_opcode(page, vaddr, &opcode);
1399 put_page(page);
1400 out:
1401 return is_swbp_insn(&opcode);
1402}
1403
1418static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) 1404static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1419{ 1405{
1420 struct mm_struct *mm = current->mm; 1406 struct mm_struct *mm = current->mm;
@@ -1485,38 +1471,41 @@ static void handle_swbp(struct pt_regs *regs)
1485 } 1471 }
1486 return; 1472 return;
1487 } 1473 }
1474 /*
1475 * TODO: move copy_insn/etc into _register and remove this hack.
1476 * After we hit the bp, _unregister + _register can install the
1477 * new and not-yet-analyzed uprobe at the same address, restart.
1478 */
1479 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1480 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1481 goto restart;
1488 1482
1489 utask = current->utask; 1483 utask = current->utask;
1490 if (!utask) { 1484 if (!utask) {
1491 utask = add_utask(); 1485 utask = add_utask();
1492 /* Cannot allocate; re-execute the instruction. */ 1486 /* Cannot allocate; re-execute the instruction. */
1493 if (!utask) 1487 if (!utask)
1494 goto cleanup_ret; 1488 goto restart;
1495 } 1489 }
1496 utask->active_uprobe = uprobe; 1490
1497 handler_chain(uprobe, regs); 1491 handler_chain(uprobe, regs);
1498 if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) 1492 if (can_skip_sstep(uprobe, regs))
1499 goto cleanup_ret; 1493 goto out;
1500 1494
1501 utask->state = UTASK_SSTEP;
1502 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1495 if (!pre_ssout(uprobe, regs, bp_vaddr)) {
1503 arch_uprobe_enable_step(&uprobe->arch); 1496 arch_uprobe_enable_step(&uprobe->arch);
1497 utask->active_uprobe = uprobe;
1498 utask->state = UTASK_SSTEP;
1504 return; 1499 return;
1505 } 1500 }
1506 1501
1507cleanup_ret: 1502restart:
1508 if (utask) { 1503 /*
1509 utask->active_uprobe = NULL; 1504 * cannot singlestep; cannot skip instruction;
1510 utask->state = UTASK_RUNNING; 1505 * re-execute the instruction.
1511 } 1506 */
1512 if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) 1507 instruction_pointer_set(regs, bp_vaddr);
1513 1508out:
1514 /*
1515 * cannot singlestep; cannot skip instruction;
1516 * re-execute the instruction.
1517 */
1518 instruction_pointer_set(regs, bp_vaddr);
1519
1520 put_uprobe(uprobe); 1509 put_uprobe(uprobe);
1521} 1510}
1522 1511
@@ -1548,13 +1537,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1548} 1537}
1549 1538
1550/* 1539/*
1551 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on 1540 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
1552 * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and 1541 * allows the thread to return from interrupt. After that handle_swbp()
1553 * allows the thread to return from interrupt. 1542 * sets utask->active_uprobe.
1554 * 1543 *
1555 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and 1544 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
1556 * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from 1545 * and allows the thread to return from interrupt.
1557 * interrupt.
1558 * 1546 *
1559 * While returning to userspace, thread notices the TIF_UPROBE flag and calls 1547 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1560 * uprobe_notify_resume(). 1548 * uprobe_notify_resume().
@@ -1563,11 +1551,13 @@ void uprobe_notify_resume(struct pt_regs *regs)
1563{ 1551{
1564 struct uprobe_task *utask; 1552 struct uprobe_task *utask;
1565 1553
1554 clear_thread_flag(TIF_UPROBE);
1555
1566 utask = current->utask; 1556 utask = current->utask;
1567 if (!utask || utask->state == UTASK_BP_HIT) 1557 if (utask && utask->active_uprobe)
1568 handle_swbp(regs);
1569 else
1570 handle_singlestep(utask, regs); 1558 handle_singlestep(utask, regs);
1559 else
1560 handle_swbp(regs);
1571} 1561}
1572 1562
1573/* 1563/*
@@ -1576,17 +1566,10 @@ void uprobe_notify_resume(struct pt_regs *regs)
1576 */ 1566 */
1577int uprobe_pre_sstep_notifier(struct pt_regs *regs) 1567int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1578{ 1568{
1579 struct uprobe_task *utask;
1580
1581 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) 1569 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
1582 return 0; 1570 return 0;
1583 1571
1584 utask = current->utask;
1585 if (utask)
1586 utask->state = UTASK_BP_HIT;
1587
1588 set_thread_flag(TIF_UPROBE); 1572 set_thread_flag(TIF_UPROBE);
1589
1590 return 1; 1573 return 1;
1591} 1574}
1592 1575
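The uprobes rework moves per-uprobe state from an int manipulated with |= and &= to named bit numbers accessed through test_bit()/set_bit()/clear_bit() on an unsigned long, making the individual flag updates atomic with respect to each other. A userspace approximation of that pattern using GCC/Clang atomic builtins; the kernel helpers have different names and stronger guarantees, so treat this as a sketch only:

#include <stdio.h>

/* Bit numbers, mirroring UPROBE_COPY_INSN / UPROBE_RUN_HANDLER / ... */
enum { COPY_INSN = 0, RUN_HANDLER = 1, SKIP_SSTEP = 2 };

static unsigned long flags;

static void set_bit_sketch(int nr, unsigned long *addr)
{
        __atomic_fetch_or(addr, 1UL << nr, __ATOMIC_SEQ_CST);
}

static void clear_bit_sketch(int nr, unsigned long *addr)
{
        __atomic_fetch_and(addr, ~(1UL << nr), __ATOMIC_SEQ_CST);
}

static int test_bit_sketch(int nr, const unsigned long *addr)
{
        return (__atomic_load_n(addr, __ATOMIC_SEQ_CST) >> nr) & 1;
}

int main(void)
{
        set_bit_sketch(SKIP_SSTEP, &flags);     /* default, as in insert_uprobe() */
        set_bit_sketch(RUN_HANDLER, &flags);

        if (test_bit_sketch(RUN_HANDLER, &flags))
                printf("handlers enabled\n");

        clear_bit_sketch(RUN_HANDLER, &flags);
        printf("flags now %#lx\n", flags);
        return 0;
}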
diff --git a/kernel/exit.c b/kernel/exit.c
index f65345f9e5bb..346616c0092c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -457,108 +457,13 @@ void daemonize(const char *name, ...)
457 /* Become as one with the init task */ 457 /* Become as one with the init task */
458 458
459 daemonize_fs_struct(); 459 daemonize_fs_struct();
460 exit_files(current); 460 daemonize_descriptors();
461 current->files = init_task.files;
462 atomic_inc(&current->files->count);
463 461
464 reparent_to_kthreadd(); 462 reparent_to_kthreadd();
465} 463}
466 464
467EXPORT_SYMBOL(daemonize); 465EXPORT_SYMBOL(daemonize);
468 466
469static void close_files(struct files_struct * files)
470{
471 int i, j;
472 struct fdtable *fdt;
473
474 j = 0;
475
476 /*
477 * It is safe to dereference the fd table without RCU or
478 * ->file_lock because this is the last reference to the
479 * files structure. But use RCU to shut RCU-lockdep up.
480 */
481 rcu_read_lock();
482 fdt = files_fdtable(files);
483 rcu_read_unlock();
484 for (;;) {
485 unsigned long set;
486 i = j * BITS_PER_LONG;
487 if (i >= fdt->max_fds)
488 break;
489 set = fdt->open_fds[j++];
490 while (set) {
491 if (set & 1) {
492 struct file * file = xchg(&fdt->fd[i], NULL);
493 if (file) {
494 filp_close(file, files);
495 cond_resched();
496 }
497 }
498 i++;
499 set >>= 1;
500 }
501 }
502}
503
504struct files_struct *get_files_struct(struct task_struct *task)
505{
506 struct files_struct *files;
507
508 task_lock(task);
509 files = task->files;
510 if (files)
511 atomic_inc(&files->count);
512 task_unlock(task);
513
514 return files;
515}
516
517void put_files_struct(struct files_struct *files)
518{
519 struct fdtable *fdt;
520
521 if (atomic_dec_and_test(&files->count)) {
522 close_files(files);
523 /*
524 * Free the fd and fdset arrays if we expanded them.
525 * If the fdtable was embedded, pass files for freeing
526 * at the end of the RCU grace period. Otherwise,
527 * you can free files immediately.
528 */
529 rcu_read_lock();
530 fdt = files_fdtable(files);
531 if (fdt != &files->fdtab)
532 kmem_cache_free(files_cachep, files);
533 free_fdtable(fdt);
534 rcu_read_unlock();
535 }
536}
537
538void reset_files_struct(struct files_struct *files)
539{
540 struct task_struct *tsk = current;
541 struct files_struct *old;
542
543 old = tsk->files;
544 task_lock(tsk);
545 tsk->files = files;
546 task_unlock(tsk);
547 put_files_struct(old);
548}
549
550void exit_files(struct task_struct *tsk)
551{
552 struct files_struct * files = tsk->files;
553
554 if (files) {
555 task_lock(tsk);
556 tsk->files = NULL;
557 task_unlock(tsk);
558 put_files_struct(files);
559 }
560}
561
562#ifdef CONFIG_MM_OWNER 467#ifdef CONFIG_MM_OWNER
563/* 468/*
564 * A task is exiting. If it owned this mm, find a new owner for the mm. 469 * A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -1046,6 +951,9 @@ void do_exit(long code)
1046 if (tsk->splice_pipe) 951 if (tsk->splice_pipe)
1047 __free_pipe_info(tsk->splice_pipe); 952 __free_pipe_info(tsk->splice_pipe);
1048 953
954 if (tsk->task_frag.page)
955 put_page(tsk->task_frag.page);
956
1049 validate_creds_for_do_exit(tsk); 957 validate_creds_for_do_exit(tsk);
1050 958
1051 preempt_disable(); 959 preempt_disable();
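The block removed from exit.c above (close_files() and its helpers now live with the fd-table code) walked the open-fd bitmap one long at a time and closed every descriptor whose bit was set. A small userspace sketch of that bitmap walk, simplified: no RCU, no fdtable indirection, printf in place of filp_close():

#include <stdio.h>

#define BITS_PER_LONG_SKETCH (8 * (int)sizeof(unsigned long))
#define MAX_FDS 128

static unsigned long open_fds[MAX_FDS / BITS_PER_LONG_SKETCH];

static void mark_open(int fd)
{
        open_fds[fd / BITS_PER_LONG_SKETCH] |= 1UL << (fd % BITS_PER_LONG_SKETCH);
}

/* Walk the bitmap word by word, acting on each set bit, as close_files() did. */
static void close_all(void)
{
        int j = 0;

        for (;;) {
                int i = j * BITS_PER_LONG_SKETCH;
                unsigned long set;

                if (i >= MAX_FDS)
                        break;
                set = open_fds[j++];
                while (set) {
                        if (set & 1)
                                printf("closing fd %d\n", i);
                        i++;
                        set >>= 1;
                }
        }
}

int main(void)
{
        mark_open(0);
        mark_open(3);
        mark_open(65);
        close_all();
        return 0;
}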
diff --git a/kernel/fork.c b/kernel/fork.c
index 5a0e74d89a5a..8b20ab7d3aa2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
330 tsk->btrace_seq = 0; 330 tsk->btrace_seq = 0;
331#endif 331#endif
332 tsk->splice_pipe = NULL; 332 tsk->splice_pipe = NULL;
333 tsk->task_frag.page = NULL;
333 334
334 account_kernel_stack(ti, 1); 335 account_kernel_stack(ti, 1);
335 336
@@ -422,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
422 mapping->i_mmap_writable++; 423 mapping->i_mmap_writable++;
423 flush_dcache_mmap_lock(mapping); 424 flush_dcache_mmap_lock(mapping);
424 /* insert tmp into the share list, just after mpnt */ 425 /* insert tmp into the share list, just after mpnt */
425 vma_prio_tree_add(tmp, mpnt); 426 if (unlikely(tmp->vm_flags & VM_NONLINEAR))
427 vma_nonlinear_insert(tmp,
428 &mapping->i_mmap_nonlinear);
429 else
430 vma_interval_tree_insert_after(tmp, mpnt,
431 &mapping->i_mmap);
426 flush_dcache_mmap_unlock(mapping); 432 flush_dcache_mmap_unlock(mapping);
427 mutex_unlock(&mapping->i_mmap_mutex); 433 mutex_unlock(&mapping->i_mmap_mutex);
428 } 434 }
@@ -621,26 +627,6 @@ void mmput(struct mm_struct *mm)
621} 627}
622EXPORT_SYMBOL_GPL(mmput); 628EXPORT_SYMBOL_GPL(mmput);
623 629
624/*
625 * We added or removed a vma mapping the executable. The vmas are only mapped
626 * during exec and are not mapped with the mmap system call.
627 * Callers must hold down_write() on the mm's mmap_sem for these
628 */
629void added_exe_file_vma(struct mm_struct *mm)
630{
631 mm->num_exe_file_vmas++;
632}
633
634void removed_exe_file_vma(struct mm_struct *mm)
635{
636 mm->num_exe_file_vmas--;
637 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
638 fput(mm->exe_file);
639 mm->exe_file = NULL;
640 }
641
642}
643
644void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 630void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
645{ 631{
646 if (new_exe_file) 632 if (new_exe_file)
@@ -648,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
648 if (mm->exe_file) 634 if (mm->exe_file)
649 fput(mm->exe_file); 635 fput(mm->exe_file);
650 mm->exe_file = new_exe_file; 636 mm->exe_file = new_exe_file;
651 mm->num_exe_file_vmas = 0;
652} 637}
653 638
654struct file *get_mm_exe_file(struct mm_struct *mm) 639struct file *get_mm_exe_file(struct mm_struct *mm)
655{ 640{
656 struct file *exe_file; 641 struct file *exe_file;
657 642
658 /* We need mmap_sem to protect against races with removal of 643 /* We need mmap_sem to protect against races with removal of exe_file */
659 * VM_EXECUTABLE vmas */
660 down_read(&mm->mmap_sem); 644 down_read(&mm->mmap_sem);
661 exe_file = mm->exe_file; 645 exe_file = mm->exe_file;
662 if (exe_file) 646 if (exe_file)
@@ -1077,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1077 init_rwsem(&sig->group_rwsem); 1061 init_rwsem(&sig->group_rwsem);
1078#endif 1062#endif
1079 1063
1080 sig->oom_adj = current->signal->oom_adj;
1081 sig->oom_score_adj = current->signal->oom_score_adj; 1064 sig->oom_score_adj = current->signal->oom_score_adj;
1082 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1065 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1083 1066
@@ -1601,7 +1584,7 @@ long do_fork(unsigned long clone_flags,
1601 * requested, no event is reported; otherwise, report if the event 1584 * requested, no event is reported; otherwise, report if the event
1602 * for the type of forking is enabled. 1585 * for the type of forking is enabled.
1603 */ 1586 */
1604 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { 1587 if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) {
1605 if (clone_flags & CLONE_VFORK) 1588 if (clone_flags & CLONE_VFORK)
1606 trace = PTRACE_EVENT_VFORK; 1589 trace = PTRACE_EVENT_VFORK;
1607 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1590 else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1651,6 +1634,17 @@ long do_fork(unsigned long clone_flags,
1651 return nr; 1634 return nr;
1652} 1635}
1653 1636
1637#ifdef CONFIG_GENERIC_KERNEL_THREAD
1638/*
1639 * Create a kernel thread.
1640 */
1641pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1642{
1643 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL,
1644 (unsigned long)arg, NULL, NULL);
1645}
1646#endif
1647
1654#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1648#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1655#define ARCH_MIN_MMSTRUCT_ALIGN 0 1649#define ARCH_MIN_MMSTRUCT_ALIGN 0
1656#endif 1650#endif
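
Under CONFIG_GENERIC_KERNEL_THREAD the new kernel_thread() simply funnels the function and argument through do_fork() with CLONE_VM | CLONE_UNTRACED. The following sketch of a built-in caller is purely illustrative (all names are invented, and kthread_run() remains the preferred interface for drivers); it only assumes the signature added in this hunk.

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>

/* Hypothetical built-in user of kernel_thread(); the thread function's
 * return value becomes the thread's exit code. */
static int demo_thread_fn(void *data)
{
	pr_info("demo thread pid %d, arg \"%s\"\n",
		task_pid_nr(current), (const char *)data);
	return 0;
}

static int __init demo_thread_init(void)
{
	pid_t pid = kernel_thread(demo_thread_fn, "hello", CLONE_FS | CLONE_FILES);

	return pid < 0 ? pid : 0;
}
early_initcall(demo_thread_init);
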
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 49a77727db42..4e69e24d3d7d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
148 * @host_data: Controller private data pointer 148 * @host_data: Controller private data pointer
149 * 149 *
150 * Allocates a legacy irq_domain if irq_base is positive or a linear 150 * Allocates a legacy irq_domain if irq_base is positive or a linear
151 * domain otherwise. 151 * domain otherwise. For the legacy domain, IRQ descriptors will also
152 * be allocated.
152 * 153 *
153 * This is intended to implement the expected behaviour for most 154 * This is intended to implement the expected behaviour for most
154 * interrupt controllers which is that a linear mapping should 155 * interrupt controllers which is that a linear mapping should
@@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
162 const struct irq_domain_ops *ops, 163 const struct irq_domain_ops *ops,
163 void *host_data) 164 void *host_data)
164{ 165{
165 if (first_irq > 0) 166 if (first_irq > 0) {
166 return irq_domain_add_legacy(of_node, size, first_irq, 0, 167 int irq_base;
168
169 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
170 /*
171 * Set the descriptor allocator to search for a
172 * 1-to-1 mapping, such as irq_alloc_desc_at().
173 * Use of_node_to_nid() which is defined to
174 * numa_node_id() on platforms that have no custom
175 * implementation.
176 */
177 irq_base = irq_alloc_descs(first_irq, first_irq, size,
178 of_node_to_nid(of_node));
179 if (irq_base < 0) {
180 WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
181 first_irq);
182 irq_base = first_irq;
183 }
184 } else
185 irq_base = first_irq;
186
187 return irq_domain_add_legacy(of_node, size, irq_base, 0,
167 ops, host_data); 188 ops, host_data);
168 else 189 }
169 return irq_domain_add_linear(of_node, size, ops, host_data); 190
191 /* A linear domain is the default */
192 return irq_domain_add_linear(of_node, size, ops, host_data);
170} 193}
171 194
172/** 195/**
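
With this change a driver can hand irq_domain_add_simple() a non-zero first_irq and rely on it to allocate the legacy descriptors itself under SPARSE_IRQ. A rough usage sketch from a hypothetical interrupt-controller driver follows; the demo_* names are assumptions, and dummy_irq_chip merely stands in for a real chip implementation.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int demo_irq_map(struct irq_domain *d, unsigned int virq,
			irq_hw_number_t hw)
{
	/* hypothetical chip: level-triggered, handled by handle_level_irq */
	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_level_irq);
	return 0;
}

static const struct irq_domain_ops demo_irq_ops = {
	.map	= demo_irq_map,
	.xlate	= irq_domain_xlate_onecell,
};

/* first_irq > 0: legacy domain, descriptors allocated as in the hunk above;
 * first_irq == 0: plain linear domain. */
static struct irq_domain *demo_setup_domain(struct device_node *np,
					    unsigned int nr_irqs,
					    unsigned int first_irq)
{
	return irq_domain_add_simple(np, nr_irqs, first_irq,
				     &demo_irq_ops, NULL);
}
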
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 43049192b5ec..60f48fa0fd0d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key,
118 key->timeout = rl; 118 key->timeout = rl;
119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); 119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
120} 120}
121EXPORT_SYMBOL_GPL(jump_label_rate_limit);
121 122
122static int addr_conflict(struct jump_entry *entry, void *start, void *end) 123static int addr_conflict(struct jump_entry *entry, void *start, void *end)
123{ 124{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 0668d58d6413..5e4bd7864c5d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,6 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <generated/utsrelease.h>
25#include <linux/utsname.h> 24#include <linux/utsname.h>
26#include <linux/numa.h> 25#include <linux/numa.h>
27#include <linux/suspend.h> 26#include <linux/suspend.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6f99aead66c6..1c317e386831 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,7 @@
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h> 39#include <linux/rwsem.h>
40#include <linux/ptrace.h>
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41 42
42#include <trace/events/module.h> 43#include <trace/events/module.h>
@@ -221,11 +222,13 @@ static int ____call_usermodehelper(void *data)
221 retval = kernel_execve(sub_info->path, 222 retval = kernel_execve(sub_info->path,
222 (const char *const *)sub_info->argv, 223 (const char *const *)sub_info->argv,
223 (const char *const *)sub_info->envp); 224 (const char *const *)sub_info->envp);
225 if (!retval)
226 return 0;
224 227
225 /* Exec failed? */ 228 /* Exec failed? */
226fail: 229fail:
227 sub_info->retval = retval; 230 sub_info->retval = retval;
228 return 0; 231 do_exit(0);
229} 232}
230 233
231static int call_helper(void *data) 234static int call_helper(void *data)
@@ -292,7 +295,7 @@ static int wait_for_helper(void *data)
292 } 295 }
293 296
294 umh_complete(sub_info); 297 umh_complete(sub_info);
295 return 0; 298 do_exit(0);
296} 299}
297 300
298/* This is run by khelper thread */ 301/* This is run by khelper thread */
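
The helper threads above now terminate with do_exit(0) instead of returning, so a stale return value can no longer be mistaken for the exec result. For context, a hedged sketch of how a caller typically drives this path; the helper path and arguments are purely illustrative.

#include <linux/kmod.h>

/* Illustrative call: run /sbin/demo-helper and wait only until exec has
 * succeeded or failed (UMH_WAIT_EXEC), not until the program exits. */
static int demo_run_helper(void)
{
	char *argv[] = { "/sbin/demo-helper", "--oneshot", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
}
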
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 146a6fa96825..29fb60caecb5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,6 +16,7 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/ptrace.h>
19#include <trace/events/sched.h> 20#include <trace/events/sched.h>
20 21
21static DEFINE_SPINLOCK(kthread_create_lock); 22static DEFINE_SPINLOCK(kthread_create_lock);
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
new file mode 100644
index 000000000000..4646eb2c3820
--- /dev/null
+++ b/kernel/modsign_pubkey.c
@@ -0,0 +1,113 @@
1/* Public keys for module signature verification
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/cred.h>
15#include <linux/err.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19struct key *modsign_keyring;
20
21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[];
23asm(".section .init.data,\"aw\"\n"
24 "modsign_certificate_list:\n"
25 ".incbin \"signing_key.x509\"\n"
26 ".incbin \"extra_certificates\"\n"
27 "modsign_certificate_list_end:"
28 );
29
30/*
31 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
32 * if modsign.pub changes.
33 */
34static __initdata const char annoy_ccache[] = __TIME__ "foo";
35
36/*
37 * Load the compiled-in keys
38 */
39static __init int module_verify_init(void)
40{
41 pr_notice("Initialise module verification\n");
42
43 modsign_keyring = key_alloc(&key_type_keyring, ".module_sign",
44 KUIDT_INIT(0), KGIDT_INIT(0),
45 current_cred(),
46 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
47 KEY_USR_VIEW | KEY_USR_READ,
48 KEY_ALLOC_NOT_IN_QUOTA);
49 if (IS_ERR(modsign_keyring))
50 panic("Can't allocate module signing keyring\n");
51
52 if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0)
53 panic("Can't instantiate module signing keyring\n");
54
55 return 0;
56}
57
58/*
59 * Must be initialised before we try and load the keys into the keyring.
60 */
61device_initcall(module_verify_init);
62
63/*
64 * Load the compiled-in keys
65 */
66static __init int load_module_signing_keys(void)
67{
68 key_ref_t key;
69 const u8 *p, *end;
70 size_t plen;
71
72 pr_notice("Loading module verification certificates\n");
73
74 end = modsign_certificate_list_end;
75 p = modsign_certificate_list;
76 while (p < end) {
77 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
78 * than 256 bytes in size.
79 */
80 if (end - p < 4)
81 goto dodgy_cert;
82 if (p[0] != 0x30 &&
83 p[1] != 0x82)
84 goto dodgy_cert;
85 plen = (p[2] << 8) | p[3];
86 plen += 4;
87 if (plen > end - p)
88 goto dodgy_cert;
89
90 key = key_create_or_update(make_key_ref(modsign_keyring, 1),
91 "asymmetric",
92 NULL,
93 p,
94 plen,
95 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
96 KEY_USR_VIEW,
97 KEY_ALLOC_NOT_IN_QUOTA);
98 if (IS_ERR(key))
99 pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
100 PTR_ERR(key));
101 else
102 pr_notice("MODSIGN: Loaded cert '%s'\n",
103 key_ref_to_ptr(key)->description);
104 p += plen;
105 }
106
107 return 0;
108
109dodgy_cert:
110 pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
111 return 0;
112}
113late_initcall(load_module_signing_keys);
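
load_module_signing_keys() walks the concatenated DER blob by reading each certificate's SEQUENCE header: tag 0x30, long-form length byte 0x82, then a big-endian 16-bit length, so each certificate occupies length + 4 bytes. A standalone userspace sketch of the same walk over a file of concatenated certificates, purely illustrative (it rejects the blob unless both header bytes match):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Walk a blob of concatenated DER certificates and print each one's size,
 * mirroring the header arithmetic in load_module_signing_keys(). */
static int walk_certs(const unsigned char *p, size_t len)
{
	int n = 0;

	while (len) {
		size_t plen;

		if (len < 4 || p[0] != 0x30 || p[1] != 0x82)
			return -1;		/* not a long-form SEQUENCE */
		plen = ((size_t)p[2] << 8 | p[3]) + 4;
		if (plen > len)
			return -1;		/* truncated certificate */
		printf("cert %d: %zu bytes\n", ++n, plen);
		p += plen;
		len -= plen;
	}
	return n;
}

int main(int argc, char **argv)
{
	FILE *f;
	unsigned char *buf;
	long size;

	if (argc != 2 || !(f = fopen(argv[1], "rb")))
		return 1;
	fseek(f, 0, SEEK_END);
	size = ftell(f);
	rewind(f);
	buf = malloc(size);
	if (!buf || fread(buf, 1, size, f) != (size_t)size)
		return 1;
	printf("%d certificate(s)\n", walk_certs(buf, size));
	free(buf);
	fclose(f);
	return 0;
}
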
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
new file mode 100644
index 000000000000..24f9247b7d02
--- /dev/null
+++ b/kernel/module-internal.h
@@ -0,0 +1,14 @@
1/* Module internals
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12extern struct key *modsign_keyring;
13
14extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index 4edbd9c11aca..6085f5ef88ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -58,6 +58,8 @@
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h> 59#include <linux/pfn.h>
60#include <linux/bsearch.h> 60#include <linux/bsearch.h>
61#include <linux/fips.h>
62#include "module-internal.h"
61 63
62#define CREATE_TRACE_POINTS 64#define CREATE_TRACE_POINTS
63#include <trace/events/module.h> 65#include <trace/events/module.h>
@@ -102,6 +104,43 @@ static LIST_HEAD(modules);
102struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ 104struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
103#endif /* CONFIG_KGDB_KDB */ 105#endif /* CONFIG_KGDB_KDB */
104 106
107#ifdef CONFIG_MODULE_SIG
108#ifdef CONFIG_MODULE_SIG_FORCE
109static bool sig_enforce = true;
110#else
111static bool sig_enforce = false;
112
113static int param_set_bool_enable_only(const char *val,
114 const struct kernel_param *kp)
115{
116 int err;
117 bool test;
118 struct kernel_param dummy_kp = *kp;
119
120 dummy_kp.arg = &test;
121
122 err = param_set_bool(val, &dummy_kp);
123 if (err)
124 return err;
125
126 /* Don't let them unset it once it's set! */
127 if (!test && sig_enforce)
128 return -EROFS;
129
130 if (test)
131 sig_enforce = true;
132 return 0;
133}
134
135static const struct kernel_param_ops param_ops_bool_enable_only = {
136 .set = param_set_bool_enable_only,
137 .get = param_get_bool,
138};
139#define param_check_bool_enable_only param_check_bool
140
141module_param(sig_enforce, bool_enable_only, 0644);
142#endif /* !CONFIG_MODULE_SIG_FORCE */
143#endif /* CONFIG_MODULE_SIG */
105 144
106/* Block module loading/unloading? */ 145/* Block module loading/unloading? */
107int modules_disabled = 0; 146int modules_disabled = 0;
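
The bool_enable_only parameter type lets sig_enforce be switched on at runtime but never switched back off. The same write-once pattern can be reused by any module through module_param_cb(); the sketch below is a hedged illustration with invented demo_* names, not part of this patch.

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/moduleparam.h>

static bool demo_lockdown;

/* Accept "1"/"y" once; reject any attempt to clear the flag afterwards. */
static int demo_set_enable_only(const char *val, const struct kernel_param *kp)
{
	bool new_val;
	int err = strtobool(val, &new_val);

	if (err)
		return err;
	if (!new_val && *(bool *)kp->arg)
		return -EROFS;		/* already enabled: refuse to unset */
	if (new_val)
		*(bool *)kp->arg = true;
	return 0;
}

static const struct kernel_param_ops demo_enable_only_ops = {
	.set = demo_set_enable_only,
	.get = param_get_bool,
};
module_param_cb(demo_lockdown, &demo_enable_only_ops, &demo_lockdown, 0644);
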
@@ -136,6 +175,7 @@ struct load_info {
136 unsigned long symoffs, stroffs; 175 unsigned long symoffs, stroffs;
137 struct _ddebug *debug; 176 struct _ddebug *debug;
138 unsigned int num_debug; 177 unsigned int num_debug;
178 bool sig_ok;
139 struct { 179 struct {
140 unsigned int sym, str, mod, vers, info, pcpu; 180 unsigned int sym, str, mod, vers, info, pcpu;
141 } index; 181 } index;
@@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1949 return ret; 1989 return ret;
1950} 1990}
1951 1991
1952int __weak apply_relocate(Elf_Shdr *sechdrs,
1953 const char *strtab,
1954 unsigned int symindex,
1955 unsigned int relsec,
1956 struct module *me)
1957{
1958 pr_err("module %s: REL relocation unsupported\n", me->name);
1959 return -ENOEXEC;
1960}
1961
1962int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1963 const char *strtab,
1964 unsigned int symindex,
1965 unsigned int relsec,
1966 struct module *me)
1967{
1968 pr_err("module %s: RELA relocation unsupported\n", me->name);
1969 return -ENOEXEC;
1970}
1971
1972static int apply_relocations(struct module *mod, const struct load_info *info) 1992static int apply_relocations(struct module *mod, const struct load_info *info)
1973{ 1993{
1974 unsigned int i; 1994 unsigned int i;
@@ -2399,7 +2419,44 @@ static inline void kmemleak_load_module(const struct module *mod,
2399} 2419}
2400#endif 2420#endif
2401 2421
2402/* Sets info->hdr and info->len. */ 2422#ifdef CONFIG_MODULE_SIG
2423static int module_sig_check(struct load_info *info,
2424 const void *mod, unsigned long *_len)
2425{
2426 int err = -ENOKEY;
2427 unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
2428 unsigned long len = *_len;
2429
2430 if (len > markerlen &&
2431 memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
2432 /* We truncate the module to discard the signature */
2433 *_len -= markerlen;
2434 err = mod_verify_sig(mod, _len);
2435 }
2436
2437 if (!err) {
2438 info->sig_ok = true;
2439 return 0;
2440 }
2441
2442 /* Not having a signature is only an error if we're strict. */
2443 if (err < 0 && fips_enabled)
2444 panic("Module verification failed with error %d in FIPS mode\n",
2445 err);
2446 if (err == -ENOKEY && !sig_enforce)
2447 err = 0;
2448
2449 return err;
2450}
2451#else /* !CONFIG_MODULE_SIG */
2452static int module_sig_check(struct load_info *info,
2453 void *mod, unsigned long *len)
2454{
2455 return 0;
2456}
2457#endif /* !CONFIG_MODULE_SIG */
2458
2459/* Sets info->hdr, info->len and info->sig_ok. */
2403static int copy_and_check(struct load_info *info, 2460static int copy_and_check(struct load_info *info,
2404 const void __user *umod, unsigned long len, 2461 const void __user *umod, unsigned long len,
2405 const char __user *uargs) 2462 const char __user *uargs)
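
module_sig_check() only treats the image as signed when it ends with the MODULE_SIG_STRING marker; everything before the marker, minus the signature block, is what gets verified. A quick userspace check for the marker on a .ko file, illustrative only and assuming MODULE_SIG_STRING is "~Module signature appended~\n":

#include <stdio.h>
#include <string.h>

#define MODULE_SIG_STRING "~Module signature appended~\n"

/* Return 1 if the file ends with the module-signature marker, 0 if not. */
static int has_sig_marker(const char *path)
{
	char tail[sizeof(MODULE_SIG_STRING)];
	size_t marker_len = sizeof(MODULE_SIG_STRING) - 1;
	FILE *f = fopen(path, "rb");

	if (!f)
		return 0;
	if (fseek(f, -(long)marker_len, SEEK_END) ||
	    fread(tail, 1, marker_len, f) != marker_len) {
		fclose(f);
		return 0;
	}
	fclose(f);
	return memcmp(tail, MODULE_SIG_STRING, marker_len) == 0;
}

int main(int argc, char **argv)
{
	if (argc != 2)
		return 2;
	printf("%s: %s\n", argv[1],
	       has_sig_marker(argv[1]) ? "signature marker present"
				       : "no signature marker");
	return 0;
}
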
@@ -2419,6 +2476,10 @@ static int copy_and_check(struct load_info *info,
2419 goto free_hdr; 2476 goto free_hdr;
2420 } 2477 }
2421 2478
2479 err = module_sig_check(info, hdr, &len);
2480 if (err)
2481 goto free_hdr;
2482
2422 /* Sanity checks against insmoding binaries or wrong arch, 2483 /* Sanity checks against insmoding binaries or wrong arch,
2423 weird elf version */ 2484 weird elf version */
2424 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2485 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
@@ -2730,6 +2791,10 @@ static int check_module_license_and_versions(struct module *mod)
2730 if (strcmp(mod->name, "driverloader") == 0) 2791 if (strcmp(mod->name, "driverloader") == 0)
2731 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2792 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2732 2793
2794 /* lve claims to be GPL but upstream won't provide source */
2795 if (strcmp(mod->name, "lve") == 0)
2796 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2797
2733#ifdef CONFIG_MODVERSIONS 2798#ifdef CONFIG_MODVERSIONS
2734 if ((mod->num_syms && !mod->crcs) 2799 if ((mod->num_syms && !mod->crcs)
2735 || (mod->num_gpl_syms && !mod->gpl_crcs) 2800 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2861,6 +2926,20 @@ static int post_relocation(struct module *mod, const struct load_info *info)
2861 return module_finalize(info->hdr, info->sechdrs, mod); 2926 return module_finalize(info->hdr, info->sechdrs, mod);
2862} 2927}
2863 2928
2929/* Is this module of this name done loading? No locks held. */
2930static bool finished_loading(const char *name)
2931{
2932 struct module *mod;
2933 bool ret;
2934
2935 mutex_lock(&module_mutex);
2936 mod = find_module(name);
2937 ret = !mod || mod->state != MODULE_STATE_COMING;
2938 mutex_unlock(&module_mutex);
2939
2940 return ret;
2941}
2942
2864/* Allocate and load the module: note that size of section 0 is always 2943/* Allocate and load the module: note that size of section 0 is always
2865 zero, and we rely on this for optional sections. */ 2944 zero, and we rely on this for optional sections. */
2866static struct module *load_module(void __user *umod, 2945static struct module *load_module(void __user *umod,
@@ -2868,7 +2947,7 @@ static struct module *load_module(void __user *umod,
2868 const char __user *uargs) 2947 const char __user *uargs)
2869{ 2948{
2870 struct load_info info = { NULL, }; 2949 struct load_info info = { NULL, };
2871 struct module *mod; 2950 struct module *mod, *old;
2872 long err; 2951 long err;
2873 2952
2874 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", 2953 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2886,6 +2965,12 @@ static struct module *load_module(void __user *umod,
2886 goto free_copy; 2965 goto free_copy;
2887 } 2966 }
2888 2967
2968#ifdef CONFIG_MODULE_SIG
2969 mod->sig_ok = info.sig_ok;
2970 if (!mod->sig_ok)
2971 add_taint_module(mod, TAINT_FORCED_MODULE);
2972#endif
2973
2889 /* Now module is in final location, initialize linked lists, etc. */ 2974 /* Now module is in final location, initialize linked lists, etc. */
2890 err = module_unload_init(mod); 2975 err = module_unload_init(mod);
2891 if (err) 2976 if (err)
@@ -2934,8 +3019,18 @@ static struct module *load_module(void __user *umod,
2934 * function to insert in a way safe to concurrent readers. 3019 * function to insert in a way safe to concurrent readers.
2935 * The mutex protects against concurrent writers. 3020 * The mutex protects against concurrent writers.
2936 */ 3021 */
3022again:
2937 mutex_lock(&module_mutex); 3023 mutex_lock(&module_mutex);
2938 if (find_module(mod->name)) { 3024 if ((old = find_module(mod->name)) != NULL) {
3025 if (old->state == MODULE_STATE_COMING) {
3026 /* Wait in case it fails to load. */
3027 mutex_unlock(&module_mutex);
3028 err = wait_event_interruptible(module_wq,
3029 finished_loading(mod->name));
3030 if (err)
3031 goto free_arch_cleanup;
3032 goto again;
3033 }
2939 err = -EEXIST; 3034 err = -EEXIST;
2940 goto unlock; 3035 goto unlock;
2941 } 3036 }
@@ -2975,7 +3070,7 @@ static struct module *load_module(void __user *umod,
2975 /* Unlink carefully: kallsyms could be walking list. */ 3070 /* Unlink carefully: kallsyms could be walking list. */
2976 list_del_rcu(&mod->list); 3071 list_del_rcu(&mod->list);
2977 module_bug_cleanup(mod); 3072 module_bug_cleanup(mod);
2978 3073 wake_up_all(&module_wq);
2979 ddebug: 3074 ddebug:
2980 dynamic_debug_remove(info.debug); 3075 dynamic_debug_remove(info.debug);
2981 unlock: 3076 unlock:
@@ -3050,7 +3145,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
3050 blocking_notifier_call_chain(&module_notify_list, 3145 blocking_notifier_call_chain(&module_notify_list,
3051 MODULE_STATE_GOING, mod); 3146 MODULE_STATE_GOING, mod);
3052 free_module(mod); 3147 free_module(mod);
3053 wake_up(&module_wq); 3148 wake_up_all(&module_wq);
3054 return ret; 3149 return ret;
3055 } 3150 }
3056 if (ret > 0) { 3151 if (ret > 0) {
@@ -3062,9 +3157,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
3062 dump_stack(); 3157 dump_stack();
3063 } 3158 }
3064 3159
3065 /* Now it's a first class citizen! Wake up anyone waiting for it. */ 3160 /* Now it's a first class citizen! */
3066 mod->state = MODULE_STATE_LIVE; 3161 mod->state = MODULE_STATE_LIVE;
3067 wake_up(&module_wq);
3068 blocking_notifier_call_chain(&module_notify_list, 3162 blocking_notifier_call_chain(&module_notify_list,
3069 MODULE_STATE_LIVE, mod); 3163 MODULE_STATE_LIVE, mod);
3070 3164
@@ -3087,6 +3181,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
3087 mod->init_ro_size = 0; 3181 mod->init_ro_size = 0;
3088 mod->init_text_size = 0; 3182 mod->init_text_size = 0;
3089 mutex_unlock(&module_mutex); 3183 mutex_unlock(&module_mutex);
3184 wake_up_all(&module_wq);
3090 3185
3091 return 0; 3186 return 0;
3092} 3187}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
new file mode 100644
index 000000000000..d492a23df99c
--- /dev/null
+++ b/kernel/module_signing.c
@@ -0,0 +1,249 @@
1/* Module signature checker
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/err.h>
14#include <crypto/public_key.h>
15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19/*
20 * Module signature information block.
21 *
22 * The constituents of the signature section are, in order:
23 *
24 * - Signer's name
25 * - Key identifier
26 * - Signature data
27 * - Information block
28 */
29struct module_signature {
30 enum pkey_algo algo : 8; /* Public-key crypto algorithm */
31 enum pkey_hash_algo hash : 8; /* Digest algorithm */
32 enum pkey_id_type id_type : 8; /* Key identifier type */
33 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */
35 u8 __pad[3];
36 __be32 sig_len; /* Length of signature data */
37};
38
39/*
40 * Digest the module contents.
41 */
42static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
43 const void *mod,
44 unsigned long modlen)
45{
46 struct public_key_signature *pks;
47 struct crypto_shash *tfm;
48 struct shash_desc *desc;
49 size_t digest_size, desc_size;
50 int ret;
51
52 pr_devel("==>%s()\n", __func__);
53
54 /* Allocate the hashing algorithm we're going to need and find out how
55 * big the hash operational data will be.
56 */
57 tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0);
58 if (IS_ERR(tfm))
59 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
60
61 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
62 digest_size = crypto_shash_digestsize(tfm);
63
64 /* We allocate the hash operational data storage on the end of our
65 * context data and the digest output buffer on the end of that.
66 */
67 ret = -ENOMEM;
68 pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
69 if (!pks)
70 goto error_no_pks;
71
72 pks->pkey_hash_algo = hash;
73 pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
74 pks->digest_size = digest_size;
75
76 desc = (void *)pks + sizeof(*pks);
77 desc->tfm = tfm;
78 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
79
80 ret = crypto_shash_init(desc);
81 if (ret < 0)
82 goto error;
83
84 ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
85 if (ret < 0)
86 goto error;
87
88 crypto_free_shash(tfm);
89 pr_devel("<==%s() = ok\n", __func__);
90 return pks;
91
92error:
93 kfree(pks);
94error_no_pks:
95 crypto_free_shash(tfm);
96 pr_devel("<==%s() = %d\n", __func__, ret);
97 return ERR_PTR(ret);
98}
99
100/*
101 * Extract an MPI array from the signature data. This represents the actual
102 * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
103 * size of the MPI in bytes.
104 *
105 * RSA signatures only have one MPI, so currently we only read one.
106 */
107static int mod_extract_mpi_array(struct public_key_signature *pks,
108 const void *data, size_t len)
109{
110 size_t nbytes;
111 MPI mpi;
112
113 if (len < 3)
114 return -EBADMSG;
115 nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
116 data += 2;
117 len -= 2;
118 if (len != nbytes)
119 return -EBADMSG;
120
121 mpi = mpi_read_raw_data(data, nbytes);
122 if (!mpi)
123 return -ENOMEM;
124 pks->mpi[0] = mpi;
125 pks->nr_mpi = 1;
126 return 0;
127}
128
129/*
130 * Request an asymmetric key.
131 */
132static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
133 const u8 *key_id, size_t key_id_len)
134{
135 key_ref_t key;
136 size_t i;
137 char *id, *q;
138
139 pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
140
141 /* Construct an identifier. */
142 id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
143 if (!id)
144 return ERR_PTR(-ENOKEY);
145
146 memcpy(id, signer, signer_len);
147
148 q = id + signer_len;
149 *q++ = ':';
150 *q++ = ' ';
151 for (i = 0; i < key_id_len; i++) {
152 *q++ = hex_asc[*key_id >> 4];
153 *q++ = hex_asc[*key_id++ & 0x0f];
154 }
155
156 *q = 0;
157
158 pr_debug("Look up: \"%s\"\n", id);
159
160 key = keyring_search(make_key_ref(modsign_keyring, 1),
161 &key_type_asymmetric, id);
162 if (IS_ERR(key))
163 pr_warn("Request for unknown module key '%s' err %ld\n",
164 id, PTR_ERR(key));
165 kfree(id);
166
167 if (IS_ERR(key)) {
168 switch (PTR_ERR(key)) {
169 /* Hide some search errors */
170 case -EACCES:
171 case -ENOTDIR:
172 case -EAGAIN:
173 return ERR_PTR(-ENOKEY);
174 default:
175 return ERR_CAST(key);
176 }
177 }
178
179 pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
180 return key_ref_to_ptr(key);
181}
182
183/*
184 * Verify the signature on a module.
185 */
186int mod_verify_sig(const void *mod, unsigned long *_modlen)
187{
188 struct public_key_signature *pks;
189 struct module_signature ms;
190 struct key *key;
191 const void *sig;
192 size_t modlen = *_modlen, sig_len;
193 int ret;
194
195 pr_devel("==>%s(,%lu)\n", __func__, modlen);
196
197 if (modlen <= sizeof(ms))
198 return -EBADMSG;
199
200 memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
201 modlen -= sizeof(ms);
202
203 sig_len = be32_to_cpu(ms.sig_len);
204 if (sig_len >= modlen)
205 return -EBADMSG;
206 modlen -= sig_len;
207 if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
208 return -EBADMSG;
209 modlen -= (size_t)ms.signer_len + ms.key_id_len;
210
211 *_modlen = modlen;
212 sig = mod + modlen;
213
214 /* For the moment, only support RSA and X.509 identifiers */
215 if (ms.algo != PKEY_ALGO_RSA ||
216 ms.id_type != PKEY_ID_X509)
217 return -ENOPKG;
218
219 if (ms.hash >= PKEY_HASH__LAST ||
220 !pkey_hash_algo[ms.hash])
221 return -ENOPKG;
222
223 key = request_asymmetric_key(sig, ms.signer_len,
224 sig + ms.signer_len, ms.key_id_len);
225 if (IS_ERR(key))
226 return PTR_ERR(key);
227
228 pks = mod_make_digest(ms.hash, mod, modlen);
229 if (IS_ERR(pks)) {
230 ret = PTR_ERR(pks);
231 goto error_put_key;
232 }
233
234 ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
235 sig_len);
236 if (ret < 0)
237 goto error_free_pks;
238
239 ret = verify_signature(key, pks);
240 pr_devel("verify_signature() = %d\n", ret);
241
242error_free_pks:
243 mpi_free(pks->rsa.s);
244 kfree(pks);
245error_put_key:
246 key_put(key);
247 pr_devel("<==%s() = %d\n", __func__, ret);
248 return ret;
249}
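
mod_verify_sig() peels the trailer off in a fixed order: struct module_signature last, preceded by sig_len bytes of signature data, preceded by key_id_len and signer_len bytes of identifiers. A userspace sketch that recovers those offsets from a signed module image (struct layout copied from the file above, no crypto, and the MODULE_SIG_STRING marker is assumed to be stripped already):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>	/* ntohl() for the big-endian sig_len */

/* Layout of the trailer, as defined in kernel/module_signing.c above. */
struct module_signature {
	uint8_t  algo;		/* public-key crypto algorithm */
	uint8_t  hash;		/* digest algorithm */
	uint8_t  id_type;	/* key identifier type */
	uint8_t  signer_len;	/* length of signer's name */
	uint8_t  key_id_len;	/* length of key identifier */
	uint8_t  __pad[3];
	uint32_t sig_len;	/* length of signature data (big-endian) */
};

/* Given a whole signed module of 'len' bytes (marker already stripped),
 * report where the signed payload ends. Returns 0 on success. */
static int split_signed_module(const unsigned char *mod, size_t len)
{
	struct module_signature ms;
	size_t sig_len, info_len;

	if (len <= sizeof(ms))
		return -1;
	memcpy(&ms, mod + len - sizeof(ms), sizeof(ms));
	len -= sizeof(ms);

	sig_len = ntohl(ms.sig_len);
	info_len = (size_t)ms.signer_len + ms.key_id_len + sig_len;
	if (info_len >= len)
		return -1;

	printf("payload: %zu bytes, signer: %u, key id: %u, signature: %zu\n",
	       len - info_len, ms.signer_len, ms.key_id_len, sig_len);
	return 0;
}

int main(int argc, char **argv)
{
	static unsigned char buf[1 << 24];
	size_t len;
	FILE *f;

	if (argc != 2 || !(f = fopen(argv[1], "rb")))
		return 2;
	len = fread(buf, 1, sizeof(buf), f);
	fclose(f);
	return split_signed_module(buf, len) ? 1 : 0;
}
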
diff --git a/kernel/pid.c b/kernel/pid.c
index e86b291ad834..aebd4f5aaf41 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -479,6 +479,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
479 } 479 }
480 return nr; 480 return nr;
481} 481}
482EXPORT_SYMBOL_GPL(pid_nr_ns);
482 483
483pid_t pid_vnr(struct pid *pid) 484pid_t pid_vnr(struct pid *pid)
484{ 485{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6144bab8fd8e..eb00be205811 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/reboot.h> 18#include <linux/reboot.h>
19#include <linux/export.h>
19 20
20#define BITS_PER_PAGE (PAGE_SIZE*8) 21#define BITS_PER_PAGE (PAGE_SIZE*8)
21 22
@@ -132,18 +133,26 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
132 return create_pid_namespace(old_ns); 133 return create_pid_namespace(old_ns);
133} 134}
134 135
135void free_pid_ns(struct kref *kref) 136static void free_pid_ns(struct kref *kref)
136{ 137{
137 struct pid_namespace *ns, *parent; 138 struct pid_namespace *ns;
138 139
139 ns = container_of(kref, struct pid_namespace, kref); 140 ns = container_of(kref, struct pid_namespace, kref);
140
141 parent = ns->parent;
142 destroy_pid_namespace(ns); 141 destroy_pid_namespace(ns);
142}
143 143
144 if (parent != NULL) 144void put_pid_ns(struct pid_namespace *ns)
145 put_pid_ns(parent); 145{
146 struct pid_namespace *parent;
147
148 while (ns != &init_pid_ns) {
149 parent = ns->parent;
150 if (!kref_put(&ns->kref, free_pid_ns))
151 break;
152 ns = parent;
153 }
146} 154}
155EXPORT_SYMBOL_GPL(put_pid_ns);
147 156
148void zap_pid_ns_processes(struct pid_namespace *pid_ns) 157void zap_pid_ns_processes(struct pid_namespace *pid_ns)
149{ 158{
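
put_pid_ns() now walks up the namespace chain iteratively: dropping the last reference on a child releases the reference that child held on its parent, and so on, without recursion. A toy userspace model of the same pattern; the types and names are invented for illustration and the static root stands in for init_pid_ns, which is never freed.

#include <stdio.h>
#include <stdlib.h>

/* Toy model: each level's parent pointer carries one reference; the root is
 * static and never freed, like init_pid_ns. */
struct toy_ns {
	int refcount;
	struct toy_ns *parent;
};

static struct toy_ns root_ns = { .refcount = 1, .parent = NULL };

static void put_toy_ns(struct toy_ns *ns)
{
	while (ns != &root_ns) {
		struct toy_ns *parent = ns->parent;

		if (--ns->refcount)	/* someone else still holds it */
			break;
		free(ns);		/* last ref gone: release this level... */
		ns = parent;		/* ...and drop the ref it held on its parent */
	}
}

static struct toy_ns *new_toy_ns(struct toy_ns *parent)
{
	struct toy_ns *ns = calloc(1, sizeof(*ns));

	ns->refcount = 1;	/* the reference handed back to the caller */
	ns->parent = parent;
	return ns;
}

int main(void)
{
	struct toy_ns *level1 = new_toy_ns(&root_ns);
	struct toy_ns *level2 = new_toy_ns(level1);	/* level2 now owns the ref on level1 */

	put_toy_ns(level2);	/* frees level2, then level1; the static root survives */
	printf("root refcount: %d\n", root_ns.refcount);	/* still 1 */
	return 0;
}
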
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a70518c9d82f..5dfdc9ea180b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS
263 bool 263 bool
264 depends on PM 264 depends on PM
265 265
266config PM_GENERIC_DOMAINS_SLEEP
267 def_bool y
268 depends on PM_SLEEP && PM_GENERIC_DOMAINS
269
266config PM_GENERIC_DOMAINS_RUNTIME 270config PM_GENERIC_DOMAINS_RUNTIME
267 def_bool y 271 def_bool y
268 depends on PM_RUNTIME && PM_GENERIC_DOMAINS 272 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index d52359374e85..68197a4e8fc9 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
39 39
40static int pm_sysrq_init(void) 40static int __init pm_sysrq_init(void)
41{ 41{
42 register_sysrq_key('o', &sysrq_poweroff_op); 42 register_sysrq_key('o', &sysrq_poweroff_op);
43 return 0; 43 return 0;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 19db29f67558..87da817f9e13 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -79,7 +79,7 @@ static int try_to_freeze_tasks(bool user_only)
79 79
80 /* 80 /*
81 * We need to retry, but first give the freezing tasks some 81 * We need to retry, but first give the freezing tasks some
82 * time to enter the regrigerator. 82 * time to enter the refrigerator.
83 */ 83 */
84 msleep(10); 84 msleep(10);
85 } 85 }
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 6a031e684026..846bd42c7ed1 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
139 default: 139 default:
140 /* runtime check for not using enum */ 140 /* runtime check for not using enum */
141 BUG(); 141 BUG();
142 return PM_QOS_DEFAULT_VALUE;
142 } 143 }
143} 144}
144 145
diff --git a/kernel/printk.c b/kernel/printk.c
index 66a2ea37b576..2d607f4d1797 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1890,7 +1890,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1890 switch (action) { 1890 switch (action) {
1891 case CPU_ONLINE: 1891 case CPU_ONLINE:
1892 case CPU_DEAD: 1892 case CPU_DEAD:
1893 case CPU_DYING:
1894 case CPU_DOWN_FAILED: 1893 case CPU_DOWN_FAILED:
1895 case CPU_UP_CANCELED: 1894 case CPU_UP_CANCELED:
1896 console_lock(); 1895 console_lock();
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a232bb59d93f..1f5e55dda955 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
180 return has_ns_capability(current, ns, CAP_SYS_PTRACE); 180 return has_ns_capability(current, ns, CAP_SYS_PTRACE);
181} 181}
182 182
183int __ptrace_may_access(struct task_struct *task, unsigned int mode) 183/* Returns 0 on success, -errno on denial. */
184static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
184{ 185{
185 const struct cred *cred = current_cred(), *tcred; 186 const struct cred *cred = current_cred(), *tcred;
186 187
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4fb2376ddf06..74df86bd9204 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
77 .name = #sname, \ 78 .name = #sname, \
78} 79}
79 80
@@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1197 raw_spin_unlock_irq(&rnp->lock); 1198 raw_spin_unlock_irq(&rnp->lock);
1198 1199
1199 /* Exclude any concurrent CPU-hotplug operations. */ 1200 /* Exclude any concurrent CPU-hotplug operations. */
1200 get_online_cpus(); 1201 mutex_lock(&rsp->onoff_mutex);
1201 1202
1202 /* 1203 /*
1203 * Set the quiescent-state-needed bits in all the rcu_node 1204 * Set the quiescent-state-needed bits in all the rcu_node
@@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1234 cond_resched(); 1235 cond_resched();
1235 } 1236 }
1236 1237
1237 put_online_cpus(); 1238 mutex_unlock(&rsp->onoff_mutex);
1238 return 1; 1239 return 1;
1239} 1240}
1240 1241
@@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1700 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ 1701 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1701 1702
1702 /* Exclude any attempts to start a new grace period. */ 1703 /* Exclude any attempts to start a new grace period. */
1704 mutex_lock(&rsp->onoff_mutex);
1703 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1705 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1704 1706
1705 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1707 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
@@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1744 init_callback_list(rdp); 1746 init_callback_list(rdp);
1745 /* Disallow further callbacks on this CPU. */ 1747 /* Disallow further callbacks on this CPU. */
1746 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 1748 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
1749 mutex_unlock(&rsp->onoff_mutex);
1747} 1750}
1748 1751
1749#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1752#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2648 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2651 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2649 struct rcu_node *rnp = rcu_get_root(rsp); 2652 struct rcu_node *rnp = rcu_get_root(rsp);
2650 2653
2654 /* Exclude new grace periods. */
2655 mutex_lock(&rsp->onoff_mutex);
2656
2651 /* Set up local state, ensuring consistent view of global state. */ 2657 /* Set up local state, ensuring consistent view of global state. */
2652 raw_spin_lock_irqsave(&rnp->lock, flags); 2658 raw_spin_lock_irqsave(&rnp->lock, flags);
2653 rdp->beenonline = 1; /* We have now been online. */ 2659 rdp->beenonline = 1; /* We have now been online. */
@@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2662 rcu_prepare_for_idle_init(cpu); 2668 rcu_prepare_for_idle_init(cpu);
2663 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2669 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2664 2670
2665 /*
2666 * A new grace period might start here. If so, we won't be part
2667 * of it, but that is OK, as we are currently in a quiescent state.
2668 */
2669
2670 /* Exclude any attempts to start a new GP on large systems. */
2671 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
2672
2673 /* Add CPU to rcu_node bitmasks. */ 2671 /* Add CPU to rcu_node bitmasks. */
2674 rnp = rdp->mynode; 2672 rnp = rdp->mynode;
2675 mask = rdp->grpmask; 2673 mask = rdp->grpmask;
@@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2693 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2691 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2694 rnp = rnp->parent; 2692 rnp = rnp->parent;
2695 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 2693 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
2694 local_irq_restore(flags);
2696 2695
2697 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2696 mutex_unlock(&rsp->onoff_mutex);
2698} 2697}
2699 2698
2700static void __cpuinit rcu_prepare_cpu(int cpu) 2699static void __cpuinit rcu_prepare_cpu(int cpu)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5faf05d68326..a240f032848e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -394,11 +394,17 @@ struct rcu_state {
394 struct rcu_head **orphan_donetail; /* Tail of above. */ 394 struct rcu_head **orphan_donetail; /* Tail of above. */
395 long qlen_lazy; /* Number of lazy callbacks. */ 395 long qlen_lazy; /* Number of lazy callbacks. */
396 long qlen; /* Total number of callbacks. */ 396 long qlen; /* Total number of callbacks. */
397 /* End of fields guarded by onofflock. */
398
399 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
400
397 struct mutex barrier_mutex; /* Guards barrier fields. */ 401 struct mutex barrier_mutex; /* Guards barrier fields. */
398 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 402 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
399 struct completion barrier_completion; /* Wake at barrier end. */ 403 struct completion barrier_completion; /* Wake at barrier end. */
400 unsigned long n_barrier_done; /* ++ at start and end of */ 404 unsigned long n_barrier_done; /* ++ at start and end of */
401 /* _rcu_barrier(). */ 405 /* _rcu_barrier(). */
406 /* End of fields guarded by barrier_mutex. */
407
402 unsigned long jiffies_force_qs; /* Time at which to invoke */ 408 unsigned long jiffies_force_qs; /* Time at which to invoke */
403 /* force_quiescent_state(). */ 409 /* force_quiescent_state(). */
404 unsigned long n_force_qs; /* Number of calls to */ 410 unsigned long n_force_qs; /* Number of calls to */
diff --git a/kernel/resource.c b/kernel/resource.c
index 34d45886ee84..73f35d4b30b9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root,
763 struct resource *parent = root; 763 struct resource *parent = root;
764 struct resource *conflict; 764 struct resource *conflict;
765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); 765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
766 struct resource *next_res = NULL;
766 767
767 if (!res) 768 if (!res)
768 return; 769 return;
@@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root,
772 res->end = end; 773 res->end = end;
773 res->flags = IORESOURCE_BUSY; 774 res->flags = IORESOURCE_BUSY;
774 775
775 conflict = __request_resource(parent, res); 776 while (1) {
776 if (!conflict)
777 return;
778 777
779 /* failed, split and try again */ 778 conflict = __request_resource(parent, res);
780 kfree(res); 779 if (!conflict) {
780 if (!next_res)
781 break;
782 res = next_res;
783 next_res = NULL;
784 continue;
785 }
781 786
782 /* conflict covered whole area */ 787 /* conflict covered whole area */
783 if (conflict->start <= start && conflict->end >= end) 788 if (conflict->start <= res->start &&
784 return; 789 conflict->end >= res->end) {
790 kfree(res);
791 WARN_ON(next_res);
792 break;
793 }
794
795 /* failed, split and try again */
796 if (conflict->start > res->start) {
797 end = res->end;
798 res->end = conflict->start - 1;
799 if (conflict->end < end) {
800 next_res = kzalloc(sizeof(*next_res),
801 GFP_ATOMIC);
802 if (!next_res) {
803 kfree(res);
804 break;
805 }
806 next_res->name = name;
807 next_res->start = conflict->end + 1;
808 next_res->end = end;
809 next_res->flags = IORESOURCE_BUSY;
810 }
811 } else {
812 res->start = conflict->end + 1;
813 }
814 }
785 815
786 if (conflict->start > start)
787 __reserve_region_with_split(root, start, conflict->start-1, name);
788 if (conflict->end < end)
789 __reserve_region_with_split(root, conflict->end+1, end, name);
790} 816}
791 817
792void __init reserve_region_with_split(struct resource *root, 818void __init reserve_region_with_split(struct resource *root,
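
The rewrite replaces the two recursive calls with a loop that keeps at most one deferred right-hand piece in next_res, reserving the left piece of a split immediately and retrying the remainder. A standalone model of that control flow; the hard-coded conflict range and the names are purely illustrative.

#include <stdio.h>

struct range { unsigned long start, end; };

/* Toy conflict checker standing in for __request_resource(): returns the
 * first reserved range overlapping [start,end], or NULL if the request fits. */
static const struct range *find_conflict(unsigned long start, unsigned long end)
{
	static const struct range reserved[] = { { 0x140, 0x17f } };
	for (unsigned i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++)
		if (reserved[i].start <= end && reserved[i].end >= start)
			return &reserved[i];
	return NULL;
}

int main(void)
{
	struct range res = { 0x100, 0x1ff }, next = { 1, 0 };
	int have_next = 0;

	for (;;) {
		const struct range *c = find_conflict(res.start, res.end);

		if (!c) {
			printf("reserved [%#lx-%#lx]\n", res.start, res.end);
			if (!have_next)
				break;
			res = next;		/* now place the deferred right piece */
			have_next = 0;
			continue;
		}
		if (c->start <= res.start && c->end >= res.end)
			break;			/* conflict covers everything */
		if (c->start > res.start) {
			unsigned long end = res.end;

			res.end = c->start - 1;	/* keep the left piece for now */
			if (c->end < end) {	/* stash the right piece */
				next.start = c->end + 1;
				next.end = end;
				have_next = 1;
			}
		} else {
			res.start = c->end + 1;	/* only a right piece remains */
		}
	}
	return 0;
}
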
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c17747236438..2d8927fda712 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -505,7 +505,7 @@ static inline void init_hrtick(void)
505#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
506 506
507#ifndef tsk_is_polling 507#ifndef tsk_is_polling
508#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 508#define tsk_is_polling(t) 0
509#endif 509#endif
510 510
511void resched_task(struct task_struct *p) 511void resched_task(struct task_struct *p)
@@ -6122,6 +6122,17 @@ static void sched_init_numa(void)
6122 * numbers. 6122 * numbers.
6123 */ 6123 */
6124 6124
6125 /*
6126 * Here, we should temporarily reset sched_domains_numa_levels to 0.
6127 * If it fails to allocate memory for array sched_domains_numa_masks[][],
6128 * the array will contain less then 'level' members. This could be
 6128 * the array will contain less than 'level' members. This could be
6129 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
6130 * in other functions.
6131 *
6132 * We reset it to 'level' at the end of this function.
6133 */
6134 sched_domains_numa_levels = 0;
6135
6125 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6136 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6126 if (!sched_domains_numa_masks) 6137 if (!sched_domains_numa_masks)
6127 return; 6138 return;
@@ -6176,11 +6187,68 @@ static void sched_init_numa(void)
6176 } 6187 }
6177 6188
6178 sched_domain_topology = tl; 6189 sched_domain_topology = tl;
6190
6191 sched_domains_numa_levels = level;
6192}
6193
6194static void sched_domains_numa_masks_set(int cpu)
6195{
6196 int i, j;
6197 int node = cpu_to_node(cpu);
6198
6199 for (i = 0; i < sched_domains_numa_levels; i++) {
6200 for (j = 0; j < nr_node_ids; j++) {
6201 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6202 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6203 }
6204 }
6205}
6206
6207static void sched_domains_numa_masks_clear(int cpu)
6208{
6209 int i, j;
6210 for (i = 0; i < sched_domains_numa_levels; i++) {
6211 for (j = 0; j < nr_node_ids; j++)
6212 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6213 }
6214}
6215
6216/*
6217 * Update sched_domains_numa_masks[level][node] array when new cpus
6218 * are onlined.
6219 */
6220static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6221 unsigned long action,
6222 void *hcpu)
6223{
6224 int cpu = (long)hcpu;
6225
6226 switch (action & ~CPU_TASKS_FROZEN) {
6227 case CPU_ONLINE:
6228 sched_domains_numa_masks_set(cpu);
6229 break;
6230
6231 case CPU_DEAD:
6232 sched_domains_numa_masks_clear(cpu);
6233 break;
6234
6235 default:
6236 return NOTIFY_DONE;
6237 }
6238
6239 return NOTIFY_OK;
6179} 6240}
6180#else 6241#else
6181static inline void sched_init_numa(void) 6242static inline void sched_init_numa(void)
6182{ 6243{
6183} 6244}
6245
6246static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6247 unsigned long action,
6248 void *hcpu)
6249{
6250 return 0;
6251}
6184#endif /* CONFIG_NUMA */ 6252#endif /* CONFIG_NUMA */
6185 6253
6186static int __sdt_alloc(const struct cpumask *cpu_map) 6254static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -6629,6 +6697,7 @@ void __init sched_init_smp(void)
6629 mutex_unlock(&sched_domains_mutex); 6697 mutex_unlock(&sched_domains_mutex);
6630 put_online_cpus(); 6698 put_online_cpus();
6631 6699
6700 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6632 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6701 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6633 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6702 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6634 6703
diff --git a/kernel/signal.c b/kernel/signal.c
index 2c681f11b7d2..0af8868525d6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -17,6 +17,7 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/tty.h> 18#include <linux/tty.h>
19#include <linux/binfmts.h> 19#include <linux/binfmts.h>
20#include <linux/coredump.h>
20#include <linux/security.h> 21#include <linux/security.h>
21#include <linux/syscalls.h> 22#include <linux/syscalls.h>
22#include <linux/ptrace.h> 23#include <linux/ptrace.h>
@@ -2359,7 +2360,7 @@ relock:
2359 * first and our do_group_exit call below will use 2360 * first and our do_group_exit call below will use
2360 * that value and ignore the one we pass it. 2361 * that value and ignore the one we pass it.
2361 */ 2362 */
2362 do_coredump(info->si_signo, info->si_signo, regs); 2363 do_coredump(info, regs);
2363 } 2364 }
2364 2365
2365 /* 2366 /*
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2095be3318d5..97c465ebd844 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -379,7 +379,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
379 rcu_batch_queue(&sp->batch_queue, head); 379 rcu_batch_queue(&sp->batch_queue, head);
380 if (!sp->running) { 380 if (!sp->running) {
381 sp->running = true; 381 sp->running = true;
382 queue_delayed_work(system_nrt_wq, &sp->work, 0); 382 schedule_delayed_work(&sp->work, 0);
383 } 383 }
384 spin_unlock_irqrestore(&sp->queue_lock, flags); 384 spin_unlock_irqrestore(&sp->queue_lock, flags);
385} 385}
@@ -631,7 +631,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
631 } 631 }
632 632
633 if (pending) 633 if (pending)
634 queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); 634 schedule_delayed_work(&sp->work, SRCU_INTERVAL);
635} 635}
636 636
637/* 637/*
diff --git a/kernel/sys.c b/kernel/sys.c
index 241507f23eca..e6e0ece5f6a0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
368void kernel_restart(char *cmd) 368void kernel_restart(char *cmd)
369{ 369{
370 kernel_restart_prepare(cmd); 370 kernel_restart_prepare(cmd);
371 disable_nonboot_cpus();
371 if (!cmd) 372 if (!cmd)
372 printk(KERN_EMERG "Restarting system.\n"); 373 printk(KERN_EMERG "Restarting system.\n");
373 else 374 else
@@ -1264,15 +1265,16 @@ DECLARE_RWSEM(uts_sem);
1264 * Work around broken programs that cannot handle "Linux 3.0". 1265 * Work around broken programs that cannot handle "Linux 3.0".
1265 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 1266 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1266 */ 1267 */
1267static int override_release(char __user *release, int len) 1268static int override_release(char __user *release, size_t len)
1268{ 1269{
1269 int ret = 0; 1270 int ret = 0;
1270 char buf[65];
1271 1271
1272 if (current->personality & UNAME26) { 1272 if (current->personality & UNAME26) {
1273 char *rest = UTS_RELEASE; 1273 const char *rest = UTS_RELEASE;
1274 char buf[65] = { 0 };
1274 int ndots = 0; 1275 int ndots = 0;
1275 unsigned v; 1276 unsigned v;
1277 size_t copy;
1276 1278
1277 while (*rest) { 1279 while (*rest) {
1278 if (*rest == '.' && ++ndots >= 3) 1280 if (*rest == '.' && ++ndots >= 3)
@@ -1282,8 +1284,9 @@ static int override_release(char __user *release, int len)
1282 rest++; 1284 rest++;
1283 } 1285 }
1284 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; 1286 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1285 snprintf(buf, len, "2.6.%u%s", v, rest); 1287 copy = clamp_t(size_t, len, 1, sizeof(buf));
1286 ret = copy_to_user(release, buf, len); 1288 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
1289 ret = copy_to_user(release, buf, copy + 1);
1287 } 1290 }
1288 return ret; 1291 return ret;
1289} 1292}
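
With UNAME26 personality the reported release maps 3.x to 2.6.(40+x), and the new code bounds the copy with clamp_t()/scnprintf() instead of trusting the caller's length. A userspace model of just the mapping (the digit scan is reproduced from the surrounding function; the bound handling and copy_to_user() are not modelled):

#include <stdio.h>
#include <ctype.h>

/* Userspace model of the UNAME26 mapping: "3.x.y-extra" -> "2.6.(40+x)-extra".
 * version_code mimics LINUX_VERSION_CODE, i.e. (a << 16) + (b << 8) + c. */
static void map_release(char *out, size_t len,
			const char *release, unsigned version_code)
{
	const char *rest = release;
	int ndots = 0;
	unsigned v = ((version_code >> 8) & 0xff) + 40;

	while (*rest) {
		if (*rest == '.' && ++ndots >= 3)
			break;
		if (!isdigit((unsigned char)*rest) && *rest != '.')
			break;
		rest++;
	}
	snprintf(out, len, "2.6.%u%s", v, rest);
}

int main(void)
{
	char buf[65];

	map_release(buf, sizeof(buf), "3.7.0-rc1", (3 << 16) + (7 << 8));
	printf("%s\n", buf);	/* prints 2.6.47-rc1 */
	return 0;
}
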
@@ -1788,15 +1791,15 @@ SYSCALL_DEFINE1(umask, int, mask)
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1791#ifdef CONFIG_CHECKPOINT_RESTORE
1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1790{ 1793{
1791 struct file *exe_file; 1794 struct fd exe;
1792 struct dentry *dentry; 1795 struct dentry *dentry;
1793 int err; 1796 int err;
1794 1797
1795 exe_file = fget(fd); 1798 exe = fdget(fd);
1796 if (!exe_file) 1799 if (!exe.file)
1797 return -EBADF; 1800 return -EBADF;
1798 1801
1799 dentry = exe_file->f_path.dentry; 1802 dentry = exe.file->f_path.dentry;
1800 1803
1801 /* 1804 /*
1802 * Because the original mm->exe_file points to executable file, make 1805 * Because the original mm->exe_file points to executable file, make
@@ -1805,7 +1808,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1805 */ 1808 */
1806 err = -EACCES; 1809 err = -EACCES;
1807 if (!S_ISREG(dentry->d_inode->i_mode) || 1810 if (!S_ISREG(dentry->d_inode->i_mode) ||
1808 exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1809 goto exit; 1812 goto exit;
1810 1813
1811 err = inode_permission(dentry->d_inode, MAY_EXEC); 1814 err = inode_permission(dentry->d_inode, MAY_EXEC);
@@ -1839,12 +1842,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1839 goto exit_unlock; 1842 goto exit_unlock;
1840 1843
1841 err = 0; 1844 err = 0;
1842 set_mm_exe_file(mm, exe_file); 1845 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1843exit_unlock: 1846exit_unlock:
1844 up_write(&mm->mmap_sem); 1847 up_write(&mm->mmap_sem);
1845 1848
1846exit: 1849exit:
1847 fput(exe_file); 1850 fdput(exe);
1848 return err; 1851 return err;
1849} 1852}
1850 1853
@@ -2204,7 +2207,7 @@ static int __orderly_poweroff(void)
2204 return -ENOMEM; 2207 return -ENOMEM;
2205 } 2208 }
2206 2209
2207 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, 2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
2208 NULL, argv_cleanup, NULL); 2211 NULL, argv_cleanup, NULL);
2209 if (ret == -ENOMEM) 2212 if (ret == -ENOMEM)
2210 argv_free(argv); 2213 argv_free(argv);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 81c7b1a1a307..26f65eaa01f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,10 +97,12 @@
97extern int sysctl_overcommit_memory; 97extern int sysctl_overcommit_memory;
98extern int sysctl_overcommit_ratio; 98extern int sysctl_overcommit_ratio;
99extern int max_threads; 99extern int max_threads;
100extern int core_uses_pid;
101extern int suid_dumpable; 100extern int suid_dumpable;
101#ifdef CONFIG_COREDUMP
102extern int core_uses_pid;
102extern char core_pattern[]; 103extern char core_pattern[];
103extern unsigned int core_pipe_limit; 104extern unsigned int core_pipe_limit;
105#endif
104extern int pid_max; 106extern int pid_max;
105extern int min_free_kbytes; 107extern int min_free_kbytes;
106extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
@@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
177 179
178static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, 180static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos); 181 void __user *buffer, size_t *lenp, loff_t *ppos);
182#ifdef CONFIG_COREDUMP
180static int proc_dostring_coredump(struct ctl_table *table, int write, 183static int proc_dostring_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos); 184 void __user *buffer, size_t *lenp, loff_t *ppos);
185#endif
182 186
183#ifdef CONFIG_MAGIC_SYSRQ 187#ifdef CONFIG_MAGIC_SYSRQ
184/* Note: sysrq code uses it's own private copy */ 188/* Note: sysrq code uses it's own private copy */
@@ -404,6 +408,7 @@ static struct ctl_table kern_table[] = {
404 .mode = 0644, 408 .mode = 0644,
405 .proc_handler = proc_dointvec, 409 .proc_handler = proc_dointvec,
406 }, 410 },
411#ifdef CONFIG_COREDUMP
407 { 412 {
408 .procname = "core_uses_pid", 413 .procname = "core_uses_pid",
409 .data = &core_uses_pid, 414 .data = &core_uses_pid,
@@ -425,6 +430,7 @@ static struct ctl_table kern_table[] = {
425 .mode = 0644, 430 .mode = 0644,
426 .proc_handler = proc_dointvec, 431 .proc_handler = proc_dointvec,
427 }, 432 },
433#endif
428#ifdef CONFIG_PROC_SYSCTL 434#ifdef CONFIG_PROC_SYSCTL
429 { 435 {
430 .procname = "tainted", 436 .procname = "tainted",
@@ -1543,8 +1549,7 @@ static struct ctl_table fs_table[] = {
1543}; 1549};
1544 1550
1545static struct ctl_table debug_table[] = { 1551static struct ctl_table debug_table[] = {
1546#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1552#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
1547 defined(CONFIG_S390) || defined(CONFIG_TILE)
1548 { 1553 {
1549 .procname = "exception-trace", 1554 .procname = "exception-trace",
1550 .data = &show_unhandled_signals, 1555 .data = &show_unhandled_signals,
@@ -2036,12 +2041,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2036 2041
2037static void validate_coredump_safety(void) 2042static void validate_coredump_safety(void)
2038{ 2043{
2044#ifdef CONFIG_COREDUMP
2039 if (suid_dumpable == SUID_DUMPABLE_SAFE && 2045 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2040 core_pattern[0] != '/' && core_pattern[0] != '|') { 2046 core_pattern[0] != '/' && core_pattern[0] != '|') {
2041 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2047 printk(KERN_WARNING "Unsafe core_pattern used with "\
2042 "suid_dumpable=2. Pipe handler or fully qualified "\ 2048 "suid_dumpable=2. Pipe handler or fully qualified "\
2043 "core dump path required.\n"); 2049 "core dump path required.\n");
2044 } 2050 }
2051#endif
2045} 2052}
2046 2053
2047static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, 2054static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
@@ -2053,6 +2060,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2053 return error; 2060 return error;
2054} 2061}
2055 2062
2063#ifdef CONFIG_COREDUMP
2056static int proc_dostring_coredump(struct ctl_table *table, int write, 2064static int proc_dostring_coredump(struct ctl_table *table, int write,
2057 void __user *buffer, size_t *lenp, loff_t *ppos) 2065 void __user *buffer, size_t *lenp, loff_t *ppos)
2058{ 2066{
@@ -2061,6 +2069,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
2061 validate_coredump_safety(); 2069 validate_coredump_safety();
2062 return error; 2070 return error;
2063} 2071}
2072#endif
2064 2073
2065static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2074static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2066 void __user *buffer, 2075 void __user *buffer,
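
The sysctl.c changes above compile the coredump knobs out when CONFIG_COREDUMP is unset and keep validate_coredump_safety()'s rule that, with suid_dumpable=2, core_pattern must name an absolute path or a pipe handler. A small standalone sketch of that check, using illustrative constants rather than the kernel's headers:

#include <stdbool.h>
#include <stdio.h>

#define SUID_DUMPABLE_SAFE 2	/* the documented "safe" sysctl value */

static bool core_pattern_is_safe(int suid_dumpable, const char *core_pattern)
{
	if (suid_dumpable != SUID_DUMPABLE_SAFE)
		return true;
	/* safe only if dumps go to an absolute path or a pipe handler */
	return core_pattern[0] == '/' || core_pattern[0] == '|';
}

int main(void)
{
	printf("%d\n", core_pattern_is_safe(2, "core"));          /* 0: unsafe */
	printf("%d\n", core_pattern_is_safe(2, "|/bin/handler")); /* 1: safe   */
	return 0;
}
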
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d0a32796550f..145bb4d3bd4d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -27,6 +27,7 @@
27#include <linux/cgroup.h> 27#include <linux/cgroup.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <linux/pid_namespace.h>
30#include <net/genetlink.h> 31#include <net/genetlink.h>
31#include <linux/atomic.h> 32#include <linux/atomic.h>
32 33
@@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb,
174 up_write(&listeners->sem); 175 up_write(&listeners->sem);
175} 176}
176 177
177static void fill_stats(struct task_struct *tsk, struct taskstats *stats) 178static void fill_stats(struct user_namespace *user_ns,
179 struct pid_namespace *pid_ns,
180 struct task_struct *tsk, struct taskstats *stats)
178{ 181{
179 memset(stats, 0, sizeof(*stats)); 182 memset(stats, 0, sizeof(*stats));
180 /* 183 /*
@@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
190 stats->version = TASKSTATS_VERSION; 193 stats->version = TASKSTATS_VERSION;
191 stats->nvcsw = tsk->nvcsw; 194 stats->nvcsw = tsk->nvcsw;
192 stats->nivcsw = tsk->nivcsw; 195 stats->nivcsw = tsk->nivcsw;
193 bacct_add_tsk(stats, tsk); 196 bacct_add_tsk(user_ns, pid_ns, stats, tsk);
194 197
195 /* fill in extended acct fields */ 198 /* fill in extended acct fields */
196 xacct_add_tsk(stats, tsk); 199 xacct_add_tsk(stats, tsk);
@@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
207 rcu_read_unlock(); 210 rcu_read_unlock();
208 if (!tsk) 211 if (!tsk)
209 return -ESRCH; 212 return -ESRCH;
210 fill_stats(tsk, stats); 213 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
211 put_task_struct(tsk); 214 put_task_struct(tsk);
212 return 0; 215 return 0;
213} 216}
@@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
291 if (!cpumask_subset(mask, cpu_possible_mask)) 294 if (!cpumask_subset(mask, cpu_possible_mask))
292 return -EINVAL; 295 return -EINVAL;
293 296
297 if (current_user_ns() != &init_user_ns)
298 return -EINVAL;
299
300 if (task_active_pid_ns(current) != &init_pid_ns)
301 return -EINVAL;
302
294 if (isadd == REGISTER) { 303 if (isadd == REGISTER) {
295 for_each_cpu(cpu, mask) { 304 for_each_cpu(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), 305 s = kmalloc_node(sizeof(struct listener),
@@ -415,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
415 struct nlattr *na; 424 struct nlattr *na;
416 size_t size; 425 size_t size;
417 u32 fd; 426 u32 fd;
418 struct file *file; 427 struct fd f;
419 int fput_needed;
420 428
421 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 429 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
422 if (!na) 430 if (!na)
423 return -EINVAL; 431 return -EINVAL;
424 432
425 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 433 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
426 file = fget_light(fd, &fput_needed); 434 f = fdget(fd);
427 if (!file) 435 if (!f.file)
428 return 0; 436 return 0;
429 437
430 size = nla_total_size(sizeof(struct cgroupstats)); 438 size = nla_total_size(sizeof(struct cgroupstats));
@@ -437,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 445 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
438 sizeof(struct cgroupstats)); 446 sizeof(struct cgroupstats));
439 if (na == NULL) { 447 if (na == NULL) {
448 nlmsg_free(rep_skb);
440 rc = -EMSGSIZE; 449 rc = -EMSGSIZE;
441 goto err; 450 goto err;
442 } 451 }
@@ -444,7 +453,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
444 stats = nla_data(na); 453 stats = nla_data(na);
445 memset(stats, 0, sizeof(*stats)); 454 memset(stats, 0, sizeof(*stats));
446 455
447 rc = cgroupstats_build(stats, file->f_dentry); 456 rc = cgroupstats_build(stats, f.file->f_dentry);
448 if (rc < 0) { 457 if (rc < 0) {
449 nlmsg_free(rep_skb); 458 nlmsg_free(rep_skb);
450 goto err; 459 goto err;
@@ -453,7 +462,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
453 rc = send_reply(rep_skb, info); 462 rc = send_reply(rep_skb, info);
454 463
455err: 464err:
456 fput_light(file, fput_needed); 465 fdput(f);
457 return rc; 466 return rc;
458} 467}
459 468
@@ -467,7 +476,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info)
467 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 476 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
468 if (rc < 0) 477 if (rc < 0)
469 goto out; 478 goto out;
470 rc = add_del_listener(info->snd_pid, mask, REGISTER); 479 rc = add_del_listener(info->snd_portid, mask, REGISTER);
471out: 480out:
472 free_cpumask_var(mask); 481 free_cpumask_var(mask);
473 return rc; 482 return rc;
@@ -483,7 +492,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info)
483 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 492 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
484 if (rc < 0) 493 if (rc < 0)
485 goto out; 494 goto out;
486 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 495 rc = add_del_listener(info->snd_portid, mask, DEREGISTER);
487out: 496out:
488 free_cpumask_var(mask); 497 free_cpumask_var(mask);
489 return rc; 498 return rc;
@@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
631 if (rc < 0) 640 if (rc < 0)
632 return; 641 return;
633 642
634 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 643 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID,
644 task_pid_nr_ns(tsk, &init_pid_ns));
635 if (!stats) 645 if (!stats)
636 goto err; 646 goto err;
637 647
638 fill_stats(tsk, stats); 648 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
639 649
640 /* 650 /*
641 * Doesn't matter if tsk is the leader or the last group member leaving 651 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
643 if (!is_thread_group || !group_dead) 653 if (!is_thread_group || !group_dead)
644 goto send; 654 goto send;
645 655
646 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 656 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID,
657 task_tgid_nr_ns(tsk, &init_pid_ns));
647 if (!stats) 658 if (!stats)
648 goto err; 659 goto err;
649 660
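
Besides making the stats fill namespace-aware, the taskstats hunks above replace the fget_light()/fput_needed pair with struct fd and fdget()/fdput(), which carry the "does this reference need dropping" state together with the file pointer. A rough userspace analogue of that bundling; the fdesc type and helpers are invented for illustration only:

#include <stdio.h>

struct fdesc {
	FILE *file;
	int need_close;
};

static struct fdesc fdesc_get(const char *path)
{
	struct fdesc f = { fopen(path, "r"), 1 };

	if (!f.file)
		f.need_close = 0;
	return f;
}

static void fdesc_put(struct fdesc f)
{
	if (f.need_close)
		fclose(f.file);
}

int main(void)
{
	struct fdesc f = fdesc_get("/proc/self/status");

	if (!f.file)
		return 1;
	/* ... use f.file; exactly one cleanup path ... */
	fdesc_put(f);
	return 0;
}
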
diff --git a/kernel/time.c b/kernel/time.c
index ba744cf80696..d226c6a3fd28 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -30,7 +30,7 @@
30#include <linux/export.h> 30#include <linux/export.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h> 33#include <linux/timekeeper_internal.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fd42bd452b75..8601f0db1261 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA
16config GENERIC_TIME_VSYSCALL 16config GENERIC_TIME_VSYSCALL
17 bool 17 bool
18 18
19# Timekeeping vsyscall support
20config GENERIC_TIME_VSYSCALL_OLD
21 bool
22
19# ktime_t scalar 64bit nsec representation 23# ktime_t scalar 64bit nsec representation
20config KTIME_SCALAR 24config KTIME_SCALAR
21 bool 25 bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index aa27d391bfc8..f11d83b12949 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,7 +37,6 @@
37static struct alarm_base { 37static struct alarm_base {
38 spinlock_t lock; 38 spinlock_t lock;
39 struct timerqueue_head timerqueue; 39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void); 40 ktime_t (*gettime)(void);
42 clockid_t base_clockid; 41 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE]; 42} alarm_bases[ALARM_NUMTYPE];
@@ -46,6 +45,8 @@ static struct alarm_base {
46static ktime_t freezer_delta; 45static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock); 46static DEFINE_SPINLOCK(freezer_delta_lock);
48 47
48static struct wakeup_source *ws;
49
49#ifdef CONFIG_RTC_CLASS 50#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */ 51/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer; 52static struct rtc_timer rtctimer;
@@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { }
130 * @base: pointer to the base where the timer is being run 131 * @base: pointer to the base where the timer is being run
131 * @alarm: pointer to alarm being enqueued. 132 * @alarm: pointer to alarm being enqueued.
132 * 133 *
133 * Adds alarm to a alarm_base timerqueue and if necessary sets 134 * Adds alarm to a alarm_base timerqueue
134 * an hrtimer to run.
135 * 135 *
136 * Must hold base->lock when calling. 136 * Must hold base->lock when calling.
137 */ 137 */
138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
139{ 139{
140 if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
141 timerqueue_del(&base->timerqueue, &alarm->node);
142
140 timerqueue_add(&base->timerqueue, &alarm->node); 143 timerqueue_add(&base->timerqueue, &alarm->node);
141 alarm->state |= ALARMTIMER_STATE_ENQUEUED; 144 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
142
143 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
144 hrtimer_try_to_cancel(&base->timer);
145 hrtimer_start(&base->timer, alarm->node.expires,
146 HRTIMER_MODE_ABS);
147 }
148} 145}
149 146
150/** 147/**
151 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue 148 * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
152 * @base: pointer to the base where the timer is running 149 * @base: pointer to the base where the timer is running
153 * @alarm: pointer to alarm being removed 150 * @alarm: pointer to alarm being removed
154 * 151 *
155 * Removes alarm to a alarm_base timerqueue and if necessary sets 152 * Removes alarm to a alarm_base timerqueue
156 * a new timer to run.
157 * 153 *
158 * Must hold base->lock when calling. 154 * Must hold base->lock when calling.
159 */ 155 */
160static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) 156static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
161{ 157{
162 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
163
164 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) 158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
165 return; 159 return;
166 160
167 timerqueue_del(&base->timerqueue, &alarm->node); 161 timerqueue_del(&base->timerqueue, &alarm->node);
168 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; 162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
169
170 if (next == &alarm->node) {
171 hrtimer_try_to_cancel(&base->timer);
172 next = timerqueue_getnext(&base->timerqueue);
173 if (!next)
174 return;
175 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
176 }
177} 163}
178 164
179 165
@@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
188 */ 174 */
189static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) 175static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
190{ 176{
191 struct alarm_base *base = container_of(timer, struct alarm_base, timer); 177 struct alarm *alarm = container_of(timer, struct alarm, timer);
192 struct timerqueue_node *next; 178 struct alarm_base *base = &alarm_bases[alarm->type];
193 unsigned long flags; 179 unsigned long flags;
194 ktime_t now;
195 int ret = HRTIMER_NORESTART; 180 int ret = HRTIMER_NORESTART;
196 int restart = ALARMTIMER_NORESTART; 181 int restart = ALARMTIMER_NORESTART;
197 182
198 spin_lock_irqsave(&base->lock, flags); 183 spin_lock_irqsave(&base->lock, flags);
199 now = base->gettime(); 184 alarmtimer_dequeue(base, alarm);
200 while ((next = timerqueue_getnext(&base->timerqueue))) { 185 spin_unlock_irqrestore(&base->lock, flags);
201 struct alarm *alarm;
202 ktime_t expired = next->expires;
203
204 if (expired.tv64 > now.tv64)
205 break;
206
207 alarm = container_of(next, struct alarm, node);
208
209 timerqueue_del(&base->timerqueue, &alarm->node);
210 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
211
212 alarm->state |= ALARMTIMER_STATE_CALLBACK;
213 spin_unlock_irqrestore(&base->lock, flags);
214 if (alarm->function)
215 restart = alarm->function(alarm, now);
216 spin_lock_irqsave(&base->lock, flags);
217 alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
218 186
219 if (restart != ALARMTIMER_NORESTART) { 187 if (alarm->function)
220 timerqueue_add(&base->timerqueue, &alarm->node); 188 restart = alarm->function(alarm, base->gettime());
221 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
222 }
223 }
224 189
225 if (next) { 190 spin_lock_irqsave(&base->lock, flags);
226 hrtimer_set_expires(&base->timer, next->expires); 191 if (restart != ALARMTIMER_NORESTART) {
192 hrtimer_set_expires(&alarm->timer, alarm->node.expires);
193 alarmtimer_enqueue(base, alarm);
227 ret = HRTIMER_RESTART; 194 ret = HRTIMER_RESTART;
228 } 195 }
229 spin_unlock_irqrestore(&base->lock, flags); 196 spin_unlock_irqrestore(&base->lock, flags);
@@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev)
250 unsigned long flags; 217 unsigned long flags;
251 struct rtc_device *rtc; 218 struct rtc_device *rtc;
252 int i; 219 int i;
220 int ret;
253 221
254 spin_lock_irqsave(&freezer_delta_lock, flags); 222 spin_lock_irqsave(&freezer_delta_lock, flags);
255 min = freezer_delta; 223 min = freezer_delta;
@@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev)
279 if (min.tv64 == 0) 247 if (min.tv64 == 0)
280 return 0; 248 return 0;
281 249
282 /* XXX - Should we enforce a minimum sleep time? */ 250 if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
283 WARN_ON(min.tv64 < NSEC_PER_SEC); 251 __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
252 return -EBUSY;
253 }
284 254
285 /* Setup an rtc timer to fire that far in the future */ 255 /* Setup an rtc timer to fire that far in the future */
286 rtc_timer_cancel(rtc, &rtctimer); 256 rtc_timer_cancel(rtc, &rtctimer);
@@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev)
288 now = rtc_tm_to_ktime(tm); 258 now = rtc_tm_to_ktime(tm);
289 now = ktime_add(now, min); 259 now = ktime_add(now, min);
290 260
291 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); 261 /* Set alarm, if in the past reject suspend briefly to handle */
292 262 ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
293 return 0; 263 if (ret < 0)
264 __pm_wakeup_event(ws, MSEC_PER_SEC);
265 return ret;
294} 266}
295#else 267#else
296static int alarmtimer_suspend(struct device *dev) 268static int alarmtimer_suspend(struct device *dev)
@@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
324 enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) 296 enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
325{ 297{
326 timerqueue_init(&alarm->node); 298 timerqueue_init(&alarm->node);
299 hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
300 HRTIMER_MODE_ABS);
301 alarm->timer.function = alarmtimer_fired;
327 alarm->function = function; 302 alarm->function = function;
328 alarm->type = type; 303 alarm->type = type;
329 alarm->state = ALARMTIMER_STATE_INACTIVE; 304 alarm->state = ALARMTIMER_STATE_INACTIVE;
@@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
334 * @alarm: ptr to alarm to set 309 * @alarm: ptr to alarm to set
335 * @start: time to run the alarm 310 * @start: time to run the alarm
336 */ 311 */
337void alarm_start(struct alarm *alarm, ktime_t start) 312int alarm_start(struct alarm *alarm, ktime_t start)
338{ 313{
339 struct alarm_base *base = &alarm_bases[alarm->type]; 314 struct alarm_base *base = &alarm_bases[alarm->type];
340 unsigned long flags; 315 unsigned long flags;
316 int ret;
341 317
342 spin_lock_irqsave(&base->lock, flags); 318 spin_lock_irqsave(&base->lock, flags);
343 if (alarmtimer_active(alarm))
344 alarmtimer_remove(base, alarm);
345 alarm->node.expires = start; 319 alarm->node.expires = start;
346 alarmtimer_enqueue(base, alarm); 320 alarmtimer_enqueue(base, alarm);
321 ret = hrtimer_start(&alarm->timer, alarm->node.expires,
322 HRTIMER_MODE_ABS);
347 spin_unlock_irqrestore(&base->lock, flags); 323 spin_unlock_irqrestore(&base->lock, flags);
324 return ret;
348} 325}
349 326
350/** 327/**
@@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm)
358{ 335{
359 struct alarm_base *base = &alarm_bases[alarm->type]; 336 struct alarm_base *base = &alarm_bases[alarm->type];
360 unsigned long flags; 337 unsigned long flags;
361 int ret = -1; 338 int ret;
362 spin_lock_irqsave(&base->lock, flags);
363
364 if (alarmtimer_callback_running(alarm))
365 goto out;
366 339
367 if (alarmtimer_is_queued(alarm)) { 340 spin_lock_irqsave(&base->lock, flags);
368 alarmtimer_remove(base, alarm); 341 ret = hrtimer_try_to_cancel(&alarm->timer);
369 ret = 1; 342 if (ret >= 0)
370 } else 343 alarmtimer_dequeue(base, alarm);
371 ret = 0;
372out:
373 spin_unlock_irqrestore(&base->lock, flags); 344 spin_unlock_irqrestore(&base->lock, flags);
374 return ret; 345 return ret;
375} 346}
@@ -802,10 +773,6 @@ static int __init alarmtimer_init(void)
802 for (i = 0; i < ALARM_NUMTYPE; i++) { 773 for (i = 0; i < ALARM_NUMTYPE; i++) {
803 timerqueue_init_head(&alarm_bases[i].timerqueue); 774 timerqueue_init_head(&alarm_bases[i].timerqueue);
804 spin_lock_init(&alarm_bases[i].lock); 775 spin_lock_init(&alarm_bases[i].lock);
805 hrtimer_init(&alarm_bases[i].timer,
806 alarm_bases[i].base_clockid,
807 HRTIMER_MODE_ABS);
808 alarm_bases[i].timer.function = alarmtimer_fired;
809 } 776 }
810 777
811 error = alarmtimer_rtc_interface_setup(); 778 error = alarmtimer_rtc_interface_setup();
@@ -821,6 +788,7 @@ static int __init alarmtimer_init(void)
821 error = PTR_ERR(pdev); 788 error = PTR_ERR(pdev);
822 goto out_drv; 789 goto out_drv;
823 } 790 }
791 ws = wakeup_source_register("alarmtimer");
824 return 0; 792 return 0;
825 793
826out_drv: 794out_drv:
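
The alarmtimer rework above gives every struct alarm its own hrtimer instead of multiplexing one hrtimer per alarm_base, so arming or cancelling one alarm never has to reprogram a timer shared with the others. A loose userspace analogue using one POSIX timer per alarm (illustrative only; older glibc needs -lrt):

#define _POSIX_C_SOURCE 200809L
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

struct alarm_demo {
	timer_t timer;		/* one timer per alarm, like alarm->timer above */
	const char *name;
};

static void fired(union sigval sv)
{
	struct alarm_demo *a = sv.sival_ptr;

	printf("%s fired\n", a->name);
}

static int alarm_demo_start(struct alarm_demo *a, long ms)
{
	struct sigevent sev = {
		.sigev_notify = SIGEV_THREAD,
		.sigev_notify_function = fired,
		.sigev_value.sival_ptr = a,
	};
	struct itimerspec its = {
		.it_value = { .tv_sec = ms / 1000,
			      .tv_nsec = (ms % 1000) * 1000000L },
	};

	if (timer_create(CLOCK_MONOTONIC, &sev, &a->timer))
		return -1;
	return timer_settime(a->timer, 0, &its, NULL);
}

int main(void)
{
	struct alarm_demo a = { .name = "a" }, b = { .name = "b" };

	alarm_demo_start(&a, 100);
	alarm_demo_start(&b, 200);	/* independent of a's timer */
	sleep(1);
	return 0;
}
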
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 7e1ce012a851..30b6de0d977c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old,
397 local_irq_restore(flags); 397 local_irq_restore(flags);
398} 398}
399 399
400/**
401 * clockevents_suspend - suspend clock devices
402 */
403void clockevents_suspend(void)
404{
405 struct clock_event_device *dev;
406
407 list_for_each_entry_reverse(dev, &clockevent_devices, list)
408 if (dev->suspend)
409 dev->suspend(dev);
410}
411
412/**
413 * clockevents_resume - resume clock devices
414 */
415void clockevents_resume(void)
416{
417 struct clock_event_device *dev;
418
419 list_for_each_entry(dev, &clockevent_devices, list)
420 if (dev->resume)
421 dev->resume(dev);
422}
423
400#ifdef CONFIG_GENERIC_CLOCKEVENTS 424#ifdef CONFIG_GENERIC_CLOCKEVENTS
401/** 425/**
402 * clockevents_notify - notification about relevant events 426 * clockevents_notify - notification about relevant events
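
clockevents_suspend() above walks the device list in reverse and clockevents_resume() walks it forward, the usual "tear down in reverse registration order" convention. A trivial sketch of that ordering; the device names are made up:

#include <stdio.h>

struct dev { const char *name; };

int main(void)
{
	struct dev devs[] = { {"broadcast"}, {"percpu0"}, {"percpu1"} };
	int n = sizeof(devs) / sizeof(devs[0]);

	for (int i = n - 1; i >= 0; i--)	/* clockevents_suspend() order */
		printf("suspend %s\n", devs[i].name);
	for (int i = 0; i < n; i++)		/* clockevents_resume() order */
		printf("resume  %s\n", devs[i].name);
	return 0;
}
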
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 46da0537c10b..6629bf7b5285 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
37 * requested HZ value. It is also not recommended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) 40#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
41 41
42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier 42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
43 * conversion, the .shift value could be zero. However 43 * conversion, the .shift value could be zero. However
@@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)
95{ 95{
96 return &clocksource_jiffies; 96 return &clocksource_jiffies;
97} 97}
98
99struct clocksource refined_jiffies;
100
101int register_refined_jiffies(long cycles_per_second)
102{
103 u64 nsec_per_tick, shift_hz;
104 long cycles_per_tick;
105
106
107
108 refined_jiffies = clocksource_jiffies;
109 refined_jiffies.name = "refined-jiffies";
110 refined_jiffies.rating++;
111
112 /* Calc cycles per tick */
113 cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
114 /* shift_hz stores hz<<8 for extra accuracy */
115 shift_hz = (u64)cycles_per_second << 8;
116 shift_hz += cycles_per_tick/2;
117 do_div(shift_hz, cycles_per_tick);
118 /* Calculate nsec_per_tick using shift_hz */
119 nsec_per_tick = (u64)NSEC_PER_SEC << 8;
120 nsec_per_tick += (u32)shift_hz/2;
121 do_div(nsec_per_tick, (u32)shift_hz);
122
123 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
124
125 clocksource_register(&refined_jiffies);
126 return 0;
127}
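
register_refined_jiffies() above refines the jiffies clocksource mult from a measured tick rate, carrying eight extra bits of precision in shift_hz. A standalone worked example of the same arithmetic, assuming JIFFIES_SHIFT is 8 and using the classic 1193182 Hz PIT with HZ=100 purely as illustrative inputs:

#include <stdint.h>
#include <stdio.h>

#define HZ		100
#define NSEC_PER_SEC	1000000000ULL
#define JIFFIES_SHIFT	8		/* assumed shift of the jiffies clocksource */

int main(void)
{
	uint64_t cycles_per_second = 1193182;	/* illustrative: i8254 PIT rate */
	uint64_t cycles_per_tick, shift_hz, nsec_per_tick, mult;

	/* cycles per tick, rounded to nearest */
	cycles_per_tick = (cycles_per_second + HZ / 2) / HZ;
	/* shift_hz keeps hz << 8 for eight extra bits of precision */
	shift_hz = (cycles_per_second << 8) + cycles_per_tick / 2;
	shift_hz /= cycles_per_tick;
	/* nanoseconds per tick, measured against the shifted hz */
	nsec_per_tick = (NSEC_PER_SEC << 8) + (uint32_t)shift_hz / 2;
	nsec_per_tick /= (uint32_t)shift_hz;

	mult = (uint64_t)(uint32_t)nsec_per_tick << JIFFIES_SHIFT;
	printf("cycles/tick = %llu, refined mult = %llu\n",
	       (unsigned long long)cycles_per_tick,
	       (unsigned long long)mult);
	return 0;
}
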
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f423bdd035c2..a40260885265 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
835 */ 835 */
836 if (ts->tick_stopped) { 836 if (ts->tick_stopped) {
837 touch_softlockup_watchdog(); 837 touch_softlockup_watchdog();
838 if (idle_cpu(cpu)) 838 if (is_idle_task(current))
839 ts->idle_jiffies++; 839 ts->idle_jiffies++;
840 } 840 }
841 update_process_times(user_mode(regs)); 841 update_process_times(user_mode(regs));
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d3b91e75cecd..e424970bb562 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/timekeeper_internal.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
13#include <linux/percpu.h> 14#include <linux/percpu.h>
@@ -21,61 +22,6 @@
21#include <linux/tick.h> 22#include <linux/tick.h>
22#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
23 24
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */
29 u32 mult;
30 /* The shift value of the current clocksource. */
31 u32 shift;
32 /* Number of clock cycles in one NTP interval. */
33 cycle_t cycle_interval;
34 /* Number of clock shifted nano seconds in one NTP interval. */
35 u64 xtime_interval;
36 /* shifted nano seconds left over when rounding cycle_interval */
37 s64 xtime_remainder;
38 /* Raw nano seconds accumulated per NTP interval. */
39 u32 raw_interval;
40
41 /* Current CLOCK_REALTIME time in seconds */
42 u64 xtime_sec;
43 /* Clock shifted nano seconds */
44 u64 xtime_nsec;
45
46 /* Difference between accumulated time and NTP time in ntp
47 * shifted nano seconds. */
48 s64 ntp_error;
49 /* Shift conversion between clock shifted nano seconds and
50 * ntp shifted nano seconds. */
51 u32 ntp_error_shift;
52
53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
56 * at zero at system boot time, so wall_to_monotonic will be negative,
57 * however, we will ALWAYS keep the tv_nsec part positive so we can use
58 * the usual normalization.
59 *
60 * wall_to_monotonic is moved after resume from suspend for the
61 * monotonic time not to jump. We need to add total_sleep_time to
62 * wall_to_monotonic to get the real boot based time offset.
63 *
64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead.
66 */
67 struct timespec wall_to_monotonic;
68 /* Offset clock monotonic -> clock realtime */
69 ktime_t offs_real;
70 /* time spent in suspend */
71 struct timespec total_sleep_time;
72 /* Offset clock monotonic -> clock boottime */
73 ktime_t offs_boot;
74 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
75 struct timespec raw_time;
76 /* Seqlock for all timekeeper values */
77 seqlock_t lock;
78};
79 25
80static struct timekeeper timekeeper; 26static struct timekeeper timekeeper;
81 27
@@ -96,15 +42,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
96 } 42 }
97} 43}
98 44
99static struct timespec tk_xtime(struct timekeeper *tk)
100{
101 struct timespec ts;
102
103 ts.tv_sec = tk->xtime_sec;
104 ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
105 return ts;
106}
107
108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 45static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
109{ 46{
110 tk->xtime_sec = ts->tv_sec; 47 tk->xtime_sec = ts->tv_sec;
@@ -246,14 +183,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
246/* must hold write on timekeeper.lock */ 183/* must hold write on timekeeper.lock */
247static void timekeeping_update(struct timekeeper *tk, bool clearntp) 184static void timekeeping_update(struct timekeeper *tk, bool clearntp)
248{ 185{
249 struct timespec xt;
250
251 if (clearntp) { 186 if (clearntp) {
252 tk->ntp_error = 0; 187 tk->ntp_error = 0;
253 ntp_clear(); 188 ntp_clear();
254 } 189 }
255 xt = tk_xtime(tk); 190 update_vsyscall(tk);
256 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
257} 191}
258 192
259/** 193/**
@@ -776,6 +710,7 @@ static void timekeeping_resume(void)
776 710
777 read_persistent_clock(&ts); 711 read_persistent_clock(&ts);
778 712
713 clockevents_resume();
779 clocksource_resume(); 714 clocksource_resume();
780 715
781 write_seqlock_irqsave(&tk->lock, flags); 716 write_seqlock_irqsave(&tk->lock, flags);
@@ -835,6 +770,7 @@ static int timekeeping_suspend(void)
835 770
836 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 771 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
837 clocksource_suspend(); 772 clocksource_suspend();
773 clockevents_suspend();
838 774
839 return 0; 775 return 0;
840} 776}
@@ -1111,7 +1047,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1111 accumulate_nsecs_to_secs(tk); 1047 accumulate_nsecs_to_secs(tk);
1112 1048
1113 /* Accumulate raw time */ 1049 /* Accumulate raw time */
1114 raw_nsecs = tk->raw_interval << shift; 1050 raw_nsecs = (u64)tk->raw_interval << shift;
1115 raw_nsecs += tk->raw_time.tv_nsec; 1051 raw_nsecs += tk->raw_time.tv_nsec;
1116 if (raw_nsecs >= NSEC_PER_SEC) { 1052 if (raw_nsecs >= NSEC_PER_SEC) {
1117 u64 raw_secs = raw_nsecs; 1053 u64 raw_secs = raw_nsecs;
@@ -1128,6 +1064,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1128 return offset; 1064 return offset;
1129} 1065}
1130 1066
1067#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1068static inline void old_vsyscall_fixup(struct timekeeper *tk)
1069{
1070 s64 remainder;
1071
1072 /*
1073 * Store only full nanoseconds into xtime_nsec after rounding
1074 * it up and add the remainder to the error difference.
1075 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1076 * by truncating the remainder in vsyscalls. However, it causes
1077 * additional work to be done in timekeeping_adjust(). Once
1078 * the vsyscall implementations are converted to use xtime_nsec
1079 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1080 * users are removed, this can be killed.
1081 */
1082 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1083 tk->xtime_nsec -= remainder;
1084 tk->xtime_nsec += 1ULL << tk->shift;
1085 tk->ntp_error += remainder << tk->ntp_error_shift;
1086
1087}
1088#else
1089#define old_vsyscall_fixup(tk)
1090#endif
1091
1092
1093
1131/** 1094/**
1132 * update_wall_time - Uses the current clocksource to increment the wall time 1095 * update_wall_time - Uses the current clocksource to increment the wall time
1133 * 1096 *
@@ -1139,7 +1102,6 @@ static void update_wall_time(void)
1139 cycle_t offset; 1102 cycle_t offset;
1140 int shift = 0, maxshift; 1103 int shift = 0, maxshift;
1141 unsigned long flags; 1104 unsigned long flags;
1142 s64 remainder;
1143 1105
1144 write_seqlock_irqsave(&tk->lock, flags); 1106 write_seqlock_irqsave(&tk->lock, flags);
1145 1107
@@ -1181,20 +1143,11 @@ static void update_wall_time(void)
1181 /* correct the clock when NTP error is too big */ 1143 /* correct the clock when NTP error is too big */
1182 timekeeping_adjust(tk, offset); 1144 timekeeping_adjust(tk, offset);
1183 1145
1184
1185 /* 1146 /*
1186 * Store only full nanoseconds into xtime_nsec after rounding 1147 * XXX This can be killed once everyone converts
1187 * it up and add the remainder to the error difference. 1148 * to the new update_vsyscall.
1188 * XXX - This is necessary to avoid small 1ns inconsistnecies caused 1149 */
1189 * by truncating the remainder in vsyscalls. However, it causes 1150 old_vsyscall_fixup(tk);
1190 * additional work to be done in timekeeping_adjust(). Once
1191 * the vsyscall implementations are converted to use xtime_nsec
1192 * (shifted nanoseconds), this can be killed.
1193 */
1194 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1195 tk->xtime_nsec -= remainder;
1196 tk->xtime_nsec += 1ULL << tk->shift;
1197 tk->ntp_error += remainder << tk->ntp_error_shift;
1198 1151
1199 /* 1152 /*
1200 * Finally, make sure that after the rounding 1153 * Finally, make sure that after the rounding
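
The timekeeping changes above move the old remainder rounding into old_vsyscall_fixup(), used only under GENERIC_TIME_VSYSCALL_OLD: xtime_nsec holds clocksource-shifted nanoseconds, the legacy vsyscall wants whole nanoseconds, so the sub-nanosecond remainder is rounded up and accounted through ntp_error. A small arithmetic sketch with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t shift = 10;			/* illustrative clocksource shift */
	uint64_t xtime_nsec = (123456789ULL << shift) + 700; /* shifted ns + partial ns */
	uint32_t ntp_error_shift = 2;		/* illustrative */
	int64_t ntp_error = 0;

	uint64_t remainder = xtime_nsec & ((1ULL << shift) - 1);
	xtime_nsec -= remainder;		/* drop the partial nanosecond */
	xtime_nsec += 1ULL << shift;		/* ...and round up to a full one */
	ntp_error += (int64_t)remainder << ntp_error_shift; /* remember the error */

	printf("whole ns now: %llu\n",
	       (unsigned long long)(xtime_nsec >> shift));	/* 123456790 */
	printf("ntp_error:    %lld\n", (long long)ntp_error);
	return 0;
}
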
diff --git a/kernel/timer.c b/kernel/timer.c
index d5de1b2292aa..367d00858482 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64);
63#define TVR_SIZE (1 << TVR_BITS) 63#define TVR_SIZE (1 << TVR_BITS)
64#define TVN_MASK (TVN_SIZE - 1) 64#define TVN_MASK (TVN_SIZE - 1)
65#define TVR_MASK (TVR_SIZE - 1) 65#define TVR_MASK (TVR_SIZE - 1)
66#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
66 67
67struct tvec { 68struct tvec {
68 struct list_head vec[TVN_SIZE]; 69 struct list_head vec[TVN_SIZE];
@@ -359,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
359 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); 360 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
360 } else { 361 } else {
361 int i; 362 int i;
362 /* If the timeout is larger than 0xffffffff on 64-bit 363 /* If the timeout is larger than MAX_TVAL (on 64-bit
363 * architectures then we use the maximum timeout: 364 * architectures or with CONFIG_BASE_SMALL=1) then we
365 * use the maximum timeout.
364 */ 366 */
365 if (idx > 0xffffffffUL) { 367 if (idx > MAX_TVAL) {
366 idx = 0xffffffffUL; 368 idx = MAX_TVAL;
367 expires = idx + base->timer_jiffies; 369 expires = idx + base->timer_jiffies;
368 } 370 }
369 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; 371 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
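
The timer.c hunk above replaces the hard-coded 0xffffffff clamp with MAX_TVAL, which follows the wheel geometry (TVR_BITS + 4*TVN_BITS index bits). A quick worked example using the usual bit widths (8/6 by default, 6/4 with CONFIG_BASE_SMALL=1); the widths are quoted from memory, so treat them as assumptions:

#include <stdio.h>

static unsigned long long max_tval(int tvr_bits, int tvn_bits)
{
	return (1ULL << (tvr_bits + 4 * tvn_bits)) - 1;
}

int main(void)
{
	/* default: TVR_BITS=8, TVN_BITS=6 -> the old 0xffffffff clamp */
	printf("default:    %#llx\n", max_tval(8, 6));
	/* CONFIG_BASE_SMALL=1: TVR_BITS=6, TVN_BITS=4 -> far smaller wheel */
	printf("BASE_SMALL: %#llx\n", max_tval(6, 4));
	return 0;
}
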
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1ec5c1dab629..31e4f55773f1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2061,7 +2061,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2061 seq_puts(m, "# -----------------\n"); 2061 seq_puts(m, "# -----------------\n");
2062 seq_printf(m, "# | task: %.16s-%d " 2062 seq_printf(m, "# | task: %.16s-%d "
2063 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", 2063 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
2064 data->comm, data->pid, data->uid, data->nice, 2064 data->comm, data->pid,
2065 from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
2065 data->policy, data->rt_priority); 2066 data->policy, data->rt_priority);
2066 seq_puts(m, "# -----------------\n"); 2067 seq_puts(m, "# -----------------\n");
2067 2068
@@ -4199,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
4199 buf->private = 0; 4200 buf->private = 0;
4200} 4201}
4201 4202
4202static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
4203 struct pipe_buffer *buf)
4204{
4205 return 1;
4206}
4207
4208static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, 4203static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
4209 struct pipe_buffer *buf) 4204 struct pipe_buffer *buf)
4210{ 4205{
@@ -4220,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
4220 .unmap = generic_pipe_buf_unmap, 4215 .unmap = generic_pipe_buf_unmap,
4221 .confirm = generic_pipe_buf_confirm, 4216 .confirm = generic_pipe_buf_confirm,
4222 .release = buffer_pipe_buf_release, 4217 .release = buffer_pipe_buf_release,
4223 .steal = buffer_pipe_buf_steal, 4218 .steal = generic_pipe_buf_steal,
4224 .get = buffer_pipe_buf_get, 4219 .get = buffer_pipe_buf_get,
4225}; 4220};
4226 4221
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 63a2da0b9a6e..c15f528c1af4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -147,7 +147,7 @@ struct trace_array_cpu {
147 unsigned long skipped_entries; 147 unsigned long skipped_entries;
148 cycle_t preempt_timestamp; 148 cycle_t preempt_timestamp;
149 pid_t pid; 149 pid_t pid;
150 uid_t uid; 150 kuid_t uid;
151 char comm[TASK_COMM_LEN]; 151 char comm[TASK_COMM_LEN];
152}; 152};
153 153
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 483162a9f908..507a7a9630bf 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,7 +13,6 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/pstore.h>
17#include <linux/fs.h> 16#include <linux/fs.h>
18 17
19#include "trace.h" 18#include "trace.h"
@@ -76,10 +75,9 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
76 preempt_enable_notrace(); 75 preempt_enable_notrace();
77} 76}
78 77
79/* Our two options */ 78/* Our option */
80enum { 79enum {
81 TRACE_FUNC_OPT_STACK = 0x1, 80 TRACE_FUNC_OPT_STACK = 0x1,
82 TRACE_FUNC_OPT_PSTORE = 0x2,
83}; 81};
84 82
85static struct tracer_flags func_flags; 83static struct tracer_flags func_flags;
@@ -109,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
109 disabled = atomic_inc_return(&data->disabled); 107 disabled = atomic_inc_return(&data->disabled);
110 108
111 if (likely(disabled == 1)) { 109 if (likely(disabled == 1)) {
112 /*
113 * So far tracing doesn't support multiple buffers, so
114 * we make an explicit call for now.
115 */
116 if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
117 pstore_ftrace_call(ip, parent_ip);
118 pc = preempt_count(); 110 pc = preempt_count();
119 trace_function(tr, ip, parent_ip, flags, pc); 111 trace_function(tr, ip, parent_ip, flags, pc);
120 } 112 }
@@ -181,9 +173,6 @@ static struct tracer_opt func_opts[] = {
181#ifdef CONFIG_STACKTRACE 173#ifdef CONFIG_STACKTRACE
182 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 174 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
183#endif 175#endif
184#ifdef CONFIG_PSTORE_FTRACE
185 { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
186#endif
187 { } /* Always set a last empty entry */ 176 { } /* Always set a last empty entry */
188}; 177};
189 178
@@ -236,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
236 } 225 }
237 226
238 break; 227 break;
239 case TRACE_FUNC_OPT_PSTORE:
240 break;
241 default: 228 default:
242 return -EINVAL; 229 return -EINVAL;
243 } 230 }
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 23b4d784ebdd..625df0b44690 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -26,7 +26,9 @@
26/* 26/*
27 * fill in basic accounting fields 27 * fill in basic accounting fields
28 */ 28 */
29void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 29void bacct_add_tsk(struct user_namespace *user_ns,
30 struct pid_namespace *pid_ns,
31 struct taskstats *stats, struct task_struct *tsk)
30{ 32{
31 const struct cred *tcred; 33 const struct cred *tcred;
32 struct timespec uptime, ts; 34 struct timespec uptime, ts;
@@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
55 stats->ac_flag |= AXSIG; 57 stats->ac_flag |= AXSIG;
56 stats->ac_nice = task_nice(tsk); 58 stats->ac_nice = task_nice(tsk);
57 stats->ac_sched = tsk->policy; 59 stats->ac_sched = tsk->policy;
58 stats->ac_pid = tsk->pid; 60 stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
59 rcu_read_lock(); 61 rcu_read_lock();
60 tcred = __task_cred(tsk); 62 tcred = __task_cred(tsk);
61 stats->ac_uid = tcred->uid; 63 stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
62 stats->ac_gid = tcred->gid; 64 stats->ac_gid = from_kgid_munged(user_ns, tcred->gid);
63 stats->ac_ppid = pid_alive(tsk) ? 65 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
65 rcu_read_unlock(); 67 rcu_read_unlock();
66 stats->ac_utime = cputime_to_usecs(tsk->utime); 68 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_usecs(tsk->stime); 69 stats->ac_stime = cputime_to_usecs(tsk->stime);
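
bacct_add_tsk() above now reports ids through from_kuid_munged()/from_kgid_munged() and pids through task_*_nr_ns(), i.e. relative to the reader's namespaces, falling back to an overflow id when no mapping exists. A self-contained sketch of the "munged" translation idea, with an invented single-extent map:

#include <stdio.h>

#define OVERFLOW_UID 65534	/* the usual "nobody" overflow id */

struct id_extent { unsigned first, lower_first, count; };

static long map_id_up(const struct id_extent *map, unsigned kuid)
{
	if (kuid >= map->lower_first && kuid - map->lower_first < map->count)
		return map->first + (kuid - map->lower_first);
	return -1;				/* no mapping */
}

static unsigned kuid_munged(const struct id_extent *map, unsigned kuid)
{
	long uid = map_id_up(map, kuid);

	return uid < 0 ? OVERFLOW_UID : (unsigned)uid;
}

int main(void)
{
	/* container maps its uids 0..999 onto kernel uids 100000..100999 */
	struct id_extent map = { .first = 0, .lower_first = 100000, .count = 1000 };

	printf("%u\n", kuid_munged(&map, 100123));	/* 123   */
	printf("%u\n", kuid_munged(&map, 5));		/* 65534 */
	return 0;
}
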
diff --git a/kernel/user.c b/kernel/user.c
index b815fefbe76f..750acffbe9ec 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -38,6 +38,14 @@ struct user_namespace init_user_ns = {
38 .count = 4294967295U, 38 .count = 4294967295U,
39 }, 39 },
40 }, 40 },
41 .projid_map = {
42 .nr_extents = 1,
43 .extent[0] = {
44 .first = 0,
45 .lower_first = 0,
46 .count = 4294967295U,
47 },
48 },
41 .kref = { 49 .kref = {
42 .refcount = ATOMIC_INIT(3), 50 .refcount = ATOMIC_INIT(3),
43 }, 51 },
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 86602316422d..456a6b9fba34 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -19,6 +19,7 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/uaccess.h> 20#include <linux/uaccess.h>
21#include <linux/ctype.h> 21#include <linux/ctype.h>
22#include <linux/projid.h>
22 23
23static struct kmem_cache *user_ns_cachep __read_mostly; 24static struct kmem_cache *user_ns_cachep __read_mostly;
24 25
@@ -295,6 +296,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
295} 296}
296EXPORT_SYMBOL(from_kgid_munged); 297EXPORT_SYMBOL(from_kgid_munged);
297 298
299/**
300 * make_kprojid - Map a user-namespace projid pair into a kprojid.
301 * @ns: User namespace that the projid is in
302 * @projid: Project identifier
303 *
304 * Maps a user-namespace projid pair into a kernel internal kprojid,
305 * and returns that kprojid.
306 *
307 * When there is no mapping defined for the user-namespace projid
308 * pair INVALID_PROJID is returned. Callers are expected to test
309 * for and handle INVALID_PROJID being returned. INVALID_PROJID
310 * may be tested for using projid_valid().
311 */
312kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
313{
314	/* Map the projid to a global kernel projid */
315 return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
316}
317EXPORT_SYMBOL(make_kprojid);
318
319/**
320 * from_kprojid - Create a projid from a kprojid user-namespace pair.
321 * @targ: The user namespace we want a projid in.
322 * @kprojid: The kernel internal project identifier to start with.
323 *
324 * Map @kprojid into the user-namespace specified by @targ and
325 * return the resulting projid.
326 *
327 * There is always a mapping into the initial user_namespace.
328 *
329 * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
330 */
331projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
332{
333	/* Map the projid from a global kernel projid */
334 return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
335}
336EXPORT_SYMBOL(from_kprojid);
337
338/**
339 * from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
340 * @targ: The user namespace we want a projid in.
341 * @kprojid: The kernel internal projid to start with.
342 *
343 * Map @kprojid into the user-namespace specified by @targ and
344 * return the resulting projid.
345 *
346 * There is always a mapping into the initial user_namespace.
347 *
348 * Unlike from_kprojid from_kprojid_munged never fails and always
349 * returns a valid projid. This makes from_kprojid_munged
350 * appropriate for use in syscalls like stat, where
351 * failing the system call and failing to provide a valid projid are
352 * not an option.
353 *
354 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
355 */
356projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
357{
358 projid_t projid;
359 projid = from_kprojid(targ, kprojid);
360
361 if (projid == (projid_t) -1)
362 projid = OVERFLOW_PROJID;
363 return projid;
364}
365EXPORT_SYMBOL(from_kprojid_munged);
366
367
298static int uid_m_show(struct seq_file *seq, void *v) 368static int uid_m_show(struct seq_file *seq, void *v)
299{ 369{
300 struct user_namespace *ns = seq->private; 370 struct user_namespace *ns = seq->private;
@@ -337,6 +407,27 @@ static int gid_m_show(struct seq_file *seq, void *v)
337 return 0; 407 return 0;
338} 408}
339 409
410static int projid_m_show(struct seq_file *seq, void *v)
411{
412 struct user_namespace *ns = seq->private;
413 struct uid_gid_extent *extent = v;
414 struct user_namespace *lower_ns;
415 projid_t lower;
416
417 lower_ns = seq_user_ns(seq);
418 if ((lower_ns == ns) && lower_ns->parent)
419 lower_ns = lower_ns->parent;
420
421 lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));
422
423 seq_printf(seq, "%10u %10u %10u\n",
424 extent->first,
425 lower,
426 extent->count);
427
428 return 0;
429}
430
340static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) 431static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)
341{ 432{
342 struct uid_gid_extent *extent = NULL; 433 struct uid_gid_extent *extent = NULL;
@@ -362,6 +453,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
362 return m_start(seq, ppos, &ns->gid_map); 453 return m_start(seq, ppos, &ns->gid_map);
363} 454}
364 455
456static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
457{
458 struct user_namespace *ns = seq->private;
459
460 return m_start(seq, ppos, &ns->projid_map);
461}
462
365static void *m_next(struct seq_file *seq, void *v, loff_t *pos) 463static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
366{ 464{
367 (*pos)++; 465 (*pos)++;
@@ -387,6 +485,13 @@ struct seq_operations proc_gid_seq_operations = {
387 .show = gid_m_show, 485 .show = gid_m_show,
388}; 486};
389 487
488struct seq_operations proc_projid_seq_operations = {
489 .start = projid_m_start,
490 .stop = m_stop,
491 .next = m_next,
492 .show = projid_m_show,
493};
494
390static DEFINE_MUTEX(id_map_mutex); 495static DEFINE_MUTEX(id_map_mutex);
391 496
392static ssize_t map_write(struct file *file, const char __user *buf, 497static ssize_t map_write(struct file *file, const char __user *buf,
@@ -434,7 +539,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
434 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID 539 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
435 * over the user namespace in order to set the id mapping. 540 * over the user namespace in order to set the id mapping.
436 */ 541 */
437 if (!ns_capable(ns, cap_setid)) 542 if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
438 goto out; 543 goto out;
439 544
440 /* Get a buffer */ 545 /* Get a buffer */
@@ -584,9 +689,30 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
584 &ns->gid_map, &ns->parent->gid_map); 689 &ns->gid_map, &ns->parent->gid_map);
585} 690}
586 691
692ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
693{
694 struct seq_file *seq = file->private_data;
695 struct user_namespace *ns = seq->private;
696 struct user_namespace *seq_ns = seq_user_ns(seq);
697
698 if (!ns->parent)
699 return -EPERM;
700
701 if ((seq_ns != ns) && (seq_ns != ns->parent))
702 return -EPERM;
703
704 /* Anyone can set any valid project id no capability needed */
705 return map_write(file, buf, size, ppos, -1,
706 &ns->projid_map, &ns->parent->projid_map);
707}
708
587static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 709static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
588 struct uid_gid_map *new_map) 710 struct uid_gid_map *new_map)
589{ 711{
712 /* Allow anyone to set a mapping that doesn't require privilege */
713 if (!cap_valid(cap_setid))
714 return true;
715
590 /* Allow the specified ids if we have the appropriate capability 716 /* Allow the specified ids if we have the appropriate capability
591 * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 717 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
592 */ 718 */
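
The projid_map additions above mirror the uid/gid maps: the per-namespace map is shown and parsed as "first lower-first count" extents, and, per proc_projid_map_write(), writing it needs no extra capability. A tiny sketch that just prints a map in the same three-column format; the extent values are illustrative:

#include <stdio.h>

struct extent { unsigned first, lower_first, count; };

int main(void)
{
	struct extent projid_map[] = {
		{ 0, 100000, 1000 },	/* projids 0..999 -> 100000..100999 */
	};
	unsigned n = sizeof(projid_map) / sizeof(projid_map[0]);

	for (unsigned i = 0; i < n; i++)	/* same format as projid_m_show() */
		printf("%10u %10u %10u\n", projid_map[i].first,
		       projid_map[i].lower_first, projid_map[i].count);
	return 0;
}
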
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3c5a79e2134c..d951daa0ca9a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -58,7 +58,7 @@ enum {
58 * be executing on any CPU. The gcwq behaves as an unbound one. 58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 * 59 *
60 * Note that DISASSOCIATED can be flipped only while holding 60 * Note that DISASSOCIATED can be flipped only while holding
61 * managership of all pools on the gcwq to avoid changing binding 61 * assoc_mutex of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress. 62 * state while create_worker() is in progress.
63 */ 63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ 64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
@@ -73,11 +73,10 @@ enum {
73 WORKER_DIE = 1 << 1, /* die die die */ 73 WORKER_DIE = 1 << 1, /* die die die */
74 WORKER_IDLE = 1 << 2, /* is idle */ 74 WORKER_IDLE = 1 << 2, /* is idle */
75 WORKER_PREP = 1 << 3, /* preparing to run works */ 75 WORKER_PREP = 1 << 3, /* preparing to run works */
76 WORKER_REBIND = 1 << 5, /* mom is home, come back */
77 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 76 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
78 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 77 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
79 78
80 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | 79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
81 WORKER_CPU_INTENSIVE, 80 WORKER_CPU_INTENSIVE,
82 81
83 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
@@ -126,7 +125,6 @@ enum {
126 125
127struct global_cwq; 126struct global_cwq;
128struct worker_pool; 127struct worker_pool;
129struct idle_rebind;
130 128
131/* 129/*
132 * The poor guys doing the actual heavy lifting. All on-duty workers 130 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -150,7 +148,6 @@ struct worker {
150 int id; /* I: worker id */ 148 int id; /* I: worker id */
151 149
152 /* for rebinding worker to CPU */ 150 /* for rebinding worker to CPU */
153 struct idle_rebind *idle_rebind; /* L: for idle worker */
154 struct work_struct rebind_work; /* L: for busy worker */ 151 struct work_struct rebind_work; /* L: for busy worker */
155}; 152};
156 153
@@ -160,13 +157,15 @@ struct worker_pool {
160 157
161 struct list_head worklist; /* L: list of pending works */ 158 struct list_head worklist; /* L: list of pending works */
162 int nr_workers; /* L: total number of workers */ 159 int nr_workers; /* L: total number of workers */
160
161 /* nr_idle includes the ones off idle_list for rebinding */
163 int nr_idle; /* L: currently idle ones */ 162 int nr_idle; /* L: currently idle ones */
164 163
165 struct list_head idle_list; /* X: list of idle workers */ 164 struct list_head idle_list; /* X: list of idle workers */
166 struct timer_list idle_timer; /* L: worker idle timeout */ 165 struct timer_list idle_timer; /* L: worker idle timeout */
167 struct timer_list mayday_timer; /* L: SOS timer for workers */ 166 struct timer_list mayday_timer; /* L: SOS timer for workers */
168 167
169 struct mutex manager_mutex; /* mutex manager should hold */ 168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */
170 struct ida worker_ida; /* L: for worker IDs */ 169 struct ida worker_ida; /* L: for worker IDs */
171}; 170};
172 171
@@ -184,9 +183,8 @@ struct global_cwq {
184 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
185 /* L: hash of busy workers */ 184 /* L: hash of busy workers */
186 185
187 struct worker_pool pools[2]; /* normal and highpri pools */ 186 struct worker_pool pools[NR_WORKER_POOLS];
188 187 /* normal and highpri pools */
189 wait_queue_head_t rebind_hold; /* rebind hold wait */
190} ____cacheline_aligned_in_smp; 188} ____cacheline_aligned_in_smp;
191 189
192/* 190/*
@@ -269,17 +267,15 @@ struct workqueue_struct {
269}; 267};
270 268
271struct workqueue_struct *system_wq __read_mostly; 269struct workqueue_struct *system_wq __read_mostly;
272struct workqueue_struct *system_long_wq __read_mostly;
273struct workqueue_struct *system_nrt_wq __read_mostly;
274struct workqueue_struct *system_unbound_wq __read_mostly;
275struct workqueue_struct *system_freezable_wq __read_mostly;
276struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
277EXPORT_SYMBOL_GPL(system_wq); 270EXPORT_SYMBOL_GPL(system_wq);
271struct workqueue_struct *system_highpri_wq __read_mostly;
272EXPORT_SYMBOL_GPL(system_highpri_wq);
273struct workqueue_struct *system_long_wq __read_mostly;
278EXPORT_SYMBOL_GPL(system_long_wq); 274EXPORT_SYMBOL_GPL(system_long_wq);
279EXPORT_SYMBOL_GPL(system_nrt_wq); 275struct workqueue_struct *system_unbound_wq __read_mostly;
280EXPORT_SYMBOL_GPL(system_unbound_wq); 276EXPORT_SYMBOL_GPL(system_unbound_wq);
277struct workqueue_struct *system_freezable_wq __read_mostly;
281EXPORT_SYMBOL_GPL(system_freezable_wq); 278EXPORT_SYMBOL_GPL(system_freezable_wq);
282EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
283 279
284#define CREATE_TRACE_POINTS 280#define CREATE_TRACE_POINTS
285#include <trace/events/workqueue.h> 281#include <trace/events/workqueue.h>
@@ -534,18 +530,24 @@ static int work_next_color(int color)
534} 530}
535 531
536/* 532/*
537 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the 533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data
538 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is 534 * contain the pointer to the queued cwq. Once execution starts, the flag
539 * cleared and the work data contains the cpu number it was last on. 535 * is cleared and the high bits contain OFFQ flags and CPU number.
540 * 536 *
541 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the 537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
542 * cwq, cpu or clear work->data. These functions should only be 538 * and clear_work_data() can be used to set the cwq, cpu or clear
543 * called while the work is owned - ie. while the PENDING bit is set. 539 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set.
544 * 541 *
545 * get_work_[g]cwq() can be used to obtain the gcwq or cwq 542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
546 * corresponding to a work. gcwq is available once the work has been 543 * a work. gcwq is available once the work has been queued anywhere after
547 * queued anywhere after initialization. cwq is available only from 544 * initialization until it is sync canceled. cwq is available only while
548 * queueing until execution starts. 545 * the work item is queued.
546 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
548 * canceled. While being canceled, a work item may have its PENDING set
549 * but stay off timer and worklist for arbitrarily long and nobody should
550 * try to steal the PENDING bit.
549 */ 551 */
550static inline void set_work_data(struct work_struct *work, unsigned long data, 552static inline void set_work_data(struct work_struct *work, unsigned long data,
551 unsigned long flags) 553 unsigned long flags)
@@ -562,13 +564,22 @@ static void set_work_cwq(struct work_struct *work,
562 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
563} 565}
564 566
565static void set_work_cpu(struct work_struct *work, unsigned int cpu) 567static void set_work_cpu_and_clear_pending(struct work_struct *work,
568 unsigned int cpu)
566{ 569{
567 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); 570 /*
571 * The following wmb is paired with the implied mb in
572 * test_and_set_bit(PENDING) and ensures all updates to @work made
573 * here are visible to and precede any updates by the next PENDING
574 * owner.
575 */
576 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
568} 578}
569 579
570static void clear_work_data(struct work_struct *work) 580static void clear_work_data(struct work_struct *work)
571{ 581{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */
572 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 583 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
573} 584}
574 585
@@ -591,7 +602,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
591 return ((struct cpu_workqueue_struct *) 602 return ((struct cpu_workqueue_struct *)
592 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
593 604
594 cpu = data >> WORK_STRUCT_FLAG_BITS; 605 cpu = data >> WORK_OFFQ_CPU_SHIFT;
595 if (cpu == WORK_CPU_NONE) 606 if (cpu == WORK_CPU_NONE)
596 return NULL; 607 return NULL;
597 608
@@ -599,6 +610,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
599 return get_gcwq(cpu); 610 return get_gcwq(cpu);
600} 611}
601 612
613static void mark_work_canceling(struct work_struct *work)
614{
615 struct global_cwq *gcwq = get_work_gcwq(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
619 WORK_STRUCT_PENDING);
620}
621
622static bool work_is_canceling(struct work_struct *work)
623{
624 unsigned long data = atomic_long_read(&work->data);
625
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
627}
628
602/* 629/*
603 * Policy functions. These define the policies on how the global worker 630 * Policy functions. These define the policies on how the global worker
604 * pools are managed. Unless noted otherwise, these functions assume that 631 * pools are managed. Unless noted otherwise, these functions assume that
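
The workqueue hunks above repurpose work->data while a work item is off-queue: the low bits carry flags such as PENDING and the new OFFQ_CANCELING, and the bits above WORK_OFFQ_CPU_SHIFT record the last CPU, which is how mark_work_canceling() and get_work_gcwq() cooperate. A standalone sketch of that packing with illustrative bit positions, not the kernel's real constants:

#include <stdio.h>

#define WORK_STRUCT_PENDING	(1UL << 0)	/* illustrative bit positions */
#define WORK_OFFQ_CANCELING	(1UL << 4)
#define WORK_OFFQ_CPU_SHIFT	5

int main(void)
{
	unsigned long cpu = 3;
	unsigned long data;

	/* mark_work_canceling(): keep PENDING, record cpu, set CANCELING */
	data = (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING |
	       WORK_STRUCT_PENDING;
	printf("cpu       = %lu\n", data >> WORK_OFFQ_CPU_SHIFT);
	printf("canceling = %d\n", !!(data & WORK_OFFQ_CANCELING));

	/* set_work_cpu_and_clear_pending(): flags dropped, cpu kept */
	data = cpu << WORK_OFFQ_CPU_SHIFT;
	printf("pending   = %d\n", !!(data & WORK_STRUCT_PENDING));
	return 0;
}
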
@@ -657,6 +684,13 @@ static bool too_many_workers(struct worker_pool *pool)
657 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 684 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
658 int nr_busy = pool->nr_workers - nr_idle; 685 int nr_busy = pool->nr_workers - nr_idle;
659 686
687 /*
688 * nr_idle and idle_list may disagree if idle rebinding is in
689 * progress. Never return %true if idle_list is empty.
690 */
691 if (list_empty(&pool->idle_list))
692 return false;
693
660 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 694 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
661} 695}
662 696
@@ -903,6 +937,206 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
903} 937}
904 938
905/** 939/**
940 * move_linked_works - move linked works to a list
941 * @work: start of series of works to be scheduled
942 * @head: target list to append @work to
 943 * @nextp: out parameter for nested worklist walking
944 *
945 * Schedule linked works starting from @work to @head. Work series to
946 * be scheduled starts at @work and includes any consecutive work with
947 * WORK_STRUCT_LINKED set in its predecessor.
948 *
949 * If @nextp is not NULL, it's updated to point to the next work of
950 * the last scheduled work. This allows move_linked_works() to be
951 * nested inside outer list_for_each_entry_safe().
952 *
953 * CONTEXT:
954 * spin_lock_irq(gcwq->lock).
955 */
956static void move_linked_works(struct work_struct *work, struct list_head *head,
957 struct work_struct **nextp)
958{
959 struct work_struct *n;
960
961 /*
962 * Linked worklist will always end before the end of the list,
963 * use NULL for list head.
964 */
965 list_for_each_entry_safe_from(work, n, NULL, entry) {
966 list_move_tail(&work->entry, head);
967 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
968 break;
969 }
970
971 /*
972 * If we're already inside safe list traversal and have moved
973 * multiple works to the scheduled queue, the next position
974 * needs to be updated.
975 */
976 if (nextp)
977 *nextp = n;
978}
979
980static void cwq_activate_delayed_work(struct work_struct *work)
981{
982 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
983
984 trace_workqueue_activate_work(work);
985 move_linked_works(work, &cwq->pool->worklist, NULL);
986 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
987 cwq->nr_active++;
988}
989
990static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
991{
992 struct work_struct *work = list_first_entry(&cwq->delayed_works,
993 struct work_struct, entry);
994
995 cwq_activate_delayed_work(work);
996}
997
998/**
999 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1000 * @cwq: cwq of interest
1001 * @color: color of work which left the queue
1002 *
 1003 * A work item has either completed or been removed from the pending queue;
1004 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1005 *
1006 * CONTEXT:
1007 * spin_lock_irq(gcwq->lock).
1008 */
1009static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1010{
1011 /* ignore uncolored works */
1012 if (color == WORK_NO_COLOR)
1013 return;
1014
1015 cwq->nr_in_flight[color]--;
1016
1017 cwq->nr_active--;
1018 if (!list_empty(&cwq->delayed_works)) {
1019 /* one down, submit a delayed one */
1020 if (cwq->nr_active < cwq->max_active)
1021 cwq_activate_first_delayed(cwq);
1022 }
1023
1024 /* is flush in progress and are we at the flushing tip? */
1025 if (likely(cwq->flush_color != color))
1026 return;
1027
1028 /* are there still in-flight works? */
1029 if (cwq->nr_in_flight[color])
1030 return;
1031
1032 /* this cwq is done, clear flush_color */
1033 cwq->flush_color = -1;
1034
1035 /*
1036 * If this was the last cwq, wake up the first flusher. It
1037 * will handle the rest.
1038 */
1039 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1040 complete(&cwq->wq->first_flusher->done);
1041}
1042
1043/**
1044 * try_to_grab_pending - steal work item from worklist and disable irq
1045 * @work: work item to steal
1046 * @is_dwork: @work is a delayed_work
1047 * @flags: place to store irq state
1048 *
1049 * Try to grab PENDING bit of @work. This function can handle @work in any
1050 * stable state - idle, on timer or on worklist. Return values are
1051 *
1052 * 1 if @work was pending and we successfully stole PENDING
1053 * 0 if @work was idle and we claimed PENDING
1054 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
1055 * -ENOENT if someone else is canceling @work, this state may persist
1056 * for arbitrarily long
1057 *
1058 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
1059 * interrupted while holding PENDING and @work off queue, irq must be
1060 * disabled on entry. This, combined with delayed_work->timer being
 1061 * irqsafe, ensures that we return -EAGAIN for a finite short period of time.
1062 *
1063 * On successful return, >= 0, irq is disabled and the caller is
1064 * responsible for releasing it using local_irq_restore(*@flags).
1065 *
1066 * This function is safe to call from any context including IRQ handler.
1067 */
1068static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1069 unsigned long *flags)
1070{
1071 struct global_cwq *gcwq;
1072
1073 local_irq_save(*flags);
1074
1075 /* try to steal the timer if it exists */
1076 if (is_dwork) {
1077 struct delayed_work *dwork = to_delayed_work(work);
1078
1079 /*
1080 * dwork->timer is irqsafe. If del_timer() fails, it's
1081 * guaranteed that the timer is not queued anywhere and not
1082 * running on the local CPU.
1083 */
1084 if (likely(del_timer(&dwork->timer)))
1085 return 1;
1086 }
1087
1088 /* try to claim PENDING the normal way */
1089 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1090 return 0;
1091
1092 /*
1093 * The queueing is in progress, or it is already queued. Try to
1094 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1095 */
1096 gcwq = get_work_gcwq(work);
1097 if (!gcwq)
1098 goto fail;
1099
1100 spin_lock(&gcwq->lock);
1101 if (!list_empty(&work->entry)) {
1102 /*
1103 * This work is queued, but perhaps we locked the wrong gcwq.
1104 * In that case we must see the new value after rmb(), see
1105 * insert_work()->wmb().
1106 */
1107 smp_rmb();
1108 if (gcwq == get_work_gcwq(work)) {
1109 debug_work_deactivate(work);
1110
1111 /*
1112 * A delayed work item cannot be grabbed directly
1113 * because it might have linked NO_COLOR work items
1114 * which, if left on the delayed_list, will confuse
 1115 * cwq->nr_active management later on and cause a
1116 * stall. Make sure the work item is activated
1117 * before grabbing.
1118 */
1119 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1120 cwq_activate_delayed_work(work);
1121
1122 list_del_init(&work->entry);
1123 cwq_dec_nr_in_flight(get_work_cwq(work),
1124 get_work_color(work));
1125
1126 spin_unlock(&gcwq->lock);
1127 return 1;
1128 }
1129 }
1130 spin_unlock(&gcwq->lock);
1131fail:
1132 local_irq_restore(*flags);
1133 if (work_is_canceling(work))
1134 return -ENOENT;
1135 cpu_relax();
1136 return -EAGAIN;
1137}
1138
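
For reference, a condensed sketch of how the return contract documented above is meant to be consumed; it mirrors the internal callers added further down (mod_delayed_work_on() and __cancel_work_timer()). The function name and the give-up-on-cancel policy here are illustrative only, not part of the patch.

        /* Sketch only: honoring the try_to_grab_pending() contract. */
        static bool example_regrab_and_requeue(struct workqueue_struct *wq,
                                               struct delayed_work *dwork,
                                               unsigned long delay)
        {
                unsigned long flags;
                int ret;

                do {
                        /* -EAGAIN: the PENDING owner is transient, retry */
                        ret = try_to_grab_pending(&dwork->work, true, &flags);
                } while (ret == -EAGAIN);

                if (ret < 0)            /* -ENOENT: someone else is canceling */
                        return false;

                /* 0 or 1: we own PENDING and IRQs are off; requeue, release */
                __queue_delayed_work(WORK_CPU_UNBOUND, wq, dwork, delay);
                local_irq_restore(flags);
                return true;
        }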
1139/**
906 * insert_work - insert a work into gcwq 1140 * insert_work - insert a work into gcwq
907 * @cwq: cwq @work belongs to 1141 * @cwq: cwq @work belongs to
908 * @work: work to insert 1142 * @work: work to insert
@@ -982,7 +1216,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
982 struct cpu_workqueue_struct *cwq; 1216 struct cpu_workqueue_struct *cwq;
983 struct list_head *worklist; 1217 struct list_head *worklist;
984 unsigned int work_flags; 1218 unsigned int work_flags;
985 unsigned long flags; 1219 unsigned int req_cpu = cpu;
1220
1221 /*
1222 * While a work item is PENDING && off queue, a task trying to
1223 * steal the PENDING will busy-loop waiting for it to either get
1224 * queued or lose PENDING. Grabbing PENDING and queueing should
1225 * happen with IRQ disabled.
1226 */
1227 WARN_ON_ONCE(!irqs_disabled());
986 1228
987 debug_work_activate(work); 1229 debug_work_activate(work);
988 1230
@@ -995,21 +1237,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
995 if (!(wq->flags & WQ_UNBOUND)) { 1237 if (!(wq->flags & WQ_UNBOUND)) {
996 struct global_cwq *last_gcwq; 1238 struct global_cwq *last_gcwq;
997 1239
998 if (unlikely(cpu == WORK_CPU_UNBOUND)) 1240 if (cpu == WORK_CPU_UNBOUND)
999 cpu = raw_smp_processor_id(); 1241 cpu = raw_smp_processor_id();
1000 1242
1001 /* 1243 /*
1002 * It's multi cpu. If @wq is non-reentrant and @work 1244 * It's multi cpu. If @work was previously on a different
1003 * was previously on a different cpu, it might still 1245 * cpu, it might still be running there, in which case the
1004 * be running there, in which case the work needs to 1246 * work needs to be queued on that cpu to guarantee
1005 * be queued on that cpu to guarantee non-reentrance. 1247 * non-reentrancy.
1006 */ 1248 */
1007 gcwq = get_gcwq(cpu); 1249 gcwq = get_gcwq(cpu);
1008 if (wq->flags & WQ_NON_REENTRANT && 1250 last_gcwq = get_work_gcwq(work);
1009 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { 1251
1252 if (last_gcwq && last_gcwq != gcwq) {
1010 struct worker *worker; 1253 struct worker *worker;
1011 1254
1012 spin_lock_irqsave(&last_gcwq->lock, flags); 1255 spin_lock(&last_gcwq->lock);
1013 1256
1014 worker = find_worker_executing_work(last_gcwq, work); 1257 worker = find_worker_executing_work(last_gcwq, work);
1015 1258
@@ -1017,22 +1260,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1017 gcwq = last_gcwq; 1260 gcwq = last_gcwq;
1018 else { 1261 else {
1019 /* meh... not running there, queue here */ 1262 /* meh... not running there, queue here */
1020 spin_unlock_irqrestore(&last_gcwq->lock, flags); 1263 spin_unlock(&last_gcwq->lock);
1021 spin_lock_irqsave(&gcwq->lock, flags); 1264 spin_lock(&gcwq->lock);
1022 } 1265 }
1023 } else 1266 } else {
1024 spin_lock_irqsave(&gcwq->lock, flags); 1267 spin_lock(&gcwq->lock);
1268 }
1025 } else { 1269 } else {
1026 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1270 gcwq = get_gcwq(WORK_CPU_UNBOUND);
1027 spin_lock_irqsave(&gcwq->lock, flags); 1271 spin_lock(&gcwq->lock);
1028 } 1272 }
1029 1273
1030 /* gcwq determined, get cwq and queue */ 1274 /* gcwq determined, get cwq and queue */
1031 cwq = get_cwq(gcwq->cpu, wq); 1275 cwq = get_cwq(gcwq->cpu, wq);
1032 trace_workqueue_queue_work(cpu, cwq, work); 1276 trace_workqueue_queue_work(req_cpu, cwq, work);
1033 1277
1034 if (WARN_ON(!list_empty(&work->entry))) { 1278 if (WARN_ON(!list_empty(&work->entry))) {
1035 spin_unlock_irqrestore(&gcwq->lock, flags); 1279 spin_unlock(&gcwq->lock);
1036 return; 1280 return;
1037 } 1281 }
1038 1282
@@ -1050,79 +1294,110 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1050 1294
1051 insert_work(cwq, work, worklist, work_flags); 1295 insert_work(cwq, work, worklist, work_flags);
1052 1296
1053 spin_unlock_irqrestore(&gcwq->lock, flags); 1297 spin_unlock(&gcwq->lock);
1054} 1298}
1055 1299
1056/** 1300/**
1057 * queue_work - queue work on a workqueue 1301 * queue_work_on - queue work on specific cpu
1302 * @cpu: CPU number to execute work on
1058 * @wq: workqueue to use 1303 * @wq: workqueue to use
1059 * @work: work to queue 1304 * @work: work to queue
1060 * 1305 *
1061 * Returns 0 if @work was already on a queue, non-zero otherwise. 1306 * Returns %false if @work was already on a queue, %true otherwise.
1062 * 1307 *
1063 * We queue the work to the CPU on which it was submitted, but if the CPU dies 1308 * We queue the work to a specific CPU, the caller must ensure it
1064 * it can be processed by another CPU. 1309 * can't go away.
1065 */ 1310 */
1066int queue_work(struct workqueue_struct *wq, struct work_struct *work) 1311bool queue_work_on(int cpu, struct workqueue_struct *wq,
1312 struct work_struct *work)
1067{ 1313{
1068 int ret; 1314 bool ret = false;
1315 unsigned long flags;
1069 1316
1070 ret = queue_work_on(get_cpu(), wq, work); 1317 local_irq_save(flags);
1071 put_cpu(); 1318
1319 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1320 __queue_work(cpu, wq, work);
1321 ret = true;
1322 }
1072 1323
1324 local_irq_restore(flags);
1073 return ret; 1325 return ret;
1074} 1326}
1075EXPORT_SYMBOL_GPL(queue_work); 1327EXPORT_SYMBOL_GPL(queue_work_on);
1076 1328
1077/** 1329/**
1078 * queue_work_on - queue work on specific cpu 1330 * queue_work - queue work on a workqueue
1079 * @cpu: CPU number to execute work on
1080 * @wq: workqueue to use 1331 * @wq: workqueue to use
1081 * @work: work to queue 1332 * @work: work to queue
1082 * 1333 *
1083 * Returns 0 if @work was already on a queue, non-zero otherwise. 1334 * Returns %false if @work was already on a queue, %true otherwise.
1084 * 1335 *
1085 * We queue the work to a specific CPU, the caller must ensure it 1336 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1086 * can't go away. 1337 * it can be processed by another CPU.
1087 */ 1338 */
1088int 1339bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1089queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1090{ 1340{
1091 int ret = 0; 1341 return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1092
1093 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1094 __queue_work(cpu, wq, work);
1095 ret = 1;
1096 }
1097 return ret;
1098} 1342}
1099EXPORT_SYMBOL_GPL(queue_work_on); 1343EXPORT_SYMBOL_GPL(queue_work);
1100 1344
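
A minimal caller-side sketch of the two entry points above; the driver-side names (frob_fn, frob_work, kick_frob) are hypothetical.

        #include <linux/workqueue.h>
        #include <linux/printk.h>

        static void frob_fn(struct work_struct *work)
        {
                /* runs later, in process context, on a kworker thread */
        }
        static DECLARE_WORK(frob_work, frob_fn);

        static void kick_frob(int cpu)
        {
                bool queued;

                if (cpu >= 0)
                        /* bound variant: the caller must keep @cpu online */
                        queued = queue_work_on(cpu, system_wq, &frob_work);
                else
                        /* WORK_CPU_UNBOUND: bound to the local CPU at queue time */
                        queued = queue_work(system_wq, &frob_work);

                if (!queued)
                        pr_debug("frob_work was already pending\n");
        }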
1101static void delayed_work_timer_fn(unsigned long __data) 1345void delayed_work_timer_fn(unsigned long __data)
1102{ 1346{
1103 struct delayed_work *dwork = (struct delayed_work *)__data; 1347 struct delayed_work *dwork = (struct delayed_work *)__data;
1104 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); 1348 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1105 1349
1106 __queue_work(smp_processor_id(), cwq->wq, &dwork->work); 1350 /* should have been called from irqsafe timer with irq already off */
1351 __queue_work(dwork->cpu, cwq->wq, &dwork->work);
1107} 1352}
1353EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
1108 1354
1109/** 1355static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1110 * queue_delayed_work - queue work on a workqueue after delay 1356 struct delayed_work *dwork, unsigned long delay)
1111 * @wq: workqueue to use
1112 * @dwork: delayable work to queue
1113 * @delay: number of jiffies to wait before queueing
1114 *
1115 * Returns 0 if @work was already on a queue, non-zero otherwise.
1116 */
1117int queue_delayed_work(struct workqueue_struct *wq,
1118 struct delayed_work *dwork, unsigned long delay)
1119{ 1357{
1120 if (delay == 0) 1358 struct timer_list *timer = &dwork->timer;
1121 return queue_work(wq, &dwork->work); 1359 struct work_struct *work = &dwork->work;
1360 unsigned int lcpu;
1361
1362 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1363 timer->data != (unsigned long)dwork);
1364 BUG_ON(timer_pending(timer));
1365 BUG_ON(!list_empty(&work->entry));
1366
1367 timer_stats_timer_set_start_info(&dwork->timer);
1368
1369 /*
1370 * This stores cwq for the moment, for the timer_fn. Note that the
1371 * work's gcwq is preserved to allow reentrance detection for
1372 * delayed works.
1373 */
1374 if (!(wq->flags & WQ_UNBOUND)) {
1375 struct global_cwq *gcwq = get_work_gcwq(work);
1122 1376
1123 return queue_delayed_work_on(-1, wq, dwork, delay); 1377 /*
1378 * If we cannot get the last gcwq from @work directly,
1379 * select the last CPU such that it avoids unnecessarily
1380 * triggering non-reentrancy check in __queue_work().
1381 */
1382 lcpu = cpu;
1383 if (gcwq)
1384 lcpu = gcwq->cpu;
1385 if (lcpu == WORK_CPU_UNBOUND)
1386 lcpu = raw_smp_processor_id();
1387 } else {
1388 lcpu = WORK_CPU_UNBOUND;
1389 }
1390
1391 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1392
1393 dwork->cpu = cpu;
1394 timer->expires = jiffies + delay;
1395
1396 if (unlikely(cpu != WORK_CPU_UNBOUND))
1397 add_timer_on(timer, cpu);
1398 else
1399 add_timer(timer);
1124} 1400}
1125EXPORT_SYMBOL_GPL(queue_delayed_work);
1126 1401
1127/** 1402/**
1128 * queue_delayed_work_on - queue work on specific CPU after delay 1403 * queue_delayed_work_on - queue work on specific CPU after delay
@@ -1131,53 +1406,100 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
1131 * @dwork: work to queue 1406 * @dwork: work to queue
1132 * @delay: number of jiffies to wait before queueing 1407 * @delay: number of jiffies to wait before queueing
1133 * 1408 *
1134 * Returns 0 if @work was already on a queue, non-zero otherwise. 1409 * Returns %false if @work was already on a queue, %true otherwise. If
1410 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1411 * execution.
1135 */ 1412 */
1136int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 1413bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1137 struct delayed_work *dwork, unsigned long delay) 1414 struct delayed_work *dwork, unsigned long delay)
1138{ 1415{
1139 int ret = 0;
1140 struct timer_list *timer = &dwork->timer;
1141 struct work_struct *work = &dwork->work; 1416 struct work_struct *work = &dwork->work;
1417 bool ret = false;
1418 unsigned long flags;
1142 1419
1143 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1420 if (!delay)
1144 unsigned int lcpu; 1421 return queue_work_on(cpu, wq, &dwork->work);
1145 1422
1146 BUG_ON(timer_pending(timer)); 1423 /* read the comment in __queue_work() */
1147 BUG_ON(!list_empty(&work->entry)); 1424 local_irq_save(flags);
1148 1425
1149 timer_stats_timer_set_start_info(&dwork->timer); 1426 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1427 __queue_delayed_work(cpu, wq, dwork, delay);
1428 ret = true;
1429 }
1150 1430
1151 /* 1431 local_irq_restore(flags);
1152 * This stores cwq for the moment, for the timer_fn. 1432 return ret;
1153 * Note that the work's gcwq is preserved to allow 1433}
1154 * reentrance detection for delayed works. 1434EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1155 */
1156 if (!(wq->flags & WQ_UNBOUND)) {
1157 struct global_cwq *gcwq = get_work_gcwq(work);
1158 1435
1159 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) 1436/**
1160 lcpu = gcwq->cpu; 1437 * queue_delayed_work - queue work on a workqueue after delay
1161 else 1438 * @wq: workqueue to use
1162 lcpu = raw_smp_processor_id(); 1439 * @dwork: delayable work to queue
1163 } else 1440 * @delay: number of jiffies to wait before queueing
1164 lcpu = WORK_CPU_UNBOUND; 1441 *
1442 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
1443 */
1444bool queue_delayed_work(struct workqueue_struct *wq,
1445 struct delayed_work *dwork, unsigned long delay)
1446{
1447 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1448}
1449EXPORT_SYMBOL_GPL(queue_delayed_work);
1165 1450
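
A caller-side sketch of the delayed variants. DECLARE_DELAYED_WORK()/INIT_DELAYED_WORK() are what install delayed_work_timer_fn, which the WARN_ON_ONCE() in __queue_delayed_work() above checks for. The polling names below are hypothetical.

        #include <linux/workqueue.h>
        #include <linux/jiffies.h>

        static void poll_fn(struct work_struct *work)
        {
                /* to_delayed_work() recovers the containing delayed_work */
                struct delayed_work *dwork = to_delayed_work(work);

                /* ... poll the device, then re-arm for the next round ... */
                queue_delayed_work(system_wq, dwork, msecs_to_jiffies(100));
        }
        static DECLARE_DELAYED_WORK(poll_work, poll_fn);

        static void start_polling(void)
        {
                /* first expiry roughly 100ms from now */
                queue_delayed_work(system_wq, &poll_work, msecs_to_jiffies(100));
        }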
1166 set_work_cwq(work, get_cwq(lcpu, wq), 0); 1451/**
1452 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1453 * @cpu: CPU number to execute work on
1454 * @wq: workqueue to use
1455 * @dwork: work to queue
1456 * @delay: number of jiffies to wait before queueing
1457 *
1458 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
1459 * modify @dwork's timer so that it expires after @delay. If @delay is
1460 * zero, @work is guaranteed to be scheduled immediately regardless of its
1461 * current state.
1462 *
1463 * Returns %false if @dwork was idle and queued, %true if @dwork was
1464 * pending and its timer was modified.
1465 *
1466 * This function is safe to call from any context including IRQ handler.
1467 * See try_to_grab_pending() for details.
1468 */
1469bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1470 struct delayed_work *dwork, unsigned long delay)
1471{
1472 unsigned long flags;
1473 int ret;
1167 1474
1168 timer->expires = jiffies + delay; 1475 do {
1169 timer->data = (unsigned long)dwork; 1476 ret = try_to_grab_pending(&dwork->work, true, &flags);
1170 timer->function = delayed_work_timer_fn; 1477 } while (unlikely(ret == -EAGAIN));
1171 1478
1172 if (unlikely(cpu >= 0)) 1479 if (likely(ret >= 0)) {
1173 add_timer_on(timer, cpu); 1480 __queue_delayed_work(cpu, wq, dwork, delay);
1174 else 1481 local_irq_restore(flags);
1175 add_timer(timer);
1176 ret = 1;
1177 } 1482 }
1483
1484 /* -ENOENT from try_to_grab_pending() becomes %true */
1178 return ret; 1485 return ret;
1179} 1486}
1180EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1487EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1488
1489/**
1490 * mod_delayed_work - modify delay of or queue a delayed work
1491 * @wq: workqueue to use
1492 * @dwork: work to queue
1493 * @delay: number of jiffies to wait before queueing
1494 *
1495 * mod_delayed_work_on() on local CPU.
1496 */
1497bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1498 unsigned long delay)
1499{
1500 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1501}
1502EXPORT_SYMBOL_GPL(mod_delayed_work);
1181 1503
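
The typical use for mod_delayed_work() is debouncing: every event pushes the deadline out, so only the last event of a burst results in an execution. A hypothetical sketch:

        #include <linux/workqueue.h>
        #include <linux/jiffies.h>

        static void writeback_fn(struct work_struct *work)
        {
                /* flush whatever state accumulated since the last burst */
        }
        static DECLARE_DELAYED_WORK(writeback_work, writeback_fn);

        static void note_dirty(void)
        {
                /*
                 * Queues writeback_work if it is idle, otherwise just pushes
                 * its timer back. Safe from IRQ context per the comment above.
                 */
                mod_delayed_work(system_wq, &writeback_work, msecs_to_jiffies(50));
        }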
1182/** 1504/**
1183 * worker_enter_idle - enter idle state 1505 * worker_enter_idle - enter idle state
@@ -1305,37 +1627,21 @@ __acquires(&gcwq->lock)
1305 } 1627 }
1306} 1628}
1307 1629
1308struct idle_rebind {
1309 int cnt; /* # workers to be rebound */
1310 struct completion done; /* all workers rebound */
1311};
1312
1313/* 1630/*
1314 * Rebind an idle @worker to its CPU. During CPU onlining, this has to 1631 * Rebind an idle @worker to its CPU. worker_thread() will test
1315 * happen synchronously for idle workers. worker_thread() will test 1632 * list_empty(@worker->entry) before leaving idle and call this function.
1316 * %WORKER_REBIND before leaving idle and call this function.
1317 */ 1633 */
1318static void idle_worker_rebind(struct worker *worker) 1634static void idle_worker_rebind(struct worker *worker)
1319{ 1635{
1320 struct global_cwq *gcwq = worker->pool->gcwq; 1636 struct global_cwq *gcwq = worker->pool->gcwq;
1321 1637
1322 /* CPU must be online at this point */ 1638 /* CPU may go down again in between, clear UNBOUND only on success */
1323 WARN_ON(!worker_maybe_bind_and_lock(worker)); 1639 if (worker_maybe_bind_and_lock(worker))
1324 if (!--worker->idle_rebind->cnt) 1640 worker_clr_flags(worker, WORKER_UNBOUND);
1325 complete(&worker->idle_rebind->done);
1326 spin_unlock_irq(&worker->pool->gcwq->lock);
1327 1641
1328 /* we did our part, wait for rebind_workers() to finish up */ 1642 /* rebind complete, become available again */
1329 wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); 1643 list_add(&worker->entry, &worker->pool->idle_list);
1330 1644 spin_unlock_irq(&gcwq->lock);
1331 /*
1332 * rebind_workers() shouldn't finish until all workers passed the
1333 * above WORKER_REBIND wait. Tell it when done.
1334 */
1335 spin_lock_irq(&worker->pool->gcwq->lock);
1336 if (!--worker->idle_rebind->cnt)
1337 complete(&worker->idle_rebind->done);
1338 spin_unlock_irq(&worker->pool->gcwq->lock);
1339} 1645}
1340 1646
1341/* 1647/*
@@ -1349,16 +1655,8 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1349 struct worker *worker = container_of(work, struct worker, rebind_work); 1655 struct worker *worker = container_of(work, struct worker, rebind_work);
1350 struct global_cwq *gcwq = worker->pool->gcwq; 1656 struct global_cwq *gcwq = worker->pool->gcwq;
1351 1657
1352 worker_maybe_bind_and_lock(worker); 1658 if (worker_maybe_bind_and_lock(worker))
1353 1659 worker_clr_flags(worker, WORKER_UNBOUND);
1354 /*
1355 * %WORKER_REBIND must be cleared even if the above binding failed;
1356 * otherwise, we may confuse the next CPU_UP cycle or oops / get
1357 * stuck by calling idle_worker_rebind() prematurely. If CPU went
1358 * down again inbetween, %WORKER_UNBOUND would be set, so clearing
1359 * %WORKER_REBIND is always safe.
1360 */
1361 worker_clr_flags(worker, WORKER_REBIND);
1362 1660
1363 spin_unlock_irq(&gcwq->lock); 1661 spin_unlock_irq(&gcwq->lock);
1364} 1662}
@@ -1370,123 +1668,74 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1370 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding 1668 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1371 * is different for idle and busy ones. 1669 * is different for idle and busy ones.
1372 * 1670 *
1373 * The idle ones should be rebound synchronously and idle rebinding should 1671 * Idle ones will be removed from the idle_list and woken up. They will
1374 * be complete before any worker starts executing work items with 1672 * add themselves back after completing rebind. This ensures that the
1375 * concurrency management enabled; otherwise, scheduler may oops trying to 1673 * idle_list doesn't contain any unbound workers when re-bound busy workers
1376 * wake up non-local idle worker from wq_worker_sleeping(). 1674 * try to perform local wake-ups for concurrency management.
1377 * 1675 *
1378 * This is achieved by repeatedly requesting rebinding until all idle 1676 * Busy workers can rebind after they finish their current work items.
1379 * workers are known to have been rebound under @gcwq->lock and holding all 1677 * Queueing the rebind work item at the head of the scheduled list is
1380 * idle workers from becoming busy until idle rebinding is complete. 1678 * enough. Note that nr_running will be properly bumped as busy workers
1679 * rebind.
1381 * 1680 *
1382 * Once idle workers are rebound, busy workers can be rebound as they 1681 * On return, all non-manager workers are scheduled for rebind - see
1383 * finish executing their current work items. Queueing the rebind work at 1682 * manage_workers() for the manager special case. Any idle worker
1384 * the head of their scheduled lists is enough. Note that nr_running will 1683 * including the manager will not appear on @idle_list until rebind is
1385 * be properly bumped as busy workers rebind. 1684 * complete, making local wake-ups safe.
1386 *
1387 * On return, all workers are guaranteed to either be bound or have rebind
1388 * work item scheduled.
1389 */ 1685 */
1390static void rebind_workers(struct global_cwq *gcwq) 1686static void rebind_workers(struct global_cwq *gcwq)
1391 __releases(&gcwq->lock) __acquires(&gcwq->lock)
1392{ 1687{
1393 struct idle_rebind idle_rebind;
1394 struct worker_pool *pool; 1688 struct worker_pool *pool;
1395 struct worker *worker; 1689 struct worker *worker, *n;
1396 struct hlist_node *pos; 1690 struct hlist_node *pos;
1397 int i; 1691 int i;
1398 1692
1399 lockdep_assert_held(&gcwq->lock); 1693 lockdep_assert_held(&gcwq->lock);
1400 1694
1401 for_each_worker_pool(pool, gcwq) 1695 for_each_worker_pool(pool, gcwq)
1402 lockdep_assert_held(&pool->manager_mutex); 1696 lockdep_assert_held(&pool->assoc_mutex);
1403 1697
1404 /* 1698 /* dequeue and kick idle ones */
1405 * Rebind idle workers. Interlocked both ways. We wait for
1406 * workers to rebind via @idle_rebind.done. Workers will wait for
1407 * us to finish up by watching %WORKER_REBIND.
1408 */
1409 init_completion(&idle_rebind.done);
1410retry:
1411 idle_rebind.cnt = 1;
1412 INIT_COMPLETION(idle_rebind.done);
1413
1414 /* set REBIND and kick idle ones, we'll wait for these later */
1415 for_each_worker_pool(pool, gcwq) { 1699 for_each_worker_pool(pool, gcwq) {
1416 list_for_each_entry(worker, &pool->idle_list, entry) { 1700 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1417 unsigned long worker_flags = worker->flags; 1701 /*
1418 1702 * idle workers should be off @pool->idle_list
1419 if (worker->flags & WORKER_REBIND) 1703 * until rebind is complete to avoid receiving
1420 continue; 1704 * premature local wake-ups.
1421 1705 */
1422 /* morph UNBOUND to REBIND atomically */ 1706 list_del_init(&worker->entry);
1423 worker_flags &= ~WORKER_UNBOUND;
1424 worker_flags |= WORKER_REBIND;
1425 ACCESS_ONCE(worker->flags) = worker_flags;
1426
1427 idle_rebind.cnt++;
1428 worker->idle_rebind = &idle_rebind;
1429 1707
1430 /* worker_thread() will call idle_worker_rebind() */ 1708 /*
1709 * worker_thread() will see the above dequeuing
1710 * and call idle_worker_rebind().
1711 */
1431 wake_up_process(worker->task); 1712 wake_up_process(worker->task);
1432 } 1713 }
1433 } 1714 }
1434 1715
1435 if (--idle_rebind.cnt) { 1716 /* rebind busy workers */
1436 spin_unlock_irq(&gcwq->lock);
1437 wait_for_completion(&idle_rebind.done);
1438 spin_lock_irq(&gcwq->lock);
1439 /* busy ones might have become idle while waiting, retry */
1440 goto retry;
1441 }
1442
1443 /* all idle workers are rebound, rebind busy workers */
1444 for_each_busy_worker(worker, i, pos, gcwq) { 1717 for_each_busy_worker(worker, i, pos, gcwq) {
1445 struct work_struct *rebind_work = &worker->rebind_work; 1718 struct work_struct *rebind_work = &worker->rebind_work;
1446 unsigned long worker_flags = worker->flags; 1719 struct workqueue_struct *wq;
1447
1448 /* morph UNBOUND to REBIND atomically */
1449 worker_flags &= ~WORKER_UNBOUND;
1450 worker_flags |= WORKER_REBIND;
1451 ACCESS_ONCE(worker->flags) = worker_flags;
1452 1720
1453 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, 1721 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1454 work_data_bits(rebind_work))) 1722 work_data_bits(rebind_work)))
1455 continue; 1723 continue;
1456 1724
1457 /* wq doesn't matter, use the default one */
1458 debug_work_activate(rebind_work); 1725 debug_work_activate(rebind_work);
1459 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
1460 worker->scheduled.next,
1461 work_color_to_flags(WORK_NO_COLOR));
1462 }
1463
1464 /*
1465 * All idle workers are rebound and waiting for %WORKER_REBIND to
1466 * be cleared inside idle_worker_rebind(). Clear and release.
1467 * Clearing %WORKER_REBIND from this foreign context is safe
1468 * because these workers are still guaranteed to be idle.
1469 *
1470 * We need to make sure all idle workers passed WORKER_REBIND wait
1471 * in idle_worker_rebind() before returning; otherwise, workers can
1472 * get stuck at the wait if hotplug cycle repeats.
1473 */
1474 idle_rebind.cnt = 1;
1475 INIT_COMPLETION(idle_rebind.done);
1476
1477 for_each_worker_pool(pool, gcwq) {
1478 list_for_each_entry(worker, &pool->idle_list, entry) {
1479 worker->flags &= ~WORKER_REBIND;
1480 idle_rebind.cnt++;
1481 }
1482 }
1483 1726
1484 wake_up_all(&gcwq->rebind_hold); 1727 /*
1728 * wq doesn't really matter but let's keep @worker->pool
1729 * and @cwq->pool consistent for sanity.
1730 */
1731 if (worker_pool_pri(worker->pool))
1732 wq = system_highpri_wq;
1733 else
1734 wq = system_wq;
1485 1735
1486 if (--idle_rebind.cnt) { 1736 insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
1487 spin_unlock_irq(&gcwq->lock); 1737 worker->scheduled.next,
1488 wait_for_completion(&idle_rebind.done); 1738 work_color_to_flags(WORK_NO_COLOR));
1489 spin_lock_irq(&gcwq->lock);
1490 } 1739 }
1491} 1740}
1492 1741
@@ -1844,22 +2093,22 @@ static bool manage_workers(struct worker *worker)
1844 * grab %POOL_MANAGING_WORKERS to achieve this because that can 2093 * grab %POOL_MANAGING_WORKERS to achieve this because that can
1845 * lead to idle worker depletion (all become busy thinking someone 2094 * lead to idle worker depletion (all become busy thinking someone
1846 * else is managing) which in turn can result in deadlock under 2095 * else is managing) which in turn can result in deadlock under
1847 * extreme circumstances. Use @pool->manager_mutex to synchronize 2096 * extreme circumstances. Use @pool->assoc_mutex to synchronize
1848 * manager against CPU hotplug. 2097 * manager against CPU hotplug.
1849 * 2098 *
1850 * manager_mutex would always be free unless CPU hotplug is in 2099 * assoc_mutex would always be free unless CPU hotplug is in
1851 * progress. trylock first without dropping @gcwq->lock. 2100 * progress. trylock first without dropping @gcwq->lock.
1852 */ 2101 */
1853 if (unlikely(!mutex_trylock(&pool->manager_mutex))) { 2102 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
1854 spin_unlock_irq(&pool->gcwq->lock); 2103 spin_unlock_irq(&pool->gcwq->lock);
1855 mutex_lock(&pool->manager_mutex); 2104 mutex_lock(&pool->assoc_mutex);
1856 /* 2105 /*
1857 * CPU hotplug could have happened while we were waiting 2106 * CPU hotplug could have happened while we were waiting
1858 * for manager_mutex. Hotplug itself can't handle us 2107 * for assoc_mutex. Hotplug itself can't handle us
1859 * because manager isn't either on idle or busy list, and 2108 * because manager isn't either on idle or busy list, and
1860 * @gcwq's state and ours could have deviated. 2109 * @gcwq's state and ours could have deviated.
1861 * 2110 *
1862 * As hotplug is now excluded via manager_mutex, we can 2111 * As hotplug is now excluded via assoc_mutex, we can
1863 * simply try to bind. It will succeed or fail depending 2112 * simply try to bind. It will succeed or fail depending
1864 * on @gcwq's current state. Try it and adjust 2113 * on @gcwq's current state. Try it and adjust
1865 * %WORKER_UNBOUND accordingly. 2114 * %WORKER_UNBOUND accordingly.
@@ -1882,112 +2131,11 @@ static bool manage_workers(struct worker *worker)
1882 ret |= maybe_create_worker(pool); 2131 ret |= maybe_create_worker(pool);
1883 2132
1884 pool->flags &= ~POOL_MANAGING_WORKERS; 2133 pool->flags &= ~POOL_MANAGING_WORKERS;
1885 mutex_unlock(&pool->manager_mutex); 2134 mutex_unlock(&pool->assoc_mutex);
1886 return ret; 2135 return ret;
1887} 2136}
1888 2137
1889/** 2138/**
1890 * move_linked_works - move linked works to a list
1891 * @work: start of series of works to be scheduled
1892 * @head: target list to append @work to
1893 * @nextp: out parameter for nested worklist walking
1894 *
1895 * Schedule linked works starting from @work to @head. Work series to
1896 * be scheduled starts at @work and includes any consecutive work with
1897 * WORK_STRUCT_LINKED set in its predecessor.
1898 *
1899 * If @nextp is not NULL, it's updated to point to the next work of
1900 * the last scheduled work. This allows move_linked_works() to be
1901 * nested inside outer list_for_each_entry_safe().
1902 *
1903 * CONTEXT:
1904 * spin_lock_irq(gcwq->lock).
1905 */
1906static void move_linked_works(struct work_struct *work, struct list_head *head,
1907 struct work_struct **nextp)
1908{
1909 struct work_struct *n;
1910
1911 /*
1912 * Linked worklist will always end before the end of the list,
1913 * use NULL for list head.
1914 */
1915 list_for_each_entry_safe_from(work, n, NULL, entry) {
1916 list_move_tail(&work->entry, head);
1917 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1918 break;
1919 }
1920
1921 /*
1922 * If we're already inside safe list traversal and have moved
1923 * multiple works to the scheduled queue, the next position
1924 * needs to be updated.
1925 */
1926 if (nextp)
1927 *nextp = n;
1928}
1929
1930static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1931{
1932 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1933 struct work_struct, entry);
1934
1935 trace_workqueue_activate_work(work);
1936 move_linked_works(work, &cwq->pool->worklist, NULL);
1937 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1938 cwq->nr_active++;
1939}
1940
1941/**
1942 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1943 * @cwq: cwq of interest
1944 * @color: color of work which left the queue
1945 * @delayed: for a delayed work
1946 *
1947 * A work item has either completed or been removed from the pending queue;
1948 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1949 *
1950 * CONTEXT:
1951 * spin_lock_irq(gcwq->lock).
1952 */
1953static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1954 bool delayed)
1955{
1956 /* ignore uncolored works */
1957 if (color == WORK_NO_COLOR)
1958 return;
1959
1960 cwq->nr_in_flight[color]--;
1961
1962 if (!delayed) {
1963 cwq->nr_active--;
1964 if (!list_empty(&cwq->delayed_works)) {
1965 /* one down, submit a delayed one */
1966 if (cwq->nr_active < cwq->max_active)
1967 cwq_activate_first_delayed(cwq);
1968 }
1969 }
1970
1971 /* is flush in progress and are we at the flushing tip? */
1972 if (likely(cwq->flush_color != color))
1973 return;
1974
1975 /* are there still in-flight works? */
1976 if (cwq->nr_in_flight[color])
1977 return;
1978
1979 /* this cwq is done, clear flush_color */
1980 cwq->flush_color = -1;
1981
1982 /*
1983 * If this was the last cwq, wake up the first flusher. It
1984 * will handle the rest.
1985 */
1986 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1987 complete(&cwq->wq->first_flusher->done);
1988}
1989
1990/**
1991 * process_one_work - process single work 2139 * process_one_work - process single work
1992 * @worker: self 2140 * @worker: self
1993 * @work: work to process 2141 * @work: work to process
@@ -2030,7 +2178,7 @@ __acquires(&gcwq->lock)
2030 * necessary to avoid spurious warnings from rescuers servicing the 2178 * necessary to avoid spurious warnings from rescuers servicing the
2031 * unbound or a disassociated gcwq. 2179 * unbound or a disassociated gcwq.
2032 */ 2180 */
2033 WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && 2181 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2034 !(gcwq->flags & GCWQ_DISASSOCIATED) && 2182 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
2035 raw_smp_processor_id() != gcwq->cpu); 2183 raw_smp_processor_id() != gcwq->cpu);
2036 2184
@@ -2046,15 +2194,13 @@ __acquires(&gcwq->lock)
2046 return; 2194 return;
2047 } 2195 }
2048 2196
2049 /* claim and process */ 2197 /* claim and dequeue */
2050 debug_work_deactivate(work); 2198 debug_work_deactivate(work);
2051 hlist_add_head(&worker->hentry, bwh); 2199 hlist_add_head(&worker->hentry, bwh);
2052 worker->current_work = work; 2200 worker->current_work = work;
2053 worker->current_cwq = cwq; 2201 worker->current_cwq = cwq;
2054 work_color = get_work_color(work); 2202 work_color = get_work_color(work);
2055 2203
2056 /* record the current cpu number in the work data and dequeue */
2057 set_work_cpu(work, gcwq->cpu);
2058 list_del_init(&work->entry); 2204 list_del_init(&work->entry);
2059 2205
2060 /* 2206 /*
@@ -2071,9 +2217,16 @@ __acquires(&gcwq->lock)
2071 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2217 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2072 wake_up_worker(pool); 2218 wake_up_worker(pool);
2073 2219
2220 /*
2221 * Record the last CPU and clear PENDING which should be the last
2222 * update to @work. Also, do this inside @gcwq->lock so that
2223 * PENDING and queued state changes happen together while IRQ is
2224 * disabled.
2225 */
2226 set_work_cpu_and_clear_pending(work, gcwq->cpu);
2227
2074 spin_unlock_irq(&gcwq->lock); 2228 spin_unlock_irq(&gcwq->lock);
2075 2229
2076 work_clear_pending(work);
2077 lock_map_acquire_read(&cwq->wq->lockdep_map); 2230 lock_map_acquire_read(&cwq->wq->lockdep_map);
2078 lock_map_acquire(&lockdep_map); 2231 lock_map_acquire(&lockdep_map);
2079 trace_workqueue_execute_start(work); 2232 trace_workqueue_execute_start(work);
@@ -2087,11 +2240,9 @@ __acquires(&gcwq->lock)
2087 lock_map_release(&cwq->wq->lockdep_map); 2240 lock_map_release(&cwq->wq->lockdep_map);
2088 2241
2089 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2242 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2090 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 2243 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2091 "%s/0x%08x/%d\n", 2244 " last function: %pf\n",
2092 current->comm, preempt_count(), task_pid_nr(current)); 2245 current->comm, preempt_count(), task_pid_nr(current), f);
2093 printk(KERN_ERR " last function: ");
2094 print_symbol("%s\n", (unsigned long)f);
2095 debug_show_held_locks(current); 2246 debug_show_held_locks(current);
2096 dump_stack(); 2247 dump_stack();
2097 } 2248 }
@@ -2106,7 +2257,7 @@ __acquires(&gcwq->lock)
2106 hlist_del_init(&worker->hentry); 2257 hlist_del_init(&worker->hentry);
2107 worker->current_work = NULL; 2258 worker->current_work = NULL;
2108 worker->current_cwq = NULL; 2259 worker->current_cwq = NULL;
2109 cwq_dec_nr_in_flight(cwq, work_color, false); 2260 cwq_dec_nr_in_flight(cwq, work_color);
2110} 2261}
2111 2262
2112/** 2263/**
@@ -2151,18 +2302,17 @@ static int worker_thread(void *__worker)
2151woke_up: 2302woke_up:
2152 spin_lock_irq(&gcwq->lock); 2303 spin_lock_irq(&gcwq->lock);
2153 2304
2154 /* 2305 /* we are off idle list if destruction or rebind is requested */
2155 * DIE can be set only while idle and REBIND set while busy has 2306 if (unlikely(list_empty(&worker->entry))) {
2156 * @worker->rebind_work scheduled. Checking here is enough.
2157 */
2158 if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
2159 spin_unlock_irq(&gcwq->lock); 2307 spin_unlock_irq(&gcwq->lock);
2160 2308
2309 /* if DIE is set, destruction is requested */
2161 if (worker->flags & WORKER_DIE) { 2310 if (worker->flags & WORKER_DIE) {
2162 worker->task->flags &= ~PF_WQ_WORKER; 2311 worker->task->flags &= ~PF_WQ_WORKER;
2163 return 0; 2312 return 0;
2164 } 2313 }
2165 2314
2315 /* otherwise, rebind */
2166 idle_worker_rebind(worker); 2316 idle_worker_rebind(worker);
2167 goto woke_up; 2317 goto woke_up;
2168 } 2318 }
@@ -2645,8 +2795,8 @@ reflush:
2645 2795
2646 if (++flush_cnt == 10 || 2796 if (++flush_cnt == 10 ||
2647 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 2797 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2648 pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", 2798 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
2649 wq->name, flush_cnt); 2799 wq->name, flush_cnt);
2650 goto reflush; 2800 goto reflush;
2651 } 2801 }
2652 2802
@@ -2657,8 +2807,7 @@ reflush:
2657} 2807}
2658EXPORT_SYMBOL_GPL(drain_workqueue); 2808EXPORT_SYMBOL_GPL(drain_workqueue);
2659 2809
2660static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 2810static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2661 bool wait_executing)
2662{ 2811{
2663 struct worker *worker = NULL; 2812 struct worker *worker = NULL;
2664 struct global_cwq *gcwq; 2813 struct global_cwq *gcwq;
@@ -2680,13 +2829,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2680 cwq = get_work_cwq(work); 2829 cwq = get_work_cwq(work);
2681 if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) 2830 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2682 goto already_gone; 2831 goto already_gone;
2683 } else if (wait_executing) { 2832 } else {
2684 worker = find_worker_executing_work(gcwq, work); 2833 worker = find_worker_executing_work(gcwq, work);
2685 if (!worker) 2834 if (!worker)
2686 goto already_gone; 2835 goto already_gone;
2687 cwq = worker->current_cwq; 2836 cwq = worker->current_cwq;
2688 } else 2837 }
2689 goto already_gone;
2690 2838
2691 insert_wq_barrier(cwq, barr, work, worker); 2839 insert_wq_barrier(cwq, barr, work, worker);
2692 spin_unlock_irq(&gcwq->lock); 2840 spin_unlock_irq(&gcwq->lock);
@@ -2713,15 +2861,8 @@ already_gone:
2713 * flush_work - wait for a work to finish executing the last queueing instance 2861 * flush_work - wait for a work to finish executing the last queueing instance
2714 * @work: the work to flush 2862 * @work: the work to flush
2715 * 2863 *
2716 * Wait until @work has finished execution. This function considers 2864 * Wait until @work has finished execution. @work is guaranteed to be idle
2717 * only the last queueing instance of @work. If @work has been 2865 * on return if it hasn't been requeued since flush started.
2718 * enqueued across different CPUs on a non-reentrant workqueue or on
2719 * multiple workqueues, @work might still be executing on return on
2720 * some of the CPUs from earlier queueing.
2721 *
2722 * If @work was queued only on a non-reentrant, ordered or unbound
2723 * workqueue, @work is guaranteed to be idle on return if it hasn't
2724 * been requeued since flush started.
2725 * 2866 *
2726 * RETURNS: 2867 * RETURNS:
2727 * %true if flush_work() waited for the work to finish execution, 2868 * %true if flush_work() waited for the work to finish execution,
@@ -2734,140 +2875,36 @@ bool flush_work(struct work_struct *work)
2734 lock_map_acquire(&work->lockdep_map); 2875 lock_map_acquire(&work->lockdep_map);
2735 lock_map_release(&work->lockdep_map); 2876 lock_map_release(&work->lockdep_map);
2736 2877
2737 if (start_flush_work(work, &barr, true)) { 2878 if (start_flush_work(work, &barr)) {
2738 wait_for_completion(&barr.done); 2879 wait_for_completion(&barr.done);
2739 destroy_work_on_stack(&barr.work); 2880 destroy_work_on_stack(&barr.work);
2740 return true; 2881 return true;
2741 } else 2882 } else {
2742 return false;
2743}
2744EXPORT_SYMBOL_GPL(flush_work);
2745
2746static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2747{
2748 struct wq_barrier barr;
2749 struct worker *worker;
2750
2751 spin_lock_irq(&gcwq->lock);
2752
2753 worker = find_worker_executing_work(gcwq, work);
2754 if (unlikely(worker))
2755 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2756
2757 spin_unlock_irq(&gcwq->lock);
2758
2759 if (unlikely(worker)) {
2760 wait_for_completion(&barr.done);
2761 destroy_work_on_stack(&barr.work);
2762 return true;
2763 } else
2764 return false; 2883 return false;
2765}
2766
2767static bool wait_on_work(struct work_struct *work)
2768{
2769 bool ret = false;
2770 int cpu;
2771
2772 might_sleep();
2773
2774 lock_map_acquire(&work->lockdep_map);
2775 lock_map_release(&work->lockdep_map);
2776
2777 for_each_gcwq_cpu(cpu)
2778 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2779 return ret;
2780}
2781
2782/**
2783 * flush_work_sync - wait until a work has finished execution
2784 * @work: the work to flush
2785 *
2786 * Wait until @work has finished execution. On return, it's
2787 * guaranteed that all queueing instances of @work which happened
2788 * before this function is called are finished. In other words, if
2789 * @work hasn't been requeued since this function was called, @work is
2790 * guaranteed to be idle on return.
2791 *
2792 * RETURNS:
2793 * %true if flush_work_sync() waited for the work to finish execution,
2794 * %false if it was already idle.
2795 */
2796bool flush_work_sync(struct work_struct *work)
2797{
2798 struct wq_barrier barr;
2799 bool pending, waited;
2800
2801 /* we'll wait for executions separately, queue barr only if pending */
2802 pending = start_flush_work(work, &barr, false);
2803
2804 /* wait for executions to finish */
2805 waited = wait_on_work(work);
2806
2807 /* wait for the pending one */
2808 if (pending) {
2809 wait_for_completion(&barr.done);
2810 destroy_work_on_stack(&barr.work);
2811 } 2884 }
2812
2813 return pending || waited;
2814}
2815EXPORT_SYMBOL_GPL(flush_work_sync);
2816
2817/*
2818 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2819 * so this work can't be re-armed in any way.
2820 */
2821static int try_to_grab_pending(struct work_struct *work)
2822{
2823 struct global_cwq *gcwq;
2824 int ret = -1;
2825
2826 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2827 return 0;
2828
2829 /*
2830 * The queueing is in progress, or it is already queued. Try to
2831 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2832 */
2833 gcwq = get_work_gcwq(work);
2834 if (!gcwq)
2835 return ret;
2836
2837 spin_lock_irq(&gcwq->lock);
2838 if (!list_empty(&work->entry)) {
2839 /*
2840 * This work is queued, but perhaps we locked the wrong gcwq.
2841 * In that case we must see the new value after rmb(), see
2842 * insert_work()->wmb().
2843 */
2844 smp_rmb();
2845 if (gcwq == get_work_gcwq(work)) {
2846 debug_work_deactivate(work);
2847 list_del_init(&work->entry);
2848 cwq_dec_nr_in_flight(get_work_cwq(work),
2849 get_work_color(work),
2850 *work_data_bits(work) & WORK_STRUCT_DELAYED);
2851 ret = 1;
2852 }
2853 }
2854 spin_unlock_irq(&gcwq->lock);
2855
2856 return ret;
2857} 2885}
2886EXPORT_SYMBOL_GPL(flush_work);
2858 2887
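
As documented above, flush_work() now only considers the last queueing instance, so a caller that needs its update to have been processed can simply queue and then flush. The names below are hypothetical and this must run in sleepable context.

        #include <linux/workqueue.h>

        static void publish_fn(struct work_struct *work)
        {
                /* push cached state out to its consumers */
        }
        static DECLARE_WORK(publish_work, publish_fn);

        static void publish_and_wait(void)
        {
                queue_work(system_wq, &publish_work);

                /* returns %true here (it waited); %false if already idle */
                flush_work(&publish_work);
        }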
2859static bool __cancel_work_timer(struct work_struct *work, 2888static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2860 struct timer_list* timer)
2861{ 2889{
2890 unsigned long flags;
2862 int ret; 2891 int ret;
2863 2892
2864 do { 2893 do {
2865 ret = (timer && likely(del_timer(timer))); 2894 ret = try_to_grab_pending(work, is_dwork, &flags);
2866 if (!ret) 2895 /*
2867 ret = try_to_grab_pending(work); 2896 * If someone else is canceling, wait for the same event it
2868 wait_on_work(work); 2897 * would be waiting for before retrying.
2898 */
2899 if (unlikely(ret == -ENOENT))
2900 flush_work(work);
2869 } while (unlikely(ret < 0)); 2901 } while (unlikely(ret < 0));
2870 2902
2903 /* tell other tasks trying to grab @work to back off */
2904 mark_work_canceling(work);
2905 local_irq_restore(flags);
2906
2907 flush_work(work);
2871 clear_work_data(work); 2908 clear_work_data(work);
2872 return ret; 2909 return ret;
2873} 2910}
@@ -2892,7 +2929,7 @@ static bool __cancel_work_timer(struct work_struct *work,
2892 */ 2929 */
2893bool cancel_work_sync(struct work_struct *work) 2930bool cancel_work_sync(struct work_struct *work)
2894{ 2931{
2895 return __cancel_work_timer(work, NULL); 2932 return __cancel_work_timer(work, false);
2896} 2933}
2897EXPORT_SYMBOL_GPL(cancel_work_sync); 2934EXPORT_SYMBOL_GPL(cancel_work_sync);
2898 2935
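
The usual caller of __cancel_work_timer() via cancel_work_sync() is a teardown path that must guarantee the work item can no longer touch resources about to be freed; a hypothetical driver sketch:

        #include <linux/workqueue.h>
        #include <linux/io.h>

        struct foo_dev {
                struct work_struct event_work;
                void __iomem *regs;
        };

        static void foo_remove(struct foo_dev *foo)
        {
                /*
                 * Sleeps until event_work is neither pending nor running and
                 * cannot re-arm itself; only then is it safe to tear down the
                 * resources the handler uses.
                 */
                cancel_work_sync(&foo->event_work);
                iounmap(foo->regs);
        }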
@@ -2910,33 +2947,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
2910 */ 2947 */
2911bool flush_delayed_work(struct delayed_work *dwork) 2948bool flush_delayed_work(struct delayed_work *dwork)
2912{ 2949{
2950 local_irq_disable();
2913 if (del_timer_sync(&dwork->timer)) 2951 if (del_timer_sync(&dwork->timer))
2914 __queue_work(raw_smp_processor_id(), 2952 __queue_work(dwork->cpu,
2915 get_work_cwq(&dwork->work)->wq, &dwork->work); 2953 get_work_cwq(&dwork->work)->wq, &dwork->work);
2954 local_irq_enable();
2916 return flush_work(&dwork->work); 2955 return flush_work(&dwork->work);
2917} 2956}
2918EXPORT_SYMBOL(flush_delayed_work); 2957EXPORT_SYMBOL(flush_delayed_work);
2919 2958
2920/** 2959/**
2921 * flush_delayed_work_sync - wait for a dwork to finish 2960 * cancel_delayed_work - cancel a delayed work
2922 * @dwork: the delayed work to flush 2961 * @dwork: delayed_work to cancel
2923 * 2962 *
2924 * Delayed timer is cancelled and the pending work is queued for 2963 * Kill off a pending delayed_work. Returns %true if @dwork was pending
2925 * execution immediately. Other than timer handling, its behavior 2964 * and canceled; %false if it wasn't pending. Note that the work callback
2926 * is identical to flush_work_sync(). 2965 * function may still be running on return, unless it returns %true and the
2966 * work doesn't re-arm itself. Explicitly flush or use
2967 * cancel_delayed_work_sync() to wait on it.
2927 * 2968 *
2928 * RETURNS: 2969 * This function is safe to call from any context including IRQ handler.
2929 * %true if flush_work_sync() waited for the work to finish execution,
2930 * %false if it was already idle.
2931 */ 2970 */
2932bool flush_delayed_work_sync(struct delayed_work *dwork) 2971bool cancel_delayed_work(struct delayed_work *dwork)
2933{ 2972{
2934 if (del_timer_sync(&dwork->timer)) 2973 unsigned long flags;
2935 __queue_work(raw_smp_processor_id(), 2974 int ret;
2936 get_work_cwq(&dwork->work)->wq, &dwork->work); 2975
2937 return flush_work_sync(&dwork->work); 2976 do {
2977 ret = try_to_grab_pending(&dwork->work, true, &flags);
2978 } while (unlikely(ret == -EAGAIN));
2979
2980 if (unlikely(ret < 0))
2981 return false;
2982
2983 set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
2984 local_irq_restore(flags);
2985 return true;
2938} 2986}
2939EXPORT_SYMBOL(flush_delayed_work_sync); 2987EXPORT_SYMBOL(cancel_delayed_work);
2940 2988
2941/** 2989/**
2942 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish 2990 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
@@ -2949,54 +2997,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync);
2949 */ 2997 */
2950bool cancel_delayed_work_sync(struct delayed_work *dwork) 2998bool cancel_delayed_work_sync(struct delayed_work *dwork)
2951{ 2999{
2952 return __cancel_work_timer(&dwork->work, &dwork->timer); 3000 return __cancel_work_timer(&dwork->work, true);
2953} 3001}
2954EXPORT_SYMBOL(cancel_delayed_work_sync); 3002EXPORT_SYMBOL(cancel_delayed_work_sync);
2955 3003
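
A sketch contrasting the two cancel flavours: the new cancel_delayed_work() is callable from IRQ context and does not wait, while cancel_delayed_work_sync() sleeps until any running callback has finished. All names below are hypothetical.

        #include <linux/workqueue.h>

        static void timeout_fn(struct work_struct *work)
        {
                /* handle a request that never completed */
        }
        static DECLARE_DELAYED_WORK(timeout_work, timeout_fn);

        static void request_done_in_irq(void)   /* e.g. from the completion IRQ */
        {
                /* does not wait; timeout_fn may still be running right now */
                cancel_delayed_work(&timeout_work);
        }

        static void device_teardown(void)       /* sleepable context */
        {
                /* also waits for a running timeout_fn and blocks re-arming */
                cancel_delayed_work_sync(&timeout_work);
        }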
2956/** 3004/**
2957 * schedule_work - put work task in global workqueue
2958 * @work: job to be done
2959 *
2960 * Returns zero if @work was already on the kernel-global workqueue and
2961 * non-zero otherwise.
2962 *
2963 * This puts a job in the kernel-global workqueue if it was not already
2964 * queued and leaves it in the same position on the kernel-global
2965 * workqueue otherwise.
2966 */
2967int schedule_work(struct work_struct *work)
2968{
2969 return queue_work(system_wq, work);
2970}
2971EXPORT_SYMBOL(schedule_work);
2972
2973/*
2974 * schedule_work_on - put work task on a specific cpu 3005 * schedule_work_on - put work task on a specific cpu
2975 * @cpu: cpu to put the work task on 3006 * @cpu: cpu to put the work task on
2976 * @work: job to be done 3007 * @work: job to be done
2977 * 3008 *
2978 * This puts a job on a specific cpu 3009 * This puts a job on a specific cpu
2979 */ 3010 */
2980int schedule_work_on(int cpu, struct work_struct *work) 3011bool schedule_work_on(int cpu, struct work_struct *work)
2981{ 3012{
2982 return queue_work_on(cpu, system_wq, work); 3013 return queue_work_on(cpu, system_wq, work);
2983} 3014}
2984EXPORT_SYMBOL(schedule_work_on); 3015EXPORT_SYMBOL(schedule_work_on);
2985 3016
2986/** 3017/**
2987 * schedule_delayed_work - put work task in global workqueue after delay 3018 * schedule_work - put work task in global workqueue
2988 * @dwork: job to be done 3019 * @work: job to be done
2989 * @delay: number of jiffies to wait or 0 for immediate execution
2990 * 3020 *
2991 * After waiting for a given time this puts a job in the kernel-global 3021 * Returns %false if @work was already on the kernel-global workqueue and
2992 * workqueue. 3022 * %true otherwise.
3023 *
3024 * This puts a job in the kernel-global workqueue if it was not already
3025 * queued and leaves it in the same position on the kernel-global
3026 * workqueue otherwise.
2993 */ 3027 */
2994int schedule_delayed_work(struct delayed_work *dwork, 3028bool schedule_work(struct work_struct *work)
2995 unsigned long delay)
2996{ 3029{
2997 return queue_delayed_work(system_wq, dwork, delay); 3030 return queue_work(system_wq, work);
2998} 3031}
2999EXPORT_SYMBOL(schedule_delayed_work); 3032EXPORT_SYMBOL(schedule_work);
3000 3033
3001/** 3034/**
3002 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 3035 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
@@ -3007,14 +3040,28 @@ EXPORT_SYMBOL(schedule_delayed_work);
3007 * After waiting for a given time this puts a job in the kernel-global 3040 * After waiting for a given time this puts a job in the kernel-global
3008 * workqueue on the specified CPU. 3041 * workqueue on the specified CPU.
3009 */ 3042 */
3010int schedule_delayed_work_on(int cpu, 3043bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
3011 struct delayed_work *dwork, unsigned long delay) 3044 unsigned long delay)
3012{ 3045{
3013 return queue_delayed_work_on(cpu, system_wq, dwork, delay); 3046 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
3014} 3047}
3015EXPORT_SYMBOL(schedule_delayed_work_on); 3048EXPORT_SYMBOL(schedule_delayed_work_on);
3016 3049
3017/** 3050/**
3051 * schedule_delayed_work - put work task in global workqueue after delay
3052 * @dwork: job to be done
3053 * @delay: number of jiffies to wait or 0 for immediate execution
3054 *
3055 * After waiting for a given time this puts a job in the kernel-global
3056 * workqueue.
3057 */
3058bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
3059{
3060 return queue_delayed_work(system_wq, dwork, delay);
3061}
3062EXPORT_SYMBOL(schedule_delayed_work);
3063
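
The schedule_*() wrappers above are simply the system_wq shorthands; the classic use is deferring work from an interrupt handler to process context. A hypothetical sketch:

        #include <linux/interrupt.h>
        #include <linux/workqueue.h>

        static void bottom_half_fn(struct work_struct *work)
        {
                /* sleepable follow-up processing runs here */
        }
        static DECLARE_WORK(bottom_half_work, bottom_half_fn);

        static irqreturn_t demo_irq_handler(int irq, void *dev_id)
        {
                /* ack the hardware, then defer; same as queue_work(system_wq, ...) */
                schedule_work(&bottom_half_work);
                return IRQ_HANDLED;
        }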
3064/**
3018 * schedule_on_each_cpu - execute a function synchronously on each online CPU 3065 * schedule_on_each_cpu - execute a function synchronously on each online CPU
3019 * @func: the function to call 3066 * @func: the function to call
3020 * 3067 *
@@ -3161,9 +3208,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
         int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
 
         if (max_active < 1 || max_active > lim)
-                printk(KERN_WARNING "workqueue: max_active %d requested for %s "
-                       "is out of range, clamping between %d and %d\n",
-                       max_active, name, 1, lim);
+                pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
+                        max_active, name, 1, lim);
 
         return clamp_val(max_active, 1, lim);
 }
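Folding the split printk(KERN_WARNING ...) into one pr_warn() call keeps the warning text greppable as a single string. pr_warn() also honours a per-file pr_fmt prefix when one is defined before the includes; a small sketch under that assumption (the "mydrv: " prefix and wrapper function are made up, not part of workqueue.c):

#define pr_fmt(fmt) "mydrv: " fmt

#include <linux/printk.h>

/* illustrative helper only */
static void warn_out_of_range(int max_active, const char *name, int lim)
{
        /* prints e.g. "mydrv: max_active 600 requested for foo is out of range ..." */
        pr_warn("max_active %d requested for %s is out of range, clamping between %d and %d\n",
                max_active, name, 1, lim);
}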
@@ -3319,6 +3365,26 @@ void destroy_workqueue(struct workqueue_struct *wq)
 EXPORT_SYMBOL_GPL(destroy_workqueue);
 
 /**
+ * cwq_set_max_active - adjust max_active of a cwq
+ * @cwq: target cpu_workqueue_struct
+ * @max_active: new max_active value.
+ *
+ * Set @cwq->max_active to @max_active and activate delayed works if
+ * increased.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
+{
+        cwq->max_active = max_active;
+
+        while (!list_empty(&cwq->delayed_works) &&
+               cwq->nr_active < cwq->max_active)
+                cwq_activate_first_delayed(cwq);
+}
+
+/**
  * workqueue_set_max_active - adjust max_active of a workqueue
  * @wq: target workqueue
  * @max_active: new max_active value.
@@ -3345,7 +3411,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
 
                 if (!(wq->flags & WQ_FREEZABLE) ||
                     !(gcwq->flags & GCWQ_FREEZING))
-                        get_cwq(gcwq->cpu, wq)->max_active = max_active;
+                        cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);
 
                 spin_unlock_irq(&gcwq->lock);
         }
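cwq_set_max_active() factors out "raise the limit, then pull in any delayed work" so that workqueue_set_max_active() above and thaw_workqueues() further down share one implementation. The externally visible API is unchanged; a hedged sketch of how a driver might use it (the workqueue name, limits and error handling are illustrative):

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;  /* hypothetical example queue */

static int my_setup(void)
{
        /* start conservatively: at most one active work item per CPU */
        my_wq = alloc_workqueue("my_wq", 0, 1);
        if (!my_wq)
                return -ENOMEM;

        /* later: allow more concurrency; items parked on the delayed list
         * are activated as soon as the limit is raised */
        workqueue_set_max_active(my_wq, 4);
        return 0;
}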
@@ -3440,23 +3506,23 @@ EXPORT_SYMBOL_GPL(work_busy);
  */
 
 /* claim manager positions of all pools */
-static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
+static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
 {
         struct worker_pool *pool;
 
         for_each_worker_pool(pool, gcwq)
-                mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
+                mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
         spin_lock_irq(&gcwq->lock);
 }
 
 /* release manager positions */
-static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
+static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
 {
         struct worker_pool *pool;
 
         spin_unlock_irq(&gcwq->lock);
         for_each_worker_pool(pool, gcwq)
-                mutex_unlock(&pool->manager_mutex);
+                mutex_unlock(&pool->assoc_mutex);
 }
 
 static void gcwq_unbind_fn(struct work_struct *work)
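The manager_mutex to assoc_mutex rename reflects what the lock now protects: the association between workers and their gcwq across CPU hotplug, not just manager election. Note the mutex_lock_nested() call, which gives each per-pool mutex of a gcwq its own lockdep subclass so that taking them all in a fixed order does not trigger a false recursive-locking report. A generic sketch of that pattern, with invented names rather than the workqueue structures:

#include <linux/mutex.h>

#define NR_POOLS        2       /* illustrative pool count */

struct demo_pool {
        struct mutex lock;
};

static struct demo_pool pools[NR_POOLS];

static void claim_all_pools(void)
{
        int i;

        /* same lock class, distinct subclass per slot, fixed ascending order */
        for (i = 0; i < NR_POOLS; i++)
                mutex_lock_nested(&pools[i].lock, i);
}

static void release_all_pools(void)
{
        int i;

        for (i = NR_POOLS - 1; i >= 0; i--)
                mutex_unlock(&pools[i].lock);
}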
@@ -3469,7 +3535,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
         BUG_ON(gcwq->cpu != smp_processor_id());
 
-        gcwq_claim_management_and_lock(gcwq);
+        gcwq_claim_assoc_and_lock(gcwq);
 
         /*
          * We've claimed all manager positions.  Make all workers unbound
@@ -3486,7 +3552,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
         gcwq->flags |= GCWQ_DISASSOCIATED;
 
-        gcwq_release_management_and_unlock(gcwq);
+        gcwq_release_assoc_and_unlock(gcwq);
 
         /*
          * Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3514,7 +3580,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
  * Workqueues should be brought up before normal priority CPU notifiers.
  * This will be registered high priority CPU notifier.
  */
-static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
                                                unsigned long action,
                                                void *hcpu)
 {
@@ -3542,10 +3608,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 
         case CPU_DOWN_FAILED:
         case CPU_ONLINE:
-                gcwq_claim_management_and_lock(gcwq);
+                gcwq_claim_assoc_and_lock(gcwq);
                 gcwq->flags &= ~GCWQ_DISASSOCIATED;
                 rebind_workers(gcwq);
-                gcwq_release_management_and_unlock(gcwq);
+                gcwq_release_assoc_and_unlock(gcwq);
                 break;
         }
         return NOTIFY_OK;
@@ -3555,7 +3621,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
  * Workqueues should be brought down after normal priority CPU notifiers.
  * This will be registered as low priority CPU notifier.
  */
-static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
                                                  unsigned long action,
                                                  void *hcpu)
 {
@@ -3566,7 +3632,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
         case CPU_DOWN_PREPARE:
                 /* unbinding should happen on the local CPU */
                 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
-                schedule_work_on(cpu, &unbind_work);
+                queue_work_on(cpu, system_highpri_wq, &unbind_work);
                 flush_work(&unbind_work);
                 break;
         }
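Queueing the unbind work on system_highpri_wq rather than via schedule_work_on() keeps CPU-down handling from waiting behind ordinary per-cpu work. The on-stack work + queue_work_on() + flush_work() combination is the usual way to run a function synchronously on a chosen CPU; an illustrative sketch (the probe function and helper are hypothetical, not from this patch):

#include <linux/printk.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

/* hypothetical example function */
static void probe_fn(struct work_struct *work)
{
        pr_info("running on cpu %d\n", smp_processor_id());
}

static void run_probe_on(int cpu)
{
        struct work_struct w;

        INIT_WORK_ONSTACK(&w, probe_fn);
        queue_work_on(cpu, system_highpri_wq, &w);
        flush_work(&w);                 /* wait for probe_fn() to finish */
        destroy_work_on_stack(&w);
}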
@@ -3735,11 +3801,7 @@ void thaw_workqueues(void)
                                 continue;
 
                         /* restore max_active and repopulate worklist */
-                        cwq->max_active = wq->saved_max_active;
-
-                        while (!list_empty(&cwq->delayed_works) &&
-                               cwq->nr_active < cwq->max_active)
-                                cwq_activate_first_delayed(cwq);
+                        cwq_set_max_active(cwq, wq->saved_max_active);
                 }
 
                 for_each_worker_pool(pool, gcwq)
@@ -3759,8 +3821,12 @@ static int __init init_workqueues(void)
         unsigned int cpu;
         int i;
 
+        /* make sure we have enough bits for OFFQ CPU number */
+        BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
+                     WORK_CPU_LAST);
+
         cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-        cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+        hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
 
         /* initialize gcwqs */
         for_each_gcwq_cpu(cpu) {
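The new BUILD_BUG_ON() turns the assumption "the bits above WORK_OFFQ_CPU_SHIFT can hold any CPU number" into a compile-time failure instead of silent truncation at runtime. The same construct is useful wherever a size or layout invariant must hold; a small generic sketch (the flag/index split and the 24-bit limit are invented for illustration):

#include <linux/bitops.h>
#include <linux/bug.h>

#define ID_FLAG_BITS    8       /* hypothetical: low 8 bits carry flags */

static inline unsigned long make_id(unsigned long index)
{
        /* refuse to compile if the index field would be narrower than 24 bits */
        BUILD_BUG_ON(BITS_PER_LONG - ID_FLAG_BITS < 24);

        return index << ID_FLAG_BITS;
}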
@@ -3786,11 +3852,9 @@ static int __init init_workqueues(void)
                         setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
                                     (unsigned long)pool);
 
-                        mutex_init(&pool->manager_mutex);
+                        mutex_init(&pool->assoc_mutex);
                         ida_init(&pool->worker_ida);
                 }
-
-                init_waitqueue_head(&gcwq->rebind_hold);
         }
 
         /* create the initial worker */
@@ -3813,17 +3877,14 @@ static int __init init_workqueues(void)
         }
 
         system_wq = alloc_workqueue("events", 0, 0);
+        system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
         system_long_wq = alloc_workqueue("events_long", 0, 0);
-        system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
         system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                             WQ_UNBOUND_MAX_ACTIVE);
         system_freezable_wq = alloc_workqueue("events_freezable",
                                               WQ_FREEZABLE, 0);
-        system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
-                                                  WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
-        BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
-               !system_unbound_wq || !system_freezable_wq ||
-               !system_nrt_freezable_wq);
+        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+               !system_unbound_wq || !system_freezable_wq);
         return 0;
 }
 early_initcall(init_workqueues);
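With this pass init_workqueues() gains system_highpri_wq and drops the separate non-reentrant system workqueues, and the BUG_ON() makes boot fail loudly if any of the remaining system workqueues cannot be allocated. Latency-sensitive work can target system_highpri_wq directly; a hedged sketch (the work item and handler are illustrative, not from this patch):

#include <linux/workqueue.h>

/* hypothetical handler */
static void urgent_fn(struct work_struct *work)
{
        /* serviced by a high-priority kworker ahead of normal work */
}

static DECLARE_WORK(urgent_work, urgent_fn);

static void kick_urgent(void)
{
        /* still bound to the local CPU, but handled by the elevated-nice pool */
        queue_work(system_highpri_wq, &urgent_work);
}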