12 files changed, 433 insertions, 257 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index eb26e12c6c2a..eaee9de224bd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -115,8 +115,6 @@ obj-$(CONFIG_HAS_IOMEM) += memremap.o
 $(obj)/configs.o: $(obj)/config_data.h
-# config_data.h contains the same information as ikconfig.h but gzipped.
-# Info from config_data can be extracted from /proc/config*
 targets += config_data.gz
 $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
        $(call if_changed,gzip)
diff --git a/kernel/audit.c b/kernel/audit.c
index 67b9fbd871be..91bff3c0b368 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -107,7 +107,6 @@ static u32	audit_rate_limit;
 * When set to zero, this means unlimited. */
 static u32      audit_backlog_limit = 64;
 #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
-static u32      audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
 static u32      audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
 /* The identity of the user shutting down the audit system. */
@@ -138,11 +137,18 @@ static DEFINE_SPINLOCK(audit_freelist_lock);
 static int         audit_freelist_count;
 static LIST_HEAD(audit_freelist);
-static struct sk_buff_head audit_skb_queue;
+/* queue msgs to send via kauditd_task */
-/* queue of skbs to send to auditd when/if it comes back */
+static struct sk_buff_head audit_queue;
-static struct sk_buff_head audit_skb_hold_queue;
+/* queue msgs due to temporary unicast send problems */
+static struct sk_buff_head audit_retry_queue;
+/* queue msgs waiting for new auditd connection */
+static struct sk_buff_head audit_hold_queue;
+/* queue servicing thread */
 static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+/* waitqueue for callers who are blocked on the audit backlog */
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
@@ -338,7 +344,7 @@ static int audit_set_backlog_limit(u32 limit)
 static int audit_set_backlog_wait_time(u32 timeout)
 {
        return audit_do_config_change("audit_backlog_wait_time",
-                                      &audit_backlog_wait_time_master, timeout);
+                                      &audit_backlog_wait_time, timeout);
 }
 static int audit_set_enabled(u32 state)
@@ -365,29 +371,10 @@ static int audit_set_failure(u32 state)
 }
 /*
- * Queue skbs to be sent to auditd when/if it comes back.  These skbs should
- * already have been sent via prink/syslog and so if these messages are dropped
- * it is not a huge concern since we already passed the audit_log_lost()
- * notification and stuff.  This is just nice to get audit messages during
- * boot before auditd is running or messages generated while auditd is stopped.
- * This only holds messages is audit_default is set, aka booting with audit=1
- * or building your kernel that way.
- */
-static void audit_hold_skb(struct sk_buff *skb)
-{
-        if (audit_default &&
-            (!audit_backlog_limit ||
-             skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
-                skb_queue_tail(&audit_skb_hold_queue, skb);
-        else
-                kfree_skb(skb);
-}
-/*
 * For one reason or another this nlh isn't getting delivered to the userspace
 * audit daemon, just send it to printk.
 */
-static void audit_printk_skb(struct sk_buff *skb)
+static void kauditd_printk_skb(struct sk_buff *skb)
 {
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
        char *data = nlmsg_data(nlh);
@@ -398,58 +385,123 @@ static void audit_printk_skb(struct sk_buff *skb)
                else
                        audit_log_lost("printk limit exceeded");
        }
+}
+/**
+ * kauditd_hold_skb - Queue an audit record, waiting for auditd
+ * @skb: audit record
+ *
+ * Description:
+ * Queue the audit record, waiting for an instance of auditd.  When this
+ * function is called we haven't given up yet on sending the record, but things
+ * are not looking good.  The first thing we want to do is try to write the
+ * record via printk and then see if we want to try and hold on to the record
+ * and queue it, if we have room.  If we want to hold on to the record, but we
+ * don't have room, record a record lost message.
+ */
+static void kauditd_hold_skb(struct sk_buff *skb)
+{
+        /* at this point it is uncertain if we will ever send this to auditd so
+         * try to send the message via printk before we go any further */
+        kauditd_printk_skb(skb);
+        /* can we just silently drop the message? */
+        if (!audit_default) {
+                kfree_skb(skb);
+                return;
+        }
+        /* if we have room, queue the message */
+        if (!audit_backlog_limit ||
+            skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
+                skb_queue_tail(&audit_hold_queue, skb);
+                return;
+        }
-        audit_hold_skb(skb);
+        /* we have no other options - drop the message */
+        audit_log_lost("kauditd hold queue overflow");
+        kfree_skb(skb);
 }
-static void kauditd_send_skb(struct sk_buff *skb)
+/**
+ * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
+ * @skb: audit record
+ *
+ * Description:
+ * Not as serious as kauditd_hold_skb() as we still have a connected auditd,
+ * but for some reason we are having problems sending it audit records so
+ * queue the given record and attempt to resend.
+ */
+static void kauditd_retry_skb(struct sk_buff *skb)
 {
-        int err;
+        /* NOTE: because records should only live in the retry queue for a
-        int attempts = 0;
+         * short period of time, before either being sent or moved to the hold
-#define AUDITD_RETRIES 5
+         * queue, we don't currently enforce a limit on this queue */
+        skb_queue_tail(&audit_retry_queue, skb);
+}
+/**
+ * auditd_reset - Disconnect the auditd connection
+ *
+ * Description:
+ * Break the auditd/kauditd connection and move all the records in the retry
+ * queue into the hold queue in case auditd reconnects.  The audit_cmd_mutex
+ * must be held when calling this function.
+ */
+static void auditd_reset(void)
+{
+        struct sk_buff *skb;
+        /* break the connection */
+        if (audit_sock) {
+                sock_put(audit_sock);
+                audit_sock = NULL;
+        }
+        audit_pid = 0;
+        audit_nlk_portid = 0;
+        /* flush all of the retry queue to the hold queue */
+        while ((skb = skb_dequeue(&audit_retry_queue)))
+                kauditd_hold_skb(skb);
+}
+/**
+ * kauditd_send_unicast_skb - Send a record via unicast to auditd
+ * @skb: audit record
+ */
+static int kauditd_send_unicast_skb(struct sk_buff *skb)
+{
+        int rc;
-restart:
+        /* if we know nothing is connected, don't even try the netlink call */
-        /* take a reference in case we can't send it and we want to hold it */
+        if (!audit_pid)
+                return -ECONNREFUSED;
+        /* get an extra skb reference in case we fail to send */
        skb_get(skb);
-        err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
+        rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
-        if (err < 0) {
+        if (rc >= 0) {
-                pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
-                       audit_pid, err);
-                if (audit_pid) {
-                        if (err == -ECONNREFUSED || err == -EPERM
-                            || ++attempts >= AUDITD_RETRIES) {
-                                char s[32];
-                                snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
-                                audit_log_lost(s);
-                                audit_pid = 0;
-                                audit_sock = NULL;
-                        } else {
-                                pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
-                                        attempts, audit_pid);
-                                set_current_state(TASK_INTERRUPTIBLE);
-                                schedule();
-                                goto restart;
-                        }
-                }
-                /* we might get lucky and get this in the next auditd */
-                audit_hold_skb(skb);
-        } else
-                /* drop the extra reference if sent ok */
                consume_skb(skb);
+                rc = 0;
+        }
+        return rc;
 }
 /*
- * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ * kauditd_send_multicast_skb - Send a record to any multicast listeners
+ * @skb: audit record
 *
+ * Description:
 * This function doesn't consume an skb as might be expected since it has to
 * copy it anyways.
 */
-static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
+static void kauditd_send_multicast_skb(struct sk_buff *skb)
 {
-        struct sk_buff          *copy;
+        struct sk_buff *copy;
-        struct audit_net        *aunet = net_generic(&init_net, audit_net_id);
+        struct audit_net *aunet = net_generic(&init_net, audit_net_id);
-        struct sock             *sock = aunet->nlsk;
+        struct sock *sock = aunet->nlsk;
+        struct nlmsghdr *nlh;
        if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
                return;
@@ -464,74 +516,161 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
         * no reason for new multicast clients to continue with this
         * non-compliance.
         */
-        copy = skb_copy(skb, gfp_mask);
+        copy = skb_copy(skb, GFP_KERNEL);
        if (!copy)
                return;
+        nlh = nlmsg_hdr(copy);
+        nlh->nlmsg_len = skb->len;
-        nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
+        nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
 }
-/*
+/**
- * flush_hold_queue - empty the hold queue if auditd appears
+ * kauditd_wake_condition - Return true when it is time to wake kauditd_thread
- *
- * If auditd just started, drain the queue of messages already
- * sent to syslog/printk.  Remember loss here is ok.  We already
- * called audit_log_lost() if it didn't go out normally.  so the
- * race between the skb_dequeue and the next check for audit_pid
- * doesn't matter.
 *
- * If you ever find kauditd to be too slow we can get a perf win
+ * Description:
- * by doing our own locking and keeping better track if there
+ * This function is for use by the wait_event_freezable() call in
- * are messages in this queue.  I don't see the need now, but
+ * kauditd_thread().
- * in 5 years when I want to play with this again I'll see this
- * note and still have no friggin idea what i'm thinking today.
 */
-static void flush_hold_queue(void)
+static int kauditd_wake_condition(void)
 {
-        struct sk_buff *skb;
+        static int pid_last = 0;
+        int rc;
-        if (!audit_default || !audit_pid)
+        int pid = audit_pid;
-                return;
-        skb = skb_dequeue(&audit_skb_hold_queue);
-        if (likely(!skb))
-                return;
-        while (skb && audit_pid) {
+        /* wake on new messages or a change in the connected auditd */
-                kauditd_send_skb(skb);
+        rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last);
-                skb = skb_dequeue(&audit_skb_hold_queue);
+        if (rc)
-        }
+                pid_last = pid;
-        /*
+        return rc;
-         * if auditd just disappeared but we
-         * dequeued an skb we need to drop ref
-         */
-        consume_skb(skb);
 }
 static int kauditd_thread(void *dummy)
 {
+        int rc;
+        int auditd = 0;
+        int reschedule = 0;
+        struct sk_buff *skb;
+        struct nlmsghdr *nlh;
+#define UNICAST_RETRIES 5
+#define AUDITD_BAD(x,y) \
+        ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES)
+        /* NOTE: we do invalidate the auditd connection flag on any sending
+         * errors, but we only "restore" the connection flag at specific places
+         * in the loop in order to help ensure proper ordering of audit
+         * records */
        set_freezable();
        while (!kthread_should_stop()) {
-                struct sk_buff *skb;
+                /* NOTE: possible area for future improvement is to look at
+                 *       the hold and retry queues; apart from auditd_reset()
-                flush_hold_queue();
+                 *       only this thread uses them so we might be able to do
+                 *       our own queuing and skip some/all of the locking */
+                /* NOTE: it might be a fun experiment to split the hold and
+                 *       retry queue handling to another thread, but the
+                 *       synchronization issues and other overhead might kill
+                 *       any performance gains */
+                /* attempt to flush the hold queue */
+                while (auditd && (skb = skb_dequeue(&audit_hold_queue))) {
+                        rc = kauditd_send_unicast_skb(skb);
+                        if (rc) {
+                                /* requeue to the same spot */
+                                skb_queue_head(&audit_hold_queue, skb);
+                                auditd = 0;
+                                if (AUDITD_BAD(rc, reschedule)) {
+                                        mutex_lock(&audit_cmd_mutex);
+                                        auditd_reset();
+                                        mutex_unlock(&audit_cmd_mutex);
+                                        reschedule = 0;
+                                }
+                        } else
+                                /* we were able to send successfully */
+                                reschedule = 0;
+                }
-                skb = skb_dequeue(&audit_skb_queue);
+                /* attempt to flush the retry queue */
+                while (auditd && (skb = skb_dequeue(&audit_retry_queue))) {
+                        rc = kauditd_send_unicast_skb(skb);
+                        if (rc) {
+                                auditd = 0;
+                                if (AUDITD_BAD(rc, reschedule)) {
+                                        kauditd_hold_skb(skb);
+                                        mutex_lock(&audit_cmd_mutex);
+                                        auditd_reset();
+                                        mutex_unlock(&audit_cmd_mutex);
+                                        reschedule = 0;
+                                } else
+                                        /* temporary problem (we hope), queue
+                                         * to the same spot and retry */
+                                        skb_queue_head(&audit_retry_queue, skb);
+                        } else
+                                /* we were able to send successfully */
+                                reschedule = 0;
+                }
+                /* standard queue processing, try to be as quick as possible */
+quick_loop:
+                skb = skb_dequeue(&audit_queue);
                if (skb) {
-                        if (!audit_backlog_limit ||
+                        /* set up the netlink header, see the comments in
-                            (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit))
+                         * kauditd_send_multicast_skb() for length quirks */
-                                wake_up(&audit_backlog_wait);
+                        nlh = nlmsg_hdr(skb);
-                        if (audit_pid)
+                        nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
-                                kauditd_send_skb(skb);
+                        /* attempt to send to any multicast listeners */
+                        kauditd_send_multicast_skb(skb);
+                        /* attempt to send to auditd, queue on failure */
+                        if (auditd) {
+                                rc = kauditd_send_unicast_skb(skb);
+                                if (rc) {
+                                        auditd = 0;
+                                        if (AUDITD_BAD(rc, reschedule)) {
+                                                mutex_lock(&audit_cmd_mutex);
+                                                auditd_reset();
+                                                mutex_unlock(&audit_cmd_mutex);
+                                                reschedule = 0;
+                                        }
+                                        /* move to the retry queue */
+                                        kauditd_retry_skb(skb);
+                                } else
+                                        /* everything is working so go fast! */
+                                        goto quick_loop;
+                        } else if (reschedule)
+                                /* we are currently having problems, move to
+                                 * the retry queue */
+                                kauditd_retry_skb(skb);
                        else
-                                audit_printk_skb(skb);
+                                /* dump the message via printk and hold it */
-                        continue;
+                                kauditd_hold_skb(skb);
-                }
+                } else {
+                        /* we have flushed the backlog so wake everyone */
+                        wake_up(&audit_backlog_wait);
+                        /* if everything is okay with auditd (if present), go
+                         * to sleep until there is something new in the queue
+                         * or we have a change in the connected auditd;
+                         * otherwise simply reschedule to give things a chance
+                         * to recover */
+                        if (reschedule) {
+                                set_current_state(TASK_INTERRUPTIBLE);
+                                schedule();
+                        } else
+                                wait_event_freezable(kauditd_wait,
+                                                     kauditd_wake_condition());
-                wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
+                        /* update the auditd connection status */
+                        auditd = (audit_pid ? 1 : 0);
+                }
        }
        return 0;
 }
@@ -596,6 +735,7 @@ static int audit_send_reply_thread(void *arg)
        kfree(reply);
        return 0;
 }
 /**
 * audit_send_reply - send an audit reply message via netlink
 * @request_skb: skb of request we are replying to (used to target the reply)
@@ -832,16 +972,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
        if (err)
                return err;
-        /* As soon as there's any sign of userspace auditd,
-         * start kauditd to talk to it */
-        if (!kauditd_task) {
-                kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
-                if (IS_ERR(kauditd_task)) {
-                        err = PTR_ERR(kauditd_task);
-                        kauditd_task = NULL;
-                        return err;
-                }
-        }
        seq  = nlh->nlmsg_seq;
        data = nlmsg_data(nlh);
@@ -855,9 +985,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                s.rate_limit            = audit_rate_limit;
                s.backlog_limit         = audit_backlog_limit;
                s.lost                  = atomic_read(&audit_lost);
-                s.backlog               = skb_queue_len(&audit_skb_queue);
+                s.backlog               = skb_queue_len(&audit_queue);
                s.feature_bitmap        = AUDIT_FEATURE_BITMAP_ALL;
-                s.backlog_wait_time     = audit_backlog_wait_time_master;
+                s.backlog_wait_time     = audit_backlog_wait_time;
                audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
                break;
        }
@@ -897,9 +1027,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                        }
                        if (audit_enabled != AUDIT_OFF)
                                audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
-                        audit_pid = new_pid;
+                        if (new_pid) {
-                        audit_nlk_portid = NETLINK_CB(skb).portid;
+                                if (audit_sock)
-                        audit_sock = skb->sk;
+                                        sock_put(audit_sock);
+                                audit_pid = new_pid;
+                                audit_nlk_portid = NETLINK_CB(skb).portid;
+                                sock_hold(skb->sk);
+                                audit_sock = skb->sk;
+                        } else {
+                                auditd_reset();
+                        }
+                        wake_up_interruptible(&kauditd_wait);
                }
                if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
                        err = audit_set_rate_limit(s.rate_limit);
@@ -1167,10 +1305,10 @@ static void __net_exit audit_net_exit(struct net *net)
 {
        struct audit_net *aunet = net_generic(net, audit_net_id);
        struct sock *sock = aunet->nlsk;
-        if (sock == audit_sock) {
+        mutex_lock(&audit_cmd_mutex);
-                audit_pid = 0;
+        if (sock == audit_sock)
-                audit_sock = NULL;
+                auditd_reset();
-        }
+        mutex_unlock(&audit_cmd_mutex);
        netlink_kernel_release(sock);
        aunet->nlsk = NULL;
@@ -1195,17 +1333,24 @@ static int __init audit_init(void)
                audit_default ? "enabled" : "disabled");
        register_pernet_subsys(&audit_net_ops);
-        skb_queue_head_init(&audit_skb_queue);
+        skb_queue_head_init(&audit_queue);
-        skb_queue_head_init(&audit_skb_hold_queue);
+        skb_queue_head_init(&audit_retry_queue);
+        skb_queue_head_init(&audit_hold_queue);
        audit_initialized = AUDIT_INITIALIZED;
        audit_enabled = audit_default;
        audit_ever_enabled |= !!audit_default;
-        audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
        for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
                INIT_LIST_HEAD(&audit_inode_hash[i]);
+        kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+        if (IS_ERR(kauditd_task)) {
+                int err = PTR_ERR(kauditd_task);
+                panic("audit: failed to start the kauditd thread (%d)\n", err);
+        }
+        audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
        return 0;
 }
 __initcall(audit_init);
@@ -1338,24 +1483,6 @@ static inline void audit_get_stamp(struct audit_context *ctx,
        }
 }
-/*
- * Wait for auditd to drain the queue a little
- */
-static long wait_for_auditd(long sleep_time)
-{
-        DECLARE_WAITQUEUE(wait, current);
-        if (audit_backlog_limit &&
-            skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
-                add_wait_queue_exclusive(&audit_backlog_wait, &wait);
-                set_current_state(TASK_UNINTERRUPTIBLE);
-                sleep_time = schedule_timeout(sleep_time);
-                remove_wait_queue(&audit_backlog_wait, &wait);
-        }
-        return sleep_time;
-}
 /**
 * audit_log_start - obtain an audit buffer
 * @ctx: audit_context (may be NULL)
@@ -1374,12 +1501,9 @@ static long wait_for_auditd(long sleep_time)
 struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
                                     int type)
 {
-        struct audit_buffer     *ab     = NULL;
+        struct audit_buffer *ab;
-        struct timespec         t;
+        struct timespec t;
-        unsigned int            uninitialized_var(serial);
+        unsigned int uninitialized_var(serial);
-        int reserve = 5; /* Allow atomic callers to go up to five
-                            entries over the normal backlog limit */
-        unsigned long timeout_start = jiffies;
        if (audit_initialized != AUDIT_INITIALIZED)
                return NULL;
@@ -1387,38 +1511,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
        if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
                return NULL;
-        if (gfp_mask & __GFP_DIRECT_RECLAIM) {
+        /* don't ever fail/sleep on these two conditions:
-                if (audit_pid && audit_pid == current->tgid)
+         * 1. auditd generated record - since we need auditd to drain the
-                        gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+         *    queue; also, when we are checking for auditd, compare PIDs using
-                else
+         *    task_tgid_vnr() since audit_pid is set in audit_receive_msg()
-                        reserve = 0;
+         *    using a PID anchored in the caller's namespace
-        }
+         * 2. audit command message - record types 1000 through 1099 inclusive
+         *    are command messages/records used to manage the kernel subsystem
-        while (audit_backlog_limit
+         *    and the audit userspace, blocking on these messages could cause
-               && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
+         *    problems under load so don't do it (note: not all of these
-                if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
+         *    command types are valid as record types, but it is quicker to
-                        long sleep_time;
+         *    just check two ints than a series of ints in an if/switch stmt) */
+        if (!((audit_pid && audit_pid == task_tgid_vnr(current)) ||
+              (type >= 1000 && type <= 1099))) {
+                long sleep_time = audit_backlog_wait_time;
+                while (audit_backlog_limit &&
+                       (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+                        /* wake kauditd to try and flush the queue */
+                        wake_up_interruptible(&kauditd_wait);
-                        sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
+                        /* sleep if we are allowed and we haven't exhausted our
-                        if (sleep_time > 0) {
+                         * backlog wait limit */
-                                sleep_time = wait_for_auditd(sleep_time);
+                        if ((gfp_mask & __GFP_DIRECT_RECLAIM) &&
-                                if (sleep_time > 0)
+                            (sleep_time > 0)) {
-                                        continue;
+                                DECLARE_WAITQUEUE(wait, current);
+                                add_wait_queue_exclusive(&audit_backlog_wait,
+                                                         &wait);
+                                set_current_state(TASK_UNINTERRUPTIBLE);
+                                sleep_time = schedule_timeout(sleep_time);
+                                remove_wait_queue(&audit_backlog_wait, &wait);
+                        } else {
+                                if (audit_rate_check() && printk_ratelimit())
+                                        pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
+                                                skb_queue_len(&audit_queue),
+                                                audit_backlog_limit);
+                                audit_log_lost("backlog limit exceeded");
+                                return NULL;
                        }
                }
-                if (audit_rate_check() && printk_ratelimit())
-                        pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
-                                skb_queue_len(&audit_skb_queue),
-                                audit_backlog_limit);
-                audit_log_lost("backlog limit exceeded");
-                audit_backlog_wait_time = 0;
-                wake_up(&audit_backlog_wait);
-                return NULL;
        }
-        if (!reserve && !audit_backlog_wait_time)
-                audit_backlog_wait_time = audit_backlog_wait_time_master;
        ab = audit_buffer_alloc(ctx, gfp_mask, type);
        if (!ab) {
                audit_log_lost("out of memory in audit_log_start");
@@ -1426,9 +1560,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
        }
        audit_get_stamp(ab->ctx, &t, &serial);
        audit_log_format(ab, "audit(%lu.%03lu:%u): ",
                         t.tv_sec, t.tv_nsec/1000000, serial);
        return ab;
 }
@@ -1978,10 +2112,10 @@ out:
 * audit_log_end - end one audit record
 * @ab: the audit_buffer
 *
- * netlink_unicast() cannot be called inside an irq context because it blocks
+ * We can not do a netlink send inside an irq context because it blocks (last
- * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
+ * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
- * on a queue and a tasklet is scheduled to remove them from the queue outside
+ * queue and kauditd is woken to remove them from the queue outside the
- * the irq context.  May be called in any context.
+ * irq context.  May be called in any context.
 */
 void audit_log_end(struct audit_buffer *ab)
 {
@@ -1990,28 +2124,8 @@ void audit_log_end(struct audit_buffer *ab)
        if (!audit_rate_check()) {
                audit_log_lost("rate limit exceeded");
        } else {
-                struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+                skb_queue_tail(&audit_queue, ab->skb);
+                wake_up_interruptible(&kauditd_wait);
-                nlh->nlmsg_len = ab->skb->len;
-                kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
-                /*
-                 * The original kaudit unicast socket sends up messages with
-                 * nlmsg_len set to the payload length rather than the entire
-                 * message length.  This breaks the standard set by netlink.
-                 * The existing auditd daemon assumes this breakage.  Fixing
-                 * this would require co-ordinating a change in the established
-                 * protocol between the kaudit kernel subsystem and the auditd
-                 * userspace code.
-                 */
-                nlh->nlmsg_len -= NLMSG_HDRLEN;
-                if (audit_pid) {
-                        skb_queue_tail(&audit_skb_queue, ab->skb);
-                        wake_up_interruptible(&kauditd_wait);
-                } else {
-                        audit_printk_skb(ab->skb);
-                }
                ab->skb = NULL;
        }
        audit_buffer_free(ab);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index f84f8d06e1f6..f75154889aa9 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -130,10 +130,9 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
        ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return;
-        audit_log_format(ab, "auid=%u ses=%u op=",
+        audit_log_format(ab, "auid=%u ses=%u op=%s",
                         from_kuid(&init_user_ns, audit_get_loginuid(current)),
-                         audit_get_sessionid(current));
+                         audit_get_sessionid(current), op);
-        audit_log_string(ab, op);
        audit_log_format(ab, " path=");
        audit_log_untrustedstring(ab, audit_mark->path);
        audit_log_key(ab, rule->filterkey);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 25772476fa4a..055f11b0a50f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -458,8 +458,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule)
        ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return;
-        audit_log_format(ab, "op=");
+        audit_log_format(ab, "op=remove_rule");
-        audit_log_string(ab, "remove_rule");
        audit_log_format(ab, " dir=");
        audit_log_untrustedstring(ab, rule->tree->pathname);
        audit_log_key(ab, rule->filterkey);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0d302a87f21b..686e068ec3da 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -242,10 +242,9 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
                ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
                if (unlikely(!ab))
                        return;
-                audit_log_format(ab, "auid=%u ses=%u op=",
+                audit_log_format(ab, "auid=%u ses=%u op=%s",
                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
-                                 audit_get_sessionid(current));
+                                 audit_get_sessionid(current), op);
-                audit_log_string(ab, op);
                audit_log_format(ab, " path=");
                audit_log_untrustedstring(ab, w->path);
                audit_log_key(ab, r->filterkey);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 85d9cac497e4..880519d6cf2a 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -363,6 +363,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
        case AUDIT_EXIT:
        case AUDIT_SUCCESS:
        case AUDIT_INODE:
+        case AUDIT_SESSIONID:
                /* bit ops are only useful on syscall args */
                if (f->op == Audit_bitmask || f->op == Audit_bittest)
                        return -EINVAL;
@@ -476,6 +477,9 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                        if (!gid_valid(f->gid))
                                goto exit_free;
                        break;
+                case AUDIT_SESSIONID:
+                        /* a session ID is not an arch field; keep it out of arch_f */
+                        break;
                case AUDIT_ARCH:
                        entry->rule.arch_f = f;
                        break;
@@ -1074,8 +1076,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
                return;
        audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
        audit_log_task_context(ab);
-        audit_log_format(ab, " op=");
+        audit_log_format(ab, " op=%s", action);
-        audit_log_string(ab, action);
        audit_log_key(ab, rule->filterkey);
        audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
        audit_log_end(ab);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2cd5256dbff7..cf1fa43512c1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -446,6 +446,7 @@ static int audit_filter_rules(struct task_struct *tsk,
        const struct cred *cred;
        int i, need_sid = 1;
        u32 sid;
+        unsigned int sessionid;
        cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
@@ -508,6 +509,10 @@ static int audit_filter_rules(struct task_struct *tsk,
                case AUDIT_FSGID:
                        result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
                        break;
+                case AUDIT_SESSIONID:
+                        sessionid = audit_get_sessionid(tsk); /* tsk, not current */
+                        result = audit_comparator(sessionid, f->op, f->val);
+                        break;
                case AUDIT_PERS:
                        result = audit_comparator(tsk->personality, f->op, f->val);
                        break;
@@ -1000,7 +1005,7 @@ static void audit_log_execve_info(struct audit_context *context,
        long len_rem;
        long len_full;
        long len_buf;
-        long len_abuf;
+        long len_abuf = 0;
        long len_tmp;
        bool require_data;
        bool encode;
@@ -2025,8 +2030,11 @@ int audit_set_loginuid(kuid_t loginuid)
                goto out;
        /* are we setting or clearing? */
-        if (uid_valid(loginuid))
+        if (uid_valid(loginuid)) {
                sessionid = (unsigned int)atomic_inc_return(&session_id);
+                if (unlikely(sessionid == (unsigned int)-1))
+                        sessionid = (unsigned int)atomic_inc_return(&session_id);
+        }
        task->sessionid = sessionid;
        task->loginuid = loginuid;
diff --git a/kernel/capability.c b/kernel/capability.c
index 00411c82dac5..4984e1f552eb 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -457,6 +457,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
 EXPORT_SYMBOL(file_ns_capable);
 /**
+ * privileged_wrt_inode_uidgid - Check that an inode's owner ids map into a namespace
+ * @ns: The user namespace to look the ids up in
+ * @inode: The inode whose i_uid and i_gid are checked
+ *
+ * Return true only if both the inode uid and the inode gid have a mapping
+ * in @ns.
+ */
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+{
+        if (!kuid_has_mapping(ns, inode->i_uid))
+                return false;
+        return kgid_has_mapping(ns, inode->i_gid);
+}
+/**
 * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
 * @inode: The inode in question
 * @cap: The capability in question
@@ -469,7 +482,26 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
 {
        struct user_namespace *ns = current_user_ns();
-        return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
+        return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
-                kgid_has_mapping(ns, inode->i_gid);
 }
 EXPORT_SYMBOL(capable_wrt_inode_uidgid);
+/**
+ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
+ * @tsk: The task that may be ptraced
+ * @ns: The user namespace to search for CAP_SYS_PTRACE in
+ *
+ * Return true if @tsk has no recorded ptracer credentials, or if the
+ * credentials recorded for its ptracer pass the CAP_SYS_PTRACE check in @ns.
+ */
+bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
+{
+        const struct cred *tracer_cred;
+        bool capable = true;  /* An absent tracer adds no restrictions */
+        /* tsk->ptracer_cred is read with rcu_dereference(), so hold the RCU read lock */
+        rcu_read_lock();
+        tracer_cred = rcu_dereference(tsk->ptracer_cred);
+        if (tracer_cred)
+                capable = !security_capable_noaudit(tracer_cred, ns, CAP_SYS_PTRACE);
+        rcu_read_unlock();
+        return capable;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index a439ac429669..869b8ccc00bf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -747,7 +747,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 #endif
 }
-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
+        struct user_namespace *user_ns)
 {
        mm->mmap = NULL;
        mm->mm_rb = RB_ROOT;
@@ -787,6 +788,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
        if (init_new_context(p, mm))
                goto fail_nocontext;
+        mm->user_ns = get_user_ns(user_ns);
        return mm;
 fail_nocontext:
@@ -832,7 +834,7 @@ struct mm_struct *mm_alloc(void)
                return NULL;
        memset(mm, 0, sizeof(*mm));
-        return mm_init(mm, current);
+        return mm_init(mm, current, current_user_ns());
 }
 /*
@@ -847,6 +849,7 @@ void __mmdrop(struct mm_struct *mm)
        destroy_context(mm);
        mmu_notifier_mm_destroy(mm);
        check_mm(mm);
+        put_user_ns(mm->user_ns);
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1128,7 +1131,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
        memcpy(mm, oldmm, sizeof(*mm));
-        if (!mm_init(mm, tsk))
+        if (!mm_init(mm, tsk, mm->user_ns))
                goto fail_nomem;
        err = dup_mmap(mm, oldmm);
diff --git a/kernel/padata.c b/kernel/padata.c
index 7848f0566403..05316c9f32da 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -64,15 +64,11 @@ static int padata_cpu_hash(struct parallel_data *pd)
 static void padata_parallel_worker(struct work_struct *parallel_work)
 {
        struct padata_parallel_queue *pqueue;
-        struct parallel_data *pd;
-        struct padata_instance *pinst;
        LIST_HEAD(local_list);
        local_bh_disable();
        pqueue = container_of(parallel_work,
                              struct padata_parallel_queue, work);
-        pd = pqueue->pd;
-        pinst = pd->pinst;
        spin_lock(&pqueue->parallel.lock);
        list_replace_init(&pqueue->parallel.list, &local_list);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e6474f7272ec..49ba7c1ade9d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -27,6 +27,35 @@
 #include <linux/cn_proc.h>
 #include <linux/compat.h>
+/*
+ * Read or write another process' address space on behalf of ptrace.
+ * The source/target buffer must be in kernel space.
+ * Do not walk the page table directly, use get_user_pages.
+ *
+ * Returns 0 without touching memory when @tsk has no mm, when @tsk is not
+ * being ptraced with the caller as its parent, or when the mm is not
+ * SUID_DUMP_USER dumpable and ptracer_capable() fails for the mm's user
+ * namespace.  Otherwise returns the result of __access_remote_vm().
+ */
+int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
+                     void *buf, int len, unsigned int gup_flags)
+{
+        struct mm_struct *mm = get_task_mm(tsk);
+        int copied = 0;
+        bool allowed;
+        if (!mm)
+                return 0;
+        /* same checks, same evaluation order, expressed as the allow condition */
+        allowed = tsk->ptrace &&
+                  (current == tsk->parent) &&
+                  ((get_dumpable(mm) == SUID_DUMP_USER) ||
+                   ptracer_capable(tsk, mm->user_ns));
+        if (allowed)
+                copied = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+        /* drop the reference taken by get_task_mm() on every path */
+        mmput(mm);
+        return copied;
+}
 /*
 * ptrace a task: make the debugger its new parent and
@@ -39,6 +68,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
        BUG_ON(!list_empty(&child->ptrace_entry));
        list_add(&child->ptrace_entry, &new_parent->ptraced);
        child->parent = new_parent;
+        rcu_read_lock();
+        child->ptracer_cred = get_cred(__task_cred(new_parent));
+        rcu_read_unlock();
 }
 /**
@@ -71,12 +103,16 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
 */
 void __ptrace_unlink(struct task_struct *child)
 {
+        const struct cred *old_cred;
        BUG_ON(!child->ptrace);
        clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
        child->parent = child->real_parent;
        list_del_init(&child->ptrace_entry);
+        old_cred = child->ptracer_cred;
+        child->ptracer_cred = NULL;
+        put_cred(old_cred);
        spin_lock(&child->sighand->siglock);
        child->ptrace = 0;
@@ -220,7 +256,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
 static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
        const struct cred *cred = current_cred(), *tcred;
-        int dumpable = 0;
+        struct mm_struct *mm;
        kuid_t caller_uid;
        kgid_t caller_gid;
@@ -271,16 +307,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
        return -EPERM;
 ok:
        rcu_read_unlock();
-        smp_rmb();
+        mm = task->mm;
-        if (task->mm)
+        if (mm &&
-                dumpable = get_dumpable(task->mm);
+            ((get_dumpable(mm) != SUID_DUMP_USER) &&
-        rcu_read_lock();
+             !ptrace_has_cap(mm->user_ns, mode)))
-        if (dumpable != SUID_DUMP_USER &&
+            return -EPERM;
-            !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
-                rcu_read_unlock();
-                return -EPERM;
-        }
-        rcu_read_unlock();
        return security_ptrace_access_check(task, mode);
 }
@@ -344,10 +375,6 @@ static int ptrace_attach(struct task_struct *task, long request,
        if (seize)
                flags |= PT_SEIZED;
-        rcu_read_lock();
-        if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
-                flags |= PT_PTRACE_CAP;
-        rcu_read_unlock();
        task->ptrace = flags;
        __ptrace_link(task, current);
@@ -537,7 +564,8 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
                int this_len, retval;
                this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
-                retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
+                retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE);
                if (!retval) {
                        if (copied)
                                break;
@@ -564,7 +592,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
                this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
                if (copy_from_user(buf, src, this_len))
                        return -EFAULT;
-                retval = access_process_vm(tsk, dst, buf, this_len,
+                retval = ptrace_access_vm(tsk, dst, buf, this_len,
                                FOLL_FORCE | FOLL_WRITE);
                if (!retval) {
                        if (copied)
@@ -1128,7 +1156,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
        unsigned long tmp;
        int copied;
-        copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
+        copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
        if (copied != sizeof(tmp))
                return -EIO;
        return put_user(tmp, (unsigned long __user *)data);
@@ -1139,7 +1167,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 {
        int copied;
-        copied = access_process_vm(tsk, addr, &data, sizeof(data),
+        copied = ptrace_access_vm(tsk, addr, &data, sizeof(data),
                        FOLL_FORCE | FOLL_WRITE);
        return (copied == sizeof(data)) ? 0 : -EIO;
 }
@@ -1157,7 +1185,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
        switch (request) {
        case PTRACE_PEEKTEXT:
        case PTRACE_PEEKDATA:
-                ret = access_process_vm(child, addr, &word, sizeof(word),
+                ret = ptrace_access_vm(child, addr, &word, sizeof(word),
                                FOLL_FORCE);
                if (ret != sizeof(word))
                        ret = -EIO;
@@ -1167,7 +1195,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
        case PTRACE_POKETEXT:
        case PTRACE_POKEDATA:
-                ret = access_process_vm(child, addr, &data, sizeof(data),
+                ret = ptrace_access_vm(child, addr, &data, sizeof(data),
                                FOLL_FORCE | FOLL_WRITE);
                ret = (ret != sizeof(data) ? -EIO : 0);
                break;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index bff9c774987a..f7ce79a46050 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -41,8 +41,7 @@
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @prev: points to a previously installed, or inherited, filter
- * @len: the number of instructions in the program
+ * @prog: the BPF program to evaluate
- * @insnsi: the BPF program instructions to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
@@ -168,8 +167,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
 }
 /**
- * seccomp_run_filters - evaluates all seccomp filters against @syscall
+ * seccomp_run_filters - evaluates all seccomp filters against @sd
- * @syscall: number of the current system call
+ * @sd: optional seccomp data to be passed to filters
 *
 * Returns valid seccomp BPF response codes.
 */