aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@linux.intel.com>2012-01-19 15:56:50 -0500
committerH. Peter Anvin <hpa@linux.intel.com>2012-01-19 15:56:50 -0500
commit282f445a779ed76fca9884fe377bf56a3088b208 (patch)
treed9abcf526baee0100672851e0a8894c19e762a39 /kernel
parent68f30fbee19cc67849b9fa8e153ede70758afe81 (diff)
parent90a4c0f51e8e44111a926be6f4c87af3938a79c3 (diff)
Merge remote-tracking branch 'linus/master' into x86/urgent
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c42
-rw-r--r--kernel/async.c2
-rw-r--r--kernel/audit.c13
-rw-r--r--kernel/audit.h6
-rw-r--r--kernel/auditfilter.c17
-rw-r--r--kernel/auditsc.c751
-rw-r--r--kernel/capability.c80
-rw-r--r--kernel/cgroup.c423
-rw-r--r--kernel/cgroup_freezer.c77
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/cpuset.c105
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/events/core.c15
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/exit.c15
-rw-r--r--kernel/fork.c26
-rw-r--r--kernel/freezer.c203
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/irqdomain.c15
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/jump_label.c2
-rw-r--r--kernel/kexec.c29
-rw-r--r--kernel/kmod.c27
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/kthread.c27
-rw-r--r--kernel/module.c205
-rw-r--r--kernel/panic.c26
-rw-r--r--kernel/params.c38
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/pid_namespace.c31
-rw-r--r--kernel/power/hibernate.c92
-rw-r--r--kernel/power/main.c10
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/process.c77
-rw-r--r--kernel/power/snapshot.c6
-rw-r--r--kernel/power/suspend.c12
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/power/user.c184
-rw-r--r--kernel/printk.c10
-rw-r--r--kernel/ptrace.c14
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/res_counter.c3
-rw-r--r--kernel/rtmutex-tester.c37
-rw-r--r--kernel/sched/core.c82
-rw-r--r--kernel/sched/fair.c10
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c74
-rw-r--r--kernel/sys.c121
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/clockevents.c1
-rw-r--r--kernel/time/clocksource.c37
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--kernel/trace/ftrace.c715
-rw-r--r--kernel/trace/trace.c2
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events_filter.c283
-rw-r--r--kernel/trace/trace_stack.c30
-rw-r--r--kernel/workqueue.c32
61 files changed, 2466 insertions, 1599 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f70396e5a24b..2d9de86b7e76 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_irq_work.o = -pg
23endif 23endif
24 24
25obj-y += sched/ 25obj-y += sched/
26obj-y += power/
26 27
27obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 29obj-$(CONFIG_PROFILING) += profile.o
@@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
52obj-$(CONFIG_UID16) += uid16.o 53obj-$(CONFIG_UID16) += uid16.o
53obj-$(CONFIG_MODULES) += module.o 54obj-$(CONFIG_MODULES) += module.o
54obj-$(CONFIG_KALLSYMS) += kallsyms.o 55obj-$(CONFIG_KALLSYMS) += kallsyms.o
55obj-$(CONFIG_PM) += power/
56obj-$(CONFIG_FREEZER) += power/
57obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 56obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
58obj-$(CONFIG_KEXEC) += kexec.o 57obj-$(CONFIG_KEXEC) += kexec.o
59obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 203dfead2e06..02e6167a53b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,
84 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
85 */ 85 */
86struct bsd_acct_struct { 86struct bsd_acct_struct {
87 volatile int active; 87 int active;
88 volatile int needcheck; 88 unsigned long needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer;
92 struct list_head list; 91 struct list_head list;
93}; 92};
94 93
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);
96static LIST_HEAD(acct_list); 95static LIST_HEAD(acct_list);
97 96
98/* 97/*
99 * Called whenever the timer says to check the free space.
100 */
101static void acct_timeout(unsigned long x)
102{
103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
105}
106
107/*
108 * Check the amount of free space and suspend/resume accordingly. 98 * Check the amount of free space and suspend/resume accordingly.
109 */ 99 */
110static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 100static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
112 struct kstatfs sbuf; 102 struct kstatfs sbuf;
113 int res; 103 int res;
114 int act; 104 int act;
115 sector_t resume; 105 u64 resume;
116 sector_t suspend; 106 u64 suspend;
117 107
118 spin_lock(&acct_lock); 108 spin_lock(&acct_lock);
119 res = acct->active; 109 res = acct->active;
120 if (!file || !acct->needcheck) 110 if (!file || time_is_before_jiffies(acct->needcheck))
121 goto out; 111 goto out;
122 spin_unlock(&acct_lock); 112 spin_unlock(&acct_lock);
123 113
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
127 suspend = sbuf.f_blocks * SUSPEND; 117 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 118 resume = sbuf.f_blocks * RESUME;
129 119
130 sector_div(suspend, 100); 120 do_div(suspend, 100);
131 sector_div(resume, 100); 121 do_div(resume, 100);
132 122
133 if (sbuf.f_bavail <= suspend) 123 if (sbuf.f_bavail <= suspend)
134 act = -1; 124 act = -1;
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
160 } 150 }
161 } 151 }
162 152
163 del_timer(&acct->timer); 153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
164 acct->needcheck = 0;
165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
166 add_timer(&acct->timer);
167 res = acct->active; 154 res = acct->active;
168out: 155out:
169 spin_unlock(&acct_lock); 156 spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
185 if (acct->file) { 172 if (acct->file) {
186 old_acct = acct->file; 173 old_acct = acct->file;
187 old_ns = acct->ns; 174 old_ns = acct->ns;
188 del_timer(&acct->timer);
189 acct->active = 0; 175 acct->active = 0;
190 acct->needcheck = 0;
191 acct->file = NULL; 176 acct->file = NULL;
192 acct->ns = NULL; 177 acct->ns = NULL;
193 list_del(&acct->list); 178 list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
195 if (file) { 180 if (file) {
196 acct->file = file; 181 acct->file = file;
197 acct->ns = ns; 182 acct->ns = ns;
198 acct->needcheck = 0; 183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
199 acct->active = 1; 184 acct->active = 1;
200 list_add(&acct->list, &acct_list); 185 list_add(&acct->list, &acct_list);
201 /* It's been deleted if it was used before so this is safe */
202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
204 add_timer(&acct->timer);
205 } 186 }
206 if (old_acct) { 187 if (old_acct) {
207 mnt_unpin(old_acct->f_path.mnt); 188 mnt_unpin(old_acct->f_path.mnt);
@@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
334 spin_lock(&acct_lock); 315 spin_lock(&acct_lock);
335restart: 316restart:
336 list_for_each_entry(acct, &acct_list, list) 317 list_for_each_entry(acct, &acct_list, list)
337 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { 318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
338 acct_file_reopen(acct, NULL, NULL); 319 acct_file_reopen(acct, NULL, NULL);
339 goto restart; 320 goto restart;
340 } 321 }
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns)
348 if (acct == NULL) 329 if (acct == NULL)
349 return; 330 return;
350 331
351 del_timer_sync(&acct->timer);
352 spin_lock(&acct_lock); 332 spin_lock(&acct_lock);
353 if (acct->file != NULL) 333 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL); 334 acct_file_reopen(acct, NULL, NULL);
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
498 * Fill the accounting struct with the needed info as recorded 478 * Fill the accounting struct with the needed info as recorded
499 * by the different kernel functions. 479 * by the different kernel functions.
500 */ 480 */
501 memset((caddr_t)&ac, 0, sizeof(acct_t)); 481 memset(&ac, 0, sizeof(acct_t));
502 482
503 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
504 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
diff --git a/kernel/async.c b/kernel/async.c
index 80b74b88fefe..bd0c168a3bbe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
78 78
79static atomic_t entry_count; 79static atomic_t entry_count;
80 80
81extern int initcall_debug;
82
83 81
84/* 82/*
85 * MUST be called with the lock held! 83 * MUST be called with the lock held!
diff --git a/kernel/audit.c b/kernel/audit.c
index 09fae2677a45..bb0eb5bb9a0a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
601 case AUDIT_TTY_SET: 601 case AUDIT_TTY_SET:
602 case AUDIT_TRIM: 602 case AUDIT_TRIM:
603 case AUDIT_MAKE_EQUIV: 603 case AUDIT_MAKE_EQUIV:
604 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) 604 if (!capable(CAP_AUDIT_CONTROL))
605 err = -EPERM; 605 err = -EPERM;
606 break; 606 break;
607 case AUDIT_USER: 607 case AUDIT_USER:
608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: 609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
610 if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) 610 if (!capable(CAP_AUDIT_WRITE))
611 err = -EPERM; 611 err = -EPERM;
612 break; 612 break;
613 default: /* bad msg */ 613 default: /* bad msg */
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
631 } 631 }
632 632
633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
634 audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", 634 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
635 pid, uid, auid, ses); 635 pid, uid, auid, ses);
636 if (sid) { 636 if (sid) {
637 rc = security_secid_to_secctx(sid, &ctx, &len); 637 rc = security_secid_to_secctx(sid, &ctx, &len);
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1260 avail = audit_expand(ab, 1260 avail = audit_expand(ab,
1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
1262 if (!avail) 1262 if (!avail)
1263 goto out; 1263 goto out_va_end;
1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); 1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
1265 } 1265 }
1266 va_end(args2);
1267 if (len > 0) 1266 if (len > 0)
1268 skb_put(skb, len); 1267 skb_put(skb, len);
1268out_va_end:
1269 va_end(args2);
1269out: 1270out:
1270 return; 1271 return;
1271} 1272}
@@ -1422,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1422 char *p, *pathname; 1423 char *p, *pathname;
1423 1424
1424 if (prefix) 1425 if (prefix)
1425 audit_log_format(ab, " %s", prefix); 1426 audit_log_format(ab, "%s", prefix);
1426 1427
1427 /* We will allow 11 spaces for ' (deleted)' to be appended */ 1428 /* We will allow 11 spaces for ' (deleted)' to be appended */
1428 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); 1429 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
diff --git a/kernel/audit.h b/kernel/audit.h
index 91e7071c4d2c..816766803371 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -36,12 +36,8 @@ enum audit_state {
36 AUDIT_DISABLED, /* Do not create per-task audit_context. 36 AUDIT_DISABLED, /* Do not create per-task audit_context.
37 * No syscall-specific audit records can 37 * No syscall-specific audit records can
38 * be generated. */ 38 * be generated. */
39 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
40 * but don't necessarily fill it in at
41 * syscall entry time (i.e., filter
42 * instead). */
43 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, 39 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
44 * and always fill it in at syscall 40 * and fill it in at syscall
45 * entry time. This makes a full 41 * entry time. This makes a full
46 * syscall record available if some 42 * syscall record available if some
47 * other part of the kernel decides it 43 * other part of the kernel decides it
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f8277c80d678..a6c3f1abd206 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
235 switch(listnr) { 235 switch(listnr) {
236 default: 236 default:
237 goto exit_err; 237 goto exit_err;
238 case AUDIT_FILTER_USER:
239 case AUDIT_FILTER_TYPE:
240#ifdef CONFIG_AUDITSYSCALL 238#ifdef CONFIG_AUDITSYSCALL
241 case AUDIT_FILTER_ENTRY: 239 case AUDIT_FILTER_ENTRY:
240 if (rule->action == AUDIT_ALWAYS)
241 goto exit_err;
242 case AUDIT_FILTER_EXIT: 242 case AUDIT_FILTER_EXIT:
243 case AUDIT_FILTER_TASK: 243 case AUDIT_FILTER_TASK:
244#endif 244#endif
245 case AUDIT_FILTER_USER:
246 case AUDIT_FILTER_TYPE:
245 ; 247 ;
246 } 248 }
247 if (unlikely(rule->action == AUDIT_POSSIBLE)) { 249 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
385 goto exit_free; 387 goto exit_free;
386 break; 388 break;
387 case AUDIT_FILETYPE: 389 case AUDIT_FILETYPE:
388 if ((f->val & ~S_IFMT) > S_IFMT) 390 if (f->val & ~S_IFMT)
389 goto exit_free; 391 goto exit_free;
390 break; 392 break;
391 case AUDIT_INODE: 393 case AUDIT_INODE:
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
459 case AUDIT_ARG1: 461 case AUDIT_ARG1:
460 case AUDIT_ARG2: 462 case AUDIT_ARG2:
461 case AUDIT_ARG3: 463 case AUDIT_ARG3:
464 case AUDIT_OBJ_UID:
465 case AUDIT_OBJ_GID:
462 break; 466 break;
463 case AUDIT_ARCH: 467 case AUDIT_ARCH:
464 entry->rule.arch_f = f; 468 entry->rule.arch_f = f;
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
522 goto exit_free; 526 goto exit_free;
523 break; 527 break;
524 case AUDIT_FILTERKEY: 528 case AUDIT_FILTERKEY:
525 err = -EINVAL;
526 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) 529 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
527 goto exit_free; 530 goto exit_free;
528 str = audit_unpack_string(&bufp, &remain, f->val); 531 str = audit_unpack_string(&bufp, &remain, f->val);
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
536 goto exit_free; 539 goto exit_free;
537 break; 540 break;
538 case AUDIT_FILETYPE: 541 case AUDIT_FILETYPE:
539 if ((f->val & ~S_IFMT) > S_IFMT) 542 if (f->val & ~S_IFMT)
543 goto exit_free;
544 break;
545 case AUDIT_FIELD_COMPARE:
546 if (f->val > AUDIT_MAX_FIELD_COMPARE)
540 goto exit_free; 547 goto exit_free;
541 break; 548 break;
542 default: 549 default:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 47b7fc1ea893..caaea6e944f8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,9 +70,15 @@
70 70
71#include "audit.h" 71#include "audit.h"
72 72
73/* flags stating the success for a syscall */
74#define AUDITSC_INVALID 0
75#define AUDITSC_SUCCESS 1
76#define AUDITSC_FAILURE 2
77
73/* AUDIT_NAMES is the number of slots we reserve in the audit_context 78/* AUDIT_NAMES is the number of slots we reserve in the audit_context
74 * for saving names from getname(). */ 79 * for saving names from getname(). If we get more names we will allocate
75#define AUDIT_NAMES 20 80 * a name dynamically and also add those to the list anchored by names_list. */
81#define AUDIT_NAMES 5
76 82
77/* Indicates that audit should log the full pathname. */ 83/* Indicates that audit should log the full pathname. */
78#define AUDIT_NAME_FULL -1 84#define AUDIT_NAME_FULL -1
@@ -101,9 +107,8 @@ struct audit_cap_data {
101 * 107 *
102 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 108 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
103struct audit_names { 109struct audit_names {
110 struct list_head list; /* audit_context->names_list */
104 const char *name; 111 const char *name;
105 int name_len; /* number of name's characters to log */
106 unsigned name_put; /* call __putname() for this name */
107 unsigned long ino; 112 unsigned long ino;
108 dev_t dev; 113 dev_t dev;
109 umode_t mode; 114 umode_t mode;
@@ -113,6 +118,14 @@ struct audit_names {
113 u32 osid; 118 u32 osid;
114 struct audit_cap_data fcap; 119 struct audit_cap_data fcap;
115 unsigned int fcap_ver; 120 unsigned int fcap_ver;
121 int name_len; /* number of name's characters to log */
122 bool name_put; /* call __putname() for this name */
123 /*
124 * This was an allocated audit_names and not from the array of
125 * names allocated in the task audit context. Thus this name
126 * should be freed on syscall exit
127 */
128 bool should_free;
116}; 129};
117 130
118struct audit_aux_data { 131struct audit_aux_data {
@@ -174,8 +187,17 @@ struct audit_context {
174 long return_code;/* syscall return code */ 187 long return_code;/* syscall return code */
175 u64 prio; 188 u64 prio;
176 int return_valid; /* return code is valid */ 189 int return_valid; /* return code is valid */
177 int name_count; 190 /*
178 struct audit_names names[AUDIT_NAMES]; 191 * The names_list is the list of all audit_names collected during this
192 * syscall. The first AUDIT_NAMES entries in the names_list will
193 * actually be from the preallocated_names array for performance
194 * reasons. Except during allocation they should never be referenced
195 * through the preallocated_names array and should only be found/used
196 * by running the names_list.
197 */
198 struct audit_names preallocated_names[AUDIT_NAMES];
199 int name_count; /* total records in names_list */
200 struct list_head names_list; /* anchor for struct audit_names->list */
179 char * filterkey; /* key for rule that triggered record */ 201 char * filterkey; /* key for rule that triggered record */
180 struct path pwd; 202 struct path pwd;
181 struct audit_context *previous; /* For nested syscalls */ 203 struct audit_context *previous; /* For nested syscalls */
@@ -210,12 +232,12 @@ struct audit_context {
210 struct { 232 struct {
211 uid_t uid; 233 uid_t uid;
212 gid_t gid; 234 gid_t gid;
213 mode_t mode; 235 umode_t mode;
214 u32 osid; 236 u32 osid;
215 int has_perm; 237 int has_perm;
216 uid_t perm_uid; 238 uid_t perm_uid;
217 gid_t perm_gid; 239 gid_t perm_gid;
218 mode_t perm_mode; 240 umode_t perm_mode;
219 unsigned long qbytes; 241 unsigned long qbytes;
220 } ipc; 242 } ipc;
221 struct { 243 struct {
@@ -234,7 +256,7 @@ struct audit_context {
234 } mq_sendrecv; 256 } mq_sendrecv;
235 struct { 257 struct {
236 int oflag; 258 int oflag;
237 mode_t mode; 259 umode_t mode;
238 struct mq_attr attr; 260 struct mq_attr attr;
239 } mq_open; 261 } mq_open;
240 struct { 262 struct {
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
305 } 327 }
306} 328}
307 329
308static int audit_match_filetype(struct audit_context *ctx, int which) 330static int audit_match_filetype(struct audit_context *ctx, int val)
309{ 331{
310 unsigned index = which & ~S_IFMT; 332 struct audit_names *n;
311 mode_t mode = which & S_IFMT; 333 umode_t mode = (umode_t)val;
312 334
313 if (unlikely(!ctx)) 335 if (unlikely(!ctx))
314 return 0; 336 return 0;
315 337
316 if (index >= ctx->name_count) 338 list_for_each_entry(n, &ctx->names_list, list) {
317 return 0; 339 if ((n->ino != -1) &&
318 if (ctx->names[index].ino == -1) 340 ((n->mode & S_IFMT) == mode))
319 return 0; 341 return 1;
320 if ((ctx->names[index].mode ^ mode) & S_IFMT) 342 }
321 return 0; 343
322 return 1; 344 return 0;
323} 345}
324 346
325/* 347/*
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
441 return 0; 463 return 0;
442} 464}
443 465
466static int audit_compare_id(uid_t uid1,
467 struct audit_names *name,
468 unsigned long name_offset,
469 struct audit_field *f,
470 struct audit_context *ctx)
471{
472 struct audit_names *n;
473 unsigned long addr;
474 uid_t uid2;
475 int rc;
476
477 BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
478
479 if (name) {
480 addr = (unsigned long)name;
481 addr += name_offset;
482
483 uid2 = *(uid_t *)addr;
484 rc = audit_comparator(uid1, f->op, uid2);
485 if (rc)
486 return rc;
487 }
488
489 if (ctx) {
490 list_for_each_entry(n, &ctx->names_list, list) {
491 addr = (unsigned long)n;
492 addr += name_offset;
493
494 uid2 = *(uid_t *)addr;
495
496 rc = audit_comparator(uid1, f->op, uid2);
497 if (rc)
498 return rc;
499 }
500 }
501 return 0;
502}
503
504static int audit_field_compare(struct task_struct *tsk,
505 const struct cred *cred,
506 struct audit_field *f,
507 struct audit_context *ctx,
508 struct audit_names *name)
509{
510 switch (f->val) {
511 /* process to file object comparisons */
512 case AUDIT_COMPARE_UID_TO_OBJ_UID:
513 return audit_compare_id(cred->uid,
514 name, offsetof(struct audit_names, uid),
515 f, ctx);
516 case AUDIT_COMPARE_GID_TO_OBJ_GID:
517 return audit_compare_id(cred->gid,
518 name, offsetof(struct audit_names, gid),
519 f, ctx);
520 case AUDIT_COMPARE_EUID_TO_OBJ_UID:
521 return audit_compare_id(cred->euid,
522 name, offsetof(struct audit_names, uid),
523 f, ctx);
524 case AUDIT_COMPARE_EGID_TO_OBJ_GID:
525 return audit_compare_id(cred->egid,
526 name, offsetof(struct audit_names, gid),
527 f, ctx);
528 case AUDIT_COMPARE_AUID_TO_OBJ_UID:
529 return audit_compare_id(tsk->loginuid,
530 name, offsetof(struct audit_names, uid),
531 f, ctx);
532 case AUDIT_COMPARE_SUID_TO_OBJ_UID:
533 return audit_compare_id(cred->suid,
534 name, offsetof(struct audit_names, uid),
535 f, ctx);
536 case AUDIT_COMPARE_SGID_TO_OBJ_GID:
537 return audit_compare_id(cred->sgid,
538 name, offsetof(struct audit_names, gid),
539 f, ctx);
540 case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
541 return audit_compare_id(cred->fsuid,
542 name, offsetof(struct audit_names, uid),
543 f, ctx);
544 case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
545 return audit_compare_id(cred->fsgid,
546 name, offsetof(struct audit_names, gid),
547 f, ctx);
548 /* uid comparisons */
549 case AUDIT_COMPARE_UID_TO_AUID:
550 return audit_comparator(cred->uid, f->op, tsk->loginuid);
551 case AUDIT_COMPARE_UID_TO_EUID:
552 return audit_comparator(cred->uid, f->op, cred->euid);
553 case AUDIT_COMPARE_UID_TO_SUID:
554 return audit_comparator(cred->uid, f->op, cred->suid);
555 case AUDIT_COMPARE_UID_TO_FSUID:
556 return audit_comparator(cred->uid, f->op, cred->fsuid);
557 /* auid comparisons */
558 case AUDIT_COMPARE_AUID_TO_EUID:
559 return audit_comparator(tsk->loginuid, f->op, cred->euid);
560 case AUDIT_COMPARE_AUID_TO_SUID:
561 return audit_comparator(tsk->loginuid, f->op, cred->suid);
562 case AUDIT_COMPARE_AUID_TO_FSUID:
563 return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
564 /* euid comparisons */
565 case AUDIT_COMPARE_EUID_TO_SUID:
566 return audit_comparator(cred->euid, f->op, cred->suid);
567 case AUDIT_COMPARE_EUID_TO_FSUID:
568 return audit_comparator(cred->euid, f->op, cred->fsuid);
569 /* suid comparisons */
570 case AUDIT_COMPARE_SUID_TO_FSUID:
571 return audit_comparator(cred->suid, f->op, cred->fsuid);
572 /* gid comparisons */
573 case AUDIT_COMPARE_GID_TO_EGID:
574 return audit_comparator(cred->gid, f->op, cred->egid);
575 case AUDIT_COMPARE_GID_TO_SGID:
576 return audit_comparator(cred->gid, f->op, cred->sgid);
577 case AUDIT_COMPARE_GID_TO_FSGID:
578 return audit_comparator(cred->gid, f->op, cred->fsgid);
579 /* egid comparisons */
580 case AUDIT_COMPARE_EGID_TO_SGID:
581 return audit_comparator(cred->egid, f->op, cred->sgid);
582 case AUDIT_COMPARE_EGID_TO_FSGID:
583 return audit_comparator(cred->egid, f->op, cred->fsgid);
584 /* sgid comparison */
585 case AUDIT_COMPARE_SGID_TO_FSGID:
586 return audit_comparator(cred->sgid, f->op, cred->fsgid);
587 default:
588 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
589 return 0;
590 }
591 return 0;
592}
593
444/* Determine if any context name data matches a rule's watch data */ 594/* Determine if any context name data matches a rule's watch data */
445/* Compare a task_struct with an audit_rule. Return 1 on match, 0 595/* Compare a task_struct with an audit_rule. Return 1 on match, 0
446 * otherwise. 596 * otherwise.
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk,
457 bool task_creation) 607 bool task_creation)
458{ 608{
459 const struct cred *cred; 609 const struct cred *cred;
460 int i, j, need_sid = 1; 610 int i, need_sid = 1;
461 u32 sid; 611 u32 sid;
462 612
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); 613 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464 614
465 for (i = 0; i < rule->field_count; i++) { 615 for (i = 0; i < rule->field_count; i++) {
466 struct audit_field *f = &rule->fields[i]; 616 struct audit_field *f = &rule->fields[i];
617 struct audit_names *n;
467 int result = 0; 618 int result = 0;
468 619
469 switch (f->type) { 620 switch (f->type) {
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk,
522 } 673 }
523 break; 674 break;
524 case AUDIT_DEVMAJOR: 675 case AUDIT_DEVMAJOR:
525 if (name) 676 if (name) {
526 result = audit_comparator(MAJOR(name->dev), 677 if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
527 f->op, f->val); 678 audit_comparator(MAJOR(name->rdev), f->op, f->val))
528 else if (ctx) { 679 ++result;
529 for (j = 0; j < ctx->name_count; j++) { 680 } else if (ctx) {
530 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { 681 list_for_each_entry(n, &ctx->names_list, list) {
682 if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
683 audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
531 ++result; 684 ++result;
532 break; 685 break;
533 } 686 }
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk,
535 } 688 }
536 break; 689 break;
537 case AUDIT_DEVMINOR: 690 case AUDIT_DEVMINOR:
538 if (name) 691 if (name) {
539 result = audit_comparator(MINOR(name->dev), 692 if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
540 f->op, f->val); 693 audit_comparator(MINOR(name->rdev), f->op, f->val))
541 else if (ctx) { 694 ++result;
542 for (j = 0; j < ctx->name_count; j++) { 695 } else if (ctx) {
543 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { 696 list_for_each_entry(n, &ctx->names_list, list) {
697 if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
698 audit_comparator(MINOR(n->rdev), f->op, f->val)) {
544 ++result; 699 ++result;
545 break; 700 break;
546 } 701 }
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk,
551 if (name) 706 if (name)
552 result = (name->ino == f->val); 707 result = (name->ino == f->val);
553 else if (ctx) { 708 else if (ctx) {
554 for (j = 0; j < ctx->name_count; j++) { 709 list_for_each_entry(n, &ctx->names_list, list) {
555 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { 710 if (audit_comparator(n->ino, f->op, f->val)) {
711 ++result;
712 break;
713 }
714 }
715 }
716 break;
717 case AUDIT_OBJ_UID:
718 if (name) {
719 result = audit_comparator(name->uid, f->op, f->val);
720 } else if (ctx) {
721 list_for_each_entry(n, &ctx->names_list, list) {
722 if (audit_comparator(n->uid, f->op, f->val)) {
723 ++result;
724 break;
725 }
726 }
727 }
728 break;
729 case AUDIT_OBJ_GID:
730 if (name) {
731 result = audit_comparator(name->gid, f->op, f->val);
732 } else if (ctx) {
733 list_for_each_entry(n, &ctx->names_list, list) {
734 if (audit_comparator(n->gid, f->op, f->val)) {
556 ++result; 735 ++result;
557 break; 736 break;
558 } 737 }
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk,
607 name->osid, f->type, f->op, 786 name->osid, f->type, f->op,
608 f->lsm_rule, ctx); 787 f->lsm_rule, ctx);
609 } else if (ctx) { 788 } else if (ctx) {
610 for (j = 0; j < ctx->name_count; j++) { 789 list_for_each_entry(n, &ctx->names_list, list) {
611 if (security_audit_rule_match( 790 if (security_audit_rule_match(n->osid, f->type,
612 ctx->names[j].osid, 791 f->op, f->lsm_rule,
613 f->type, f->op, 792 ctx)) {
614 f->lsm_rule, ctx)) {
615 ++result; 793 ++result;
616 break; 794 break;
617 } 795 }
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk,
643 case AUDIT_FILETYPE: 821 case AUDIT_FILETYPE:
644 result = audit_match_filetype(ctx, f->val); 822 result = audit_match_filetype(ctx, f->val);
645 break; 823 break;
824 case AUDIT_FIELD_COMPARE:
825 result = audit_field_compare(tsk, cred, f, ctx, name);
826 break;
646 } 827 }
647
648 if (!result) 828 if (!result)
649 return 0; 829 return 0;
650 } 830 }
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
722 return AUDIT_BUILD_CONTEXT; 902 return AUDIT_BUILD_CONTEXT;
723} 903}
724 904
725/* At syscall exit time, this filter is called if any audit_names[] have been 905/*
906 * Given an audit_name check the inode hash table to see if they match.
907 * Called holding the rcu read lock to protect the use of audit_inode_hash
908 */
909static int audit_filter_inode_name(struct task_struct *tsk,
910 struct audit_names *n,
911 struct audit_context *ctx) {
912 int word, bit;
913 int h = audit_hash_ino((u32)n->ino);
914 struct list_head *list = &audit_inode_hash[h];
915 struct audit_entry *e;
916 enum audit_state state;
917
918 word = AUDIT_WORD(ctx->major);
919 bit = AUDIT_BIT(ctx->major);
920
921 if (list_empty(list))
922 return 0;
923
924 list_for_each_entry_rcu(e, list, list) {
925 if ((e->rule.mask[word] & bit) == bit &&
926 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
927 ctx->current_state = state;
928 return 1;
929 }
930 }
931
932 return 0;
933}
934
935/* At syscall exit time, this filter is called if any audit_names have been
726 * collected during syscall processing. We only check rules in sublists at hash 936 * collected during syscall processing. We only check rules in sublists at hash
727 * buckets applicable to the inode numbers in audit_names[]. 937 * buckets applicable to the inode numbers in audit_names.
728 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 938 * Regarding audit_state, same rules apply as for audit_filter_syscall().
729 */ 939 */
730void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) 940void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
731{ 941{
732 int i; 942 struct audit_names *n;
733 struct audit_entry *e;
734 enum audit_state state;
735 943
736 if (audit_pid && tsk->tgid == audit_pid) 944 if (audit_pid && tsk->tgid == audit_pid)
737 return; 945 return;
738 946
739 rcu_read_lock(); 947 rcu_read_lock();
740 for (i = 0; i < ctx->name_count; i++) {
741 int word = AUDIT_WORD(ctx->major);
742 int bit = AUDIT_BIT(ctx->major);
743 struct audit_names *n = &ctx->names[i];
744 int h = audit_hash_ino((u32)n->ino);
745 struct list_head *list = &audit_inode_hash[h];
746
747 if (list_empty(list))
748 continue;
749 948
750 list_for_each_entry_rcu(e, list, list) { 949 list_for_each_entry(n, &ctx->names_list, list) {
751 if ((e->rule.mask[word] & bit) == bit && 950 if (audit_filter_inode_name(tsk, n, ctx))
752 audit_filter_rules(tsk, &e->rule, ctx, n, 951 break;
753 &state, false)) {
754 rcu_read_unlock();
755 ctx->current_state = state;
756 return;
757 }
758 }
759 } 952 }
760 rcu_read_unlock(); 953 rcu_read_unlock();
761} 954}
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
766{ 959{
767 struct audit_context *context = tsk->audit_context; 960 struct audit_context *context = tsk->audit_context;
768 961
769 if (likely(!context)) 962 if (!context)
770 return NULL; 963 return NULL;
771 context->return_valid = return_valid; 964 context->return_valid = return_valid;
772 965
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
799 992
800static inline void audit_free_names(struct audit_context *context) 993static inline void audit_free_names(struct audit_context *context)
801{ 994{
802 int i; 995 struct audit_names *n, *next;
803 996
804#if AUDIT_DEBUG == 2 997#if AUDIT_DEBUG == 2
805 if (context->put_count + context->ino_count != context->name_count) { 998 if (context->put_count + context->ino_count != context->name_count) {
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context)
810 context->serial, context->major, context->in_syscall, 1003 context->serial, context->major, context->in_syscall,
811 context->name_count, context->put_count, 1004 context->name_count, context->put_count,
812 context->ino_count); 1005 context->ino_count);
813 for (i = 0; i < context->name_count; i++) { 1006 list_for_each_entry(n, &context->names_list, list) {
814 printk(KERN_ERR "names[%d] = %p = %s\n", i, 1007 printk(KERN_ERR "names[%d] = %p = %s\n", i,
815 context->names[i].name, 1008 n->name, n->name ?: "(null)");
816 context->names[i].name ?: "(null)");
817 } 1009 }
818 dump_stack(); 1010 dump_stack();
819 return; 1011 return;
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context)
824 context->ino_count = 0; 1016 context->ino_count = 0;
825#endif 1017#endif
826 1018
827 for (i = 0; i < context->name_count; i++) { 1019 list_for_each_entry_safe(n, next, &context->names_list, list) {
828 if (context->names[i].name && context->names[i].name_put) 1020 list_del(&n->list);
829 __putname(context->names[i].name); 1021 if (n->name && n->name_put)
1022 __putname(n->name);
1023 if (n->should_free)
1024 kfree(n);
830 } 1025 }
831 context->name_count = 0; 1026 context->name_count = 0;
832 path_put(&context->pwd); 1027 path_put(&context->pwd);
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
864 return NULL; 1059 return NULL;
865 audit_zero_context(context, state); 1060 audit_zero_context(context, state);
866 INIT_LIST_HEAD(&context->killed_trees); 1061 INIT_LIST_HEAD(&context->killed_trees);
1062 INIT_LIST_HEAD(&context->names_list);
867 return context; 1063 return context;
868} 1064}
869 1065
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk)
886 return 0; /* Return if not auditing. */ 1082 return 0; /* Return if not auditing. */
887 1083
888 state = audit_filter_task(tsk, &key); 1084 state = audit_filter_task(tsk, &key);
889 if (likely(state == AUDIT_DISABLED)) 1085 if (state == AUDIT_DISABLED)
890 return 0; 1086 return 0;
891 1087
892 if (!(context = audit_alloc_context(state))) { 1088 if (!(context = audit_alloc_context(state))) {
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
975 while (vma) { 1171 while (vma) {
976 if ((vma->vm_flags & VM_EXECUTABLE) && 1172 if ((vma->vm_flags & VM_EXECUTABLE) &&
977 vma->vm_file) { 1173 vma->vm_file) {
978 audit_log_d_path(ab, "exe=", 1174 audit_log_d_path(ab, " exe=",
979 &vma->vm_file->f_path); 1175 &vma->vm_file->f_path);
980 break; 1176 break;
981 } 1177 }
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context,
1166 struct audit_buffer **ab, 1362 struct audit_buffer **ab,
1167 struct audit_aux_data_execve *axi) 1363 struct audit_aux_data_execve *axi)
1168{ 1364{
1169 int i; 1365 int i, len;
1170 size_t len, len_sent = 0; 1366 size_t len_sent = 0;
1171 const char __user *p; 1367 const char __user *p;
1172 char *buf; 1368 char *buf;
1173 1369
@@ -1249,7 +1445,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1249 case AUDIT_IPC: { 1445 case AUDIT_IPC: {
1250 u32 osid = context->ipc.osid; 1446 u32 osid = context->ipc.osid;
1251 1447
1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", 1448 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
1253 context->ipc.uid, context->ipc.gid, context->ipc.mode); 1449 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1254 if (osid) { 1450 if (osid) {
1255 char *ctx = NULL; 1451 char *ctx = NULL;
@@ -1267,7 +1463,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1267 ab = audit_log_start(context, GFP_KERNEL, 1463 ab = audit_log_start(context, GFP_KERNEL,
1268 AUDIT_IPC_SET_PERM); 1464 AUDIT_IPC_SET_PERM);
1269 audit_log_format(ab, 1465 audit_log_format(ab,
1270 "qbytes=%lx ouid=%u ogid=%u mode=%#o", 1466 "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
1271 context->ipc.qbytes, 1467 context->ipc.qbytes,
1272 context->ipc.perm_uid, 1468 context->ipc.perm_uid,
1273 context->ipc.perm_gid, 1469 context->ipc.perm_gid,
@@ -1278,7 +1474,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1278 break; } 1474 break; }
1279 case AUDIT_MQ_OPEN: { 1475 case AUDIT_MQ_OPEN: {
1280 audit_log_format(ab, 1476 audit_log_format(ab,
1281 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " 1477 "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
1282 "mq_msgsize=%ld mq_curmsgs=%ld", 1478 "mq_msgsize=%ld mq_curmsgs=%ld",
1283 context->mq_open.oflag, context->mq_open.mode, 1479 context->mq_open.oflag, context->mq_open.mode,
1284 context->mq_open.attr.mq_flags, 1480 context->mq_open.attr.mq_flags,
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic)
1324 audit_log_end(ab); 1520 audit_log_end(ab);
1325} 1521}
1326 1522
1523static void audit_log_name(struct audit_context *context, struct audit_names *n,
1524 int record_num, int *call_panic)
1525{
1526 struct audit_buffer *ab;
1527 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1528 if (!ab)
1529 return; /* audit_panic has been called */
1530
1531 audit_log_format(ab, "item=%d", record_num);
1532
1533 if (n->name) {
1534 switch (n->name_len) {
1535 case AUDIT_NAME_FULL:
1536 /* log the full path */
1537 audit_log_format(ab, " name=");
1538 audit_log_untrustedstring(ab, n->name);
1539 break;
1540 case 0:
1541 /* name was specified as a relative path and the
1542 * directory component is the cwd */
1543 audit_log_d_path(ab, " name=", &context->pwd);
1544 break;
1545 default:
1546 /* log the name's directory component */
1547 audit_log_format(ab, " name=");
1548 audit_log_n_untrustedstring(ab, n->name,
1549 n->name_len);
1550 }
1551 } else
1552 audit_log_format(ab, " name=(null)");
1553
1554 if (n->ino != (unsigned long)-1) {
1555 audit_log_format(ab, " inode=%lu"
1556 " dev=%02x:%02x mode=%#ho"
1557 " ouid=%u ogid=%u rdev=%02x:%02x",
1558 n->ino,
1559 MAJOR(n->dev),
1560 MINOR(n->dev),
1561 n->mode,
1562 n->uid,
1563 n->gid,
1564 MAJOR(n->rdev),
1565 MINOR(n->rdev));
1566 }
1567 if (n->osid != 0) {
1568 char *ctx = NULL;
1569 u32 len;
1570 if (security_secid_to_secctx(
1571 n->osid, &ctx, &len)) {
1572 audit_log_format(ab, " osid=%u", n->osid);
1573 *call_panic = 2;
1574 } else {
1575 audit_log_format(ab, " obj=%s", ctx);
1576 security_release_secctx(ctx, len);
1577 }
1578 }
1579
1580 audit_log_fcaps(ab, n);
1581
1582 audit_log_end(ab);
1583}
1584
1327static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1585static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1328{ 1586{
1329 const struct cred *cred; 1587 const struct cred *cred;
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1331 struct audit_buffer *ab; 1589 struct audit_buffer *ab;
1332 struct audit_aux_data *aux; 1590 struct audit_aux_data *aux;
1333 const char *tty; 1591 const char *tty;
1592 struct audit_names *n;
1334 1593
1335 /* tsk == current */ 1594 /* tsk == current */
1336 context->pid = tsk->pid; 1595 context->pid = tsk->pid;
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1466 if (context->pwd.dentry && context->pwd.mnt) { 1725 if (context->pwd.dentry && context->pwd.mnt) {
1467 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); 1726 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
1468 if (ab) { 1727 if (ab) {
1469 audit_log_d_path(ab, "cwd=", &context->pwd); 1728 audit_log_d_path(ab, " cwd=", &context->pwd);
1470 audit_log_end(ab); 1729 audit_log_end(ab);
1471 } 1730 }
1472 } 1731 }
1473 for (i = 0; i < context->name_count; i++) {
1474 struct audit_names *n = &context->names[i];
1475 1732
1476 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 1733 i = 0;
1477 if (!ab) 1734 list_for_each_entry(n, &context->names_list, list)
1478 continue; /* audit_panic has been called */ 1735 audit_log_name(context, n, i++, &call_panic);
1479
1480 audit_log_format(ab, "item=%d", i);
1481
1482 if (n->name) {
1483 switch(n->name_len) {
1484 case AUDIT_NAME_FULL:
1485 /* log the full path */
1486 audit_log_format(ab, " name=");
1487 audit_log_untrustedstring(ab, n->name);
1488 break;
1489 case 0:
1490 /* name was specified as a relative path and the
1491 * directory component is the cwd */
1492 audit_log_d_path(ab, "name=", &context->pwd);
1493 break;
1494 default:
1495 /* log the name's directory component */
1496 audit_log_format(ab, " name=");
1497 audit_log_n_untrustedstring(ab, n->name,
1498 n->name_len);
1499 }
1500 } else
1501 audit_log_format(ab, " name=(null)");
1502
1503 if (n->ino != (unsigned long)-1) {
1504 audit_log_format(ab, " inode=%lu"
1505 " dev=%02x:%02x mode=%#o"
1506 " ouid=%u ogid=%u rdev=%02x:%02x",
1507 n->ino,
1508 MAJOR(n->dev),
1509 MINOR(n->dev),
1510 n->mode,
1511 n->uid,
1512 n->gid,
1513 MAJOR(n->rdev),
1514 MINOR(n->rdev));
1515 }
1516 if (n->osid != 0) {
1517 char *ctx = NULL;
1518 u32 len;
1519 if (security_secid_to_secctx(
1520 n->osid, &ctx, &len)) {
1521 audit_log_format(ab, " osid=%u", n->osid);
1522 call_panic = 2;
1523 } else {
1524 audit_log_format(ab, " obj=%s", ctx);
1525 security_release_secctx(ctx, len);
1526 }
1527 }
1528
1529 audit_log_fcaps(ab, n);
1530
1531 audit_log_end(ab);
1532 }
1533 1736
1534 /* Send end of event record to help user space know we are finished */ 1737 /* Send end of event record to help user space know we are finished */
1535 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); 1738 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1545 * 1748 *
1546 * Called from copy_process and do_exit 1749 * Called from copy_process and do_exit
1547 */ 1750 */
1548void audit_free(struct task_struct *tsk) 1751void __audit_free(struct task_struct *tsk)
1549{ 1752{
1550 struct audit_context *context; 1753 struct audit_context *context;
1551 1754
1552 context = audit_get_context(tsk, 0, 0); 1755 context = audit_get_context(tsk, 0, 0);
1553 if (likely(!context)) 1756 if (!context)
1554 return; 1757 return;
1555 1758
1556 /* Check for system calls that do not go through the exit 1759 /* Check for system calls that do not go through the exit
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk)
1583 * will only be written if another part of the kernel requests that it 1786 * will only be written if another part of the kernel requests that it
1584 * be written). 1787 * be written).
1585 */ 1788 */
1586void audit_syscall_entry(int arch, int major, 1789void __audit_syscall_entry(int arch, int major,
1587 unsigned long a1, unsigned long a2, 1790 unsigned long a1, unsigned long a2,
1588 unsigned long a3, unsigned long a4) 1791 unsigned long a3, unsigned long a4)
1589{ 1792{
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major,
1591 struct audit_context *context = tsk->audit_context; 1794 struct audit_context *context = tsk->audit_context;
1592 enum audit_state state; 1795 enum audit_state state;
1593 1796
1594 if (unlikely(!context)) 1797 if (!context)
1595 return; 1798 return;
1596 1799
1597 /* 1800 /*
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major,
1648 context->prio = 0; 1851 context->prio = 0;
1649 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1852 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1650 } 1853 }
1651 if (likely(state == AUDIT_DISABLED)) 1854 if (state == AUDIT_DISABLED)
1652 return; 1855 return;
1653 1856
1654 context->serial = 0; 1857 context->serial = 0;
@@ -1658,30 +1861,9 @@ void audit_syscall_entry(int arch, int major,
1658 context->ppid = 0; 1861 context->ppid = 0;
1659} 1862}
1660 1863
1661void audit_finish_fork(struct task_struct *child)
1662{
1663 struct audit_context *ctx = current->audit_context;
1664 struct audit_context *p = child->audit_context;
1665 if (!p || !ctx)
1666 return;
1667 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1668 return;
1669 p->arch = ctx->arch;
1670 p->major = ctx->major;
1671 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1672 p->ctime = ctx->ctime;
1673 p->dummy = ctx->dummy;
1674 p->in_syscall = ctx->in_syscall;
1675 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1676 p->ppid = current->pid;
1677 p->prio = ctx->prio;
1678 p->current_state = ctx->current_state;
1679}
1680
1681/** 1864/**
1682 * audit_syscall_exit - deallocate audit context after a system call 1865 * audit_syscall_exit - deallocate audit context after a system call
1683 * @valid: success/failure flag 1866 * @pt_regs: syscall registers
1684 * @return_code: syscall return value
1685 * 1867 *
1686 * Tear down after system call. If the audit context has been marked as 1868 * Tear down after system call. If the audit context has been marked as
1687 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 1869 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
@@ -1689,14 +1871,18 @@ void audit_finish_fork(struct task_struct *child)
1689 * message), then write out the syscall information. In call cases, 1871 * message), then write out the syscall information. In call cases,
1690 * free the names stored from getname(). 1872 * free the names stored from getname().
1691 */ 1873 */
1692void audit_syscall_exit(int valid, long return_code) 1874void __audit_syscall_exit(int success, long return_code)
1693{ 1875{
1694 struct task_struct *tsk = current; 1876 struct task_struct *tsk = current;
1695 struct audit_context *context; 1877 struct audit_context *context;
1696 1878
1697 context = audit_get_context(tsk, valid, return_code); 1879 if (success)
1880 success = AUDITSC_SUCCESS;
1881 else
1882 success = AUDITSC_FAILURE;
1698 1883
1699 if (likely(!context)) 1884 context = audit_get_context(tsk, success, return_code);
1885 if (!context)
1700 return; 1886 return;
1701 1887
1702 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1888 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1821,6 +2007,30 @@ retry:
1821#endif 2007#endif
1822} 2008}
1823 2009
2010static struct audit_names *audit_alloc_name(struct audit_context *context)
2011{
2012 struct audit_names *aname;
2013
2014 if (context->name_count < AUDIT_NAMES) {
2015 aname = &context->preallocated_names[context->name_count];
2016 memset(aname, 0, sizeof(*aname));
2017 } else {
2018 aname = kzalloc(sizeof(*aname), GFP_NOFS);
2019 if (!aname)
2020 return NULL;
2021 aname->should_free = true;
2022 }
2023
2024 aname->ino = (unsigned long)-1;
2025 list_add_tail(&aname->list, &context->names_list);
2026
2027 context->name_count++;
2028#if AUDIT_DEBUG
2029 context->ino_count++;
2030#endif
2031 return aname;
2032}
2033
1824/** 2034/**
1825 * audit_getname - add a name to the list 2035 * audit_getname - add a name to the list
1826 * @name: name to add 2036 * @name: name to add
@@ -1831,9 +2041,7 @@ retry:
1831void __audit_getname(const char *name) 2041void __audit_getname(const char *name)
1832{ 2042{
1833 struct audit_context *context = current->audit_context; 2043 struct audit_context *context = current->audit_context;
1834 2044 struct audit_names *n;
1835 if (IS_ERR(name) || !name)
1836 return;
1837 2045
1838 if (!context->in_syscall) { 2046 if (!context->in_syscall) {
1839#if AUDIT_DEBUG == 2 2047#if AUDIT_DEBUG == 2
@@ -1843,13 +2051,15 @@ void __audit_getname(const char *name)
1843#endif 2051#endif
1844 return; 2052 return;
1845 } 2053 }
1846 BUG_ON(context->name_count >= AUDIT_NAMES); 2054
1847 context->names[context->name_count].name = name; 2055 n = audit_alloc_name(context);
1848 context->names[context->name_count].name_len = AUDIT_NAME_FULL; 2056 if (!n)
1849 context->names[context->name_count].name_put = 1; 2057 return;
1850 context->names[context->name_count].ino = (unsigned long)-1; 2058
1851 context->names[context->name_count].osid = 0; 2059 n->name = name;
1852 ++context->name_count; 2060 n->name_len = AUDIT_NAME_FULL;
2061 n->name_put = true;
2062
1853 if (!context->pwd.dentry) 2063 if (!context->pwd.dentry)
1854 get_fs_pwd(current->fs, &context->pwd); 2064 get_fs_pwd(current->fs, &context->pwd);
1855} 2065}
@@ -1871,12 +2081,13 @@ void audit_putname(const char *name)
1871 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", 2081 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
1872 __FILE__, __LINE__, context->serial, name); 2082 __FILE__, __LINE__, context->serial, name);
1873 if (context->name_count) { 2083 if (context->name_count) {
2084 struct audit_names *n;
1874 int i; 2085 int i;
1875 for (i = 0; i < context->name_count; i++) 2086
2087 list_for_each_entry(n, &context->names_list, list)
1876 printk(KERN_ERR "name[%d] = %p = %s\n", i, 2088 printk(KERN_ERR "name[%d] = %p = %s\n", i,
1877 context->names[i].name, 2089 n->name, n->name ?: "(null)");
1878 context->names[i].name ?: "(null)"); 2090 }
1879 }
1880#endif 2091#endif
1881 __putname(name); 2092 __putname(name);
1882 } 2093 }
@@ -1897,39 +2108,11 @@ void audit_putname(const char *name)
1897#endif 2108#endif
1898} 2109}
1899 2110
1900static int audit_inc_name_count(struct audit_context *context,
1901 const struct inode *inode)
1902{
1903 if (context->name_count >= AUDIT_NAMES) {
1904 if (inode)
1905 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1906 "dev=%02x:%02x, inode=%lu\n",
1907 MAJOR(inode->i_sb->s_dev),
1908 MINOR(inode->i_sb->s_dev),
1909 inode->i_ino);
1910
1911 else
1912 printk(KERN_DEBUG "name_count maxed, losing inode data\n");
1913 return 1;
1914 }
1915 context->name_count++;
1916#if AUDIT_DEBUG
1917 context->ino_count++;
1918#endif
1919 return 0;
1920}
1921
1922
1923static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) 2111static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
1924{ 2112{
1925 struct cpu_vfs_cap_data caps; 2113 struct cpu_vfs_cap_data caps;
1926 int rc; 2114 int rc;
1927 2115
1928 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1929 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1930 name->fcap.fE = 0;
1931 name->fcap_ver = 0;
1932
1933 if (!dentry) 2116 if (!dentry)
1934 return 0; 2117 return 0;
1935 2118
@@ -1969,30 +2152,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
1969 */ 2152 */
1970void __audit_inode(const char *name, const struct dentry *dentry) 2153void __audit_inode(const char *name, const struct dentry *dentry)
1971{ 2154{
1972 int idx;
1973 struct audit_context *context = current->audit_context; 2155 struct audit_context *context = current->audit_context;
1974 const struct inode *inode = dentry->d_inode; 2156 const struct inode *inode = dentry->d_inode;
2157 struct audit_names *n;
1975 2158
1976 if (!context->in_syscall) 2159 if (!context->in_syscall)
1977 return; 2160 return;
1978 if (context->name_count 2161
1979 && context->names[context->name_count-1].name 2162 list_for_each_entry_reverse(n, &context->names_list, list) {
1980 && context->names[context->name_count-1].name == name) 2163 if (n->name && (n->name == name))
1981 idx = context->name_count - 1; 2164 goto out;
1982 else if (context->name_count > 1
1983 && context->names[context->name_count-2].name
1984 && context->names[context->name_count-2].name == name)
1985 idx = context->name_count - 2;
1986 else {
1987 /* FIXME: how much do we care about inodes that have no
1988 * associated name? */
1989 if (audit_inc_name_count(context, inode))
1990 return;
1991 idx = context->name_count - 1;
1992 context->names[idx].name = NULL;
1993 } 2165 }
2166
2167 /* unable to find the name from a previous getname() */
2168 n = audit_alloc_name(context);
2169 if (!n)
2170 return;
2171out:
1994 handle_path(dentry); 2172 handle_path(dentry);
1995 audit_copy_inode(&context->names[idx], dentry, inode); 2173 audit_copy_inode(n, dentry, inode);
1996} 2174}
1997 2175
1998/** 2176/**
@@ -2011,11 +2189,11 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2011void __audit_inode_child(const struct dentry *dentry, 2189void __audit_inode_child(const struct dentry *dentry,
2012 const struct inode *parent) 2190 const struct inode *parent)
2013{ 2191{
2014 int idx;
2015 struct audit_context *context = current->audit_context; 2192 struct audit_context *context = current->audit_context;
2016 const char *found_parent = NULL, *found_child = NULL; 2193 const char *found_parent = NULL, *found_child = NULL;
2017 const struct inode *inode = dentry->d_inode; 2194 const struct inode *inode = dentry->d_inode;
2018 const char *dname = dentry->d_name.name; 2195 const char *dname = dentry->d_name.name;
2196 struct audit_names *n;
2019 int dirlen = 0; 2197 int dirlen = 0;
2020 2198
2021 if (!context->in_syscall) 2199 if (!context->in_syscall)
@@ -2025,9 +2203,7 @@ void __audit_inode_child(const struct dentry *dentry,
2025 handle_one(inode); 2203 handle_one(inode);
2026 2204
2027 /* parent is more likely, look for it first */ 2205 /* parent is more likely, look for it first */
2028 for (idx = 0; idx < context->name_count; idx++) { 2206 list_for_each_entry(n, &context->names_list, list) {
2029 struct audit_names *n = &context->names[idx];
2030
2031 if (!n->name) 2207 if (!n->name)
2032 continue; 2208 continue;
2033 2209
@@ -2040,9 +2216,7 @@ void __audit_inode_child(const struct dentry *dentry,
2040 } 2216 }
2041 2217
2042 /* no matching parent, look for matching child */ 2218 /* no matching parent, look for matching child */
2043 for (idx = 0; idx < context->name_count; idx++) { 2219 list_for_each_entry(n, &context->names_list, list) {
2044 struct audit_names *n = &context->names[idx];
2045
2046 if (!n->name) 2220 if (!n->name)
2047 continue; 2221 continue;
2048 2222
@@ -2060,34 +2234,29 @@ void __audit_inode_child(const struct dentry *dentry,
2060 2234
2061add_names: 2235add_names:
2062 if (!found_parent) { 2236 if (!found_parent) {
2063 if (audit_inc_name_count(context, parent)) 2237 n = audit_alloc_name(context);
2238 if (!n)
2064 return; 2239 return;
2065 idx = context->name_count - 1; 2240 audit_copy_inode(n, NULL, parent);
2066 context->names[idx].name = NULL;
2067 audit_copy_inode(&context->names[idx], NULL, parent);
2068 } 2241 }
2069 2242
2070 if (!found_child) { 2243 if (!found_child) {
2071 if (audit_inc_name_count(context, inode)) 2244 n = audit_alloc_name(context);
2245 if (!n)
2072 return; 2246 return;
2073 idx = context->name_count - 1;
2074 2247
2075 /* Re-use the name belonging to the slot for a matching parent 2248 /* Re-use the name belonging to the slot for a matching parent
2076 * directory. All names for this context are relinquished in 2249 * directory. All names for this context are relinquished in
2077 * audit_free_names() */ 2250 * audit_free_names() */
2078 if (found_parent) { 2251 if (found_parent) {
2079 context->names[idx].name = found_parent; 2252 n->name = found_parent;
2080 context->names[idx].name_len = AUDIT_NAME_FULL; 2253 n->name_len = AUDIT_NAME_FULL;
2081 /* don't call __putname() */ 2254 /* don't call __putname() */
2082 context->names[idx].name_put = 0; 2255 n->name_put = false;
2083 } else {
2084 context->names[idx].name = NULL;
2085 } 2256 }
2086 2257
2087 if (inode) 2258 if (inode)
2088 audit_copy_inode(&context->names[idx], NULL, inode); 2259 audit_copy_inode(n, NULL, inode);
2089 else
2090 context->names[idx].ino = (unsigned long)-1;
2091 } 2260 }
2092} 2261}
2093EXPORT_SYMBOL_GPL(__audit_inode_child); 2262EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2121,19 +2290,28 @@ int auditsc_get_stamp(struct audit_context *ctx,
2121static atomic_t session_id = ATOMIC_INIT(0); 2290static atomic_t session_id = ATOMIC_INIT(0);
2122 2291
2123/** 2292/**
2124 * audit_set_loginuid - set a task's audit_context loginuid 2293 * audit_set_loginuid - set current task's audit_context loginuid
2125 * @task: task whose audit context is being modified
2126 * @loginuid: loginuid value 2294 * @loginuid: loginuid value
2127 * 2295 *
2128 * Returns 0. 2296 * Returns 0.
2129 * 2297 *
2130 * Called (set) from fs/proc/base.c::proc_loginuid_write(). 2298 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2131 */ 2299 */
2132int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 2300int audit_set_loginuid(uid_t loginuid)
2133{ 2301{
2134 unsigned int sessionid = atomic_inc_return(&session_id); 2302 struct task_struct *task = current;
2135 struct audit_context *context = task->audit_context; 2303 struct audit_context *context = task->audit_context;
2304 unsigned int sessionid;
2305
2306#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2307 if (task->loginuid != -1)
2308 return -EPERM;
2309#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2310 if (!capable(CAP_AUDIT_CONTROL))
2311 return -EPERM;
2312#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2136 2313
2314 sessionid = atomic_inc_return(&session_id);
2137 if (context && context->in_syscall) { 2315 if (context && context->in_syscall) {
2138 struct audit_buffer *ab; 2316 struct audit_buffer *ab;
2139 2317
@@ -2160,7 +2338,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2160 * @attr: queue attributes 2338 * @attr: queue attributes
2161 * 2339 *
2162 */ 2340 */
2163void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) 2341void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
2164{ 2342{
2165 struct audit_context *context = current->audit_context; 2343 struct audit_context *context = current->audit_context;
2166 2344
@@ -2260,7 +2438,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2260 * 2438 *
2261 * Called only after audit_ipc_obj(). 2439 * Called only after audit_ipc_obj().
2262 */ 2440 */
2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2441void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
2264{ 2442{
2265 struct audit_context *context = current->audit_context; 2443 struct audit_context *context = current->audit_context;
2266 2444
@@ -2271,14 +2449,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mod
2271 context->ipc.has_perm = 1; 2449 context->ipc.has_perm = 1;
2272} 2450}
2273 2451
2274int audit_bprm(struct linux_binprm *bprm) 2452int __audit_bprm(struct linux_binprm *bprm)
2275{ 2453{
2276 struct audit_aux_data_execve *ax; 2454 struct audit_aux_data_execve *ax;
2277 struct audit_context *context = current->audit_context; 2455 struct audit_context *context = current->audit_context;
2278 2456
2279 if (likely(!audit_enabled || !context || context->dummy))
2280 return 0;
2281
2282 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2457 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2283 if (!ax) 2458 if (!ax)
2284 return -ENOMEM; 2459 return -ENOMEM;
@@ -2299,13 +2474,10 @@ int audit_bprm(struct linux_binprm *bprm)
2299 * @args: args array 2474 * @args: args array
2300 * 2475 *
2301 */ 2476 */
2302void audit_socketcall(int nargs, unsigned long *args) 2477void __audit_socketcall(int nargs, unsigned long *args)
2303{ 2478{
2304 struct audit_context *context = current->audit_context; 2479 struct audit_context *context = current->audit_context;
2305 2480
2306 if (likely(!context || context->dummy))
2307 return;
2308
2309 context->type = AUDIT_SOCKETCALL; 2481 context->type = AUDIT_SOCKETCALL;
2310 context->socketcall.nargs = nargs; 2482 context->socketcall.nargs = nargs;
2311 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); 2483 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2331,13 +2503,10 @@ void __audit_fd_pair(int fd1, int fd2)
2331 * 2503 *
2332 * Returns 0 for success or NULL context or < 0 on error. 2504 * Returns 0 for success or NULL context or < 0 on error.
2333 */ 2505 */
2334int audit_sockaddr(int len, void *a) 2506int __audit_sockaddr(int len, void *a)
2335{ 2507{
2336 struct audit_context *context = current->audit_context; 2508 struct audit_context *context = current->audit_context;
2337 2509
2338 if (likely(!context || context->dummy))
2339 return 0;
2340
2341 if (!context->sockaddr) { 2510 if (!context->sockaddr) {
2342 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); 2511 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2343 if (!p) 2512 if (!p)
@@ -2499,6 +2668,25 @@ void __audit_mmap_fd(int fd, int flags)
2499 context->type = AUDIT_MMAP; 2668 context->type = AUDIT_MMAP;
2500} 2669}
2501 2670
2671static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2672{
2673 uid_t auid, uid;
2674 gid_t gid;
2675 unsigned int sessionid;
2676
2677 auid = audit_get_loginuid(current);
2678 sessionid = audit_get_sessionid(current);
2679 current_uid_gid(&uid, &gid);
2680
2681 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2682 auid, uid, gid, sessionid);
2683 audit_log_task_context(ab);
2684 audit_log_format(ab, " pid=%d comm=", current->pid);
2685 audit_log_untrustedstring(ab, current->comm);
2686 audit_log_format(ab, " reason=");
2687 audit_log_string(ab, reason);
2688 audit_log_format(ab, " sig=%ld", signr);
2689}
2502/** 2690/**
2503 * audit_core_dumps - record information about processes that end abnormally 2691 * audit_core_dumps - record information about processes that end abnormally
2504 * @signr: signal value 2692 * @signr: signal value
@@ -2509,10 +2697,6 @@ void __audit_mmap_fd(int fd, int flags)
2509void audit_core_dumps(long signr) 2697void audit_core_dumps(long signr)
2510{ 2698{
2511 struct audit_buffer *ab; 2699 struct audit_buffer *ab;
2512 u32 sid;
2513 uid_t auid = audit_get_loginuid(current), uid;
2514 gid_t gid;
2515 unsigned int sessionid = audit_get_sessionid(current);
2516 2700
2517 if (!audit_enabled) 2701 if (!audit_enabled)
2518 return; 2702 return;
@@ -2521,24 +2705,17 @@ void audit_core_dumps(long signr)
2521 return; 2705 return;
2522 2706
2523 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2707 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2524 current_uid_gid(&uid, &gid); 2708 audit_log_abend(ab, "memory violation", signr);
2525 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2709 audit_log_end(ab);
2526 auid, uid, gid, sessionid); 2710}
2527 security_task_getsecid(current, &sid);
2528 if (sid) {
2529 char *ctx = NULL;
2530 u32 len;
2531 2711
2532 if (security_secid_to_secctx(sid, &ctx, &len)) 2712void __audit_seccomp(unsigned long syscall)
2533 audit_log_format(ab, " ssid=%u", sid); 2713{
2534 else { 2714 struct audit_buffer *ab;
2535 audit_log_format(ab, " subj=%s", ctx); 2715
2536 security_release_secctx(ctx, len); 2716 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2537 } 2717 audit_log_abend(ab, "seccomp", SIGKILL);
2538 } 2718 audit_log_format(ab, " syscall=%ld", syscall);
2539 audit_log_format(ab, " pid=%d comm=", current->pid);
2540 audit_log_untrustedstring(ab, current->comm);
2541 audit_log_format(ab, " sig=%ld", signr);
2542 audit_log_end(ab); 2719 audit_log_end(ab);
2543} 2720}
2544 2721
diff --git a/kernel/capability.c b/kernel/capability.c
index b463871a4e69..3f1adb6c6470 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -287,74 +287,84 @@ error:
287} 287}
288 288
289/** 289/**
290 * has_capability - Does a task have a capability in init_user_ns 290 * has_ns_capability - Does a task have a capability in a specific user ns
291 * @t: The task in question 291 * @t: The task in question
292 * @ns: target user namespace
292 * @cap: The capability to be tested for 293 * @cap: The capability to be tested for
293 * 294 *
294 * Return true if the specified task has the given superior capability 295 * Return true if the specified task has the given superior capability
295 * currently in effect to the initial user namespace, false if not. 296 * currently in effect to the specified user namespace, false if not.
296 * 297 *
297 * Note that this does not set PF_SUPERPRIV on the task. 298 * Note that this does not set PF_SUPERPRIV on the task.
298 */ 299 */
299bool has_capability(struct task_struct *t, int cap) 300bool has_ns_capability(struct task_struct *t,
301 struct user_namespace *ns, int cap)
300{ 302{
301 int ret = security_real_capable(t, &init_user_ns, cap); 303 int ret;
304
305 rcu_read_lock();
306 ret = security_capable(__task_cred(t), ns, cap);
307 rcu_read_unlock();
302 308
303 return (ret == 0); 309 return (ret == 0);
304} 310}
305 311
306/** 312/**
307 * has_capability - Does a task have a capability in a specific user ns 313 * has_capability - Does a task have a capability in init_user_ns
308 * @t: The task in question 314 * @t: The task in question
309 * @ns: target user namespace
310 * @cap: The capability to be tested for 315 * @cap: The capability to be tested for
311 * 316 *
312 * Return true if the specified task has the given superior capability 317 * Return true if the specified task has the given superior capability
313 * currently in effect to the specified user namespace, false if not. 318 * currently in effect to the initial user namespace, false if not.
314 * 319 *
315 * Note that this does not set PF_SUPERPRIV on the task. 320 * Note that this does not set PF_SUPERPRIV on the task.
316 */ 321 */
317bool has_ns_capability(struct task_struct *t, 322bool has_capability(struct task_struct *t, int cap)
318 struct user_namespace *ns, int cap)
319{ 323{
320 int ret = security_real_capable(t, ns, cap); 324 return has_ns_capability(t, &init_user_ns, cap);
321
322 return (ret == 0);
323} 325}
324 326
325/** 327/**
326 * has_capability_noaudit - Does a task have a capability (unaudited) 328 * has_ns_capability_noaudit - Does a task have a capability (unaudited)
329 * in a specific user ns.
327 * @t: The task in question 330 * @t: The task in question
331 * @ns: target user namespace
328 * @cap: The capability to be tested for 332 * @cap: The capability to be tested for
329 * 333 *
330 * Return true if the specified task has the given superior capability 334 * Return true if the specified task has the given superior capability
331 * currently in effect to init_user_ns, false if not. Don't write an 335 * currently in effect to the specified user namespace, false if not.
332 * audit message for the check. 336 * Do not write an audit message for the check.
333 * 337 *
334 * Note that this does not set PF_SUPERPRIV on the task. 338 * Note that this does not set PF_SUPERPRIV on the task.
335 */ 339 */
336bool has_capability_noaudit(struct task_struct *t, int cap) 340bool has_ns_capability_noaudit(struct task_struct *t,
341 struct user_namespace *ns, int cap)
337{ 342{
338 int ret = security_real_capable_noaudit(t, &init_user_ns, cap); 343 int ret;
344
345 rcu_read_lock();
346 ret = security_capable_noaudit(__task_cred(t), ns, cap);
347 rcu_read_unlock();
339 348
340 return (ret == 0); 349 return (ret == 0);
341} 350}
342 351
343/** 352/**
344 * capable - Determine if the current task has a superior capability in effect 353 * has_capability_noaudit - Does a task have a capability (unaudited) in the
354 * initial user ns
355 * @t: The task in question
345 * @cap: The capability to be tested for 356 * @cap: The capability to be tested for
346 * 357 *
347 * Return true if the current task has the given superior capability currently 358 * Return true if the specified task has the given superior capability
348 * available for use, false if not. 359 * currently in effect to init_user_ns, false if not. Don't write an
360 * audit message for the check.
349 * 361 *
350 * This sets PF_SUPERPRIV on the task if the capability is available on the 362 * Note that this does not set PF_SUPERPRIV on the task.
351 * assumption that it's about to be used.
352 */ 363 */
353bool capable(int cap) 364bool has_capability_noaudit(struct task_struct *t, int cap)
354{ 365{
355 return ns_capable(&init_user_ns, cap); 366 return has_ns_capability_noaudit(t, &init_user_ns, cap);
356} 367}
357EXPORT_SYMBOL(capable);
358 368
359/** 369/**
360 * ns_capable - Determine if the current task has a superior capability in effect 370 * ns_capable - Determine if the current task has a superior capability in effect
@@ -374,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
374 BUG(); 384 BUG();
375 } 385 }
376 386
377 if (security_capable(ns, current_cred(), cap) == 0) { 387 if (security_capable(current_cred(), ns, cap) == 0) {
378 current->flags |= PF_SUPERPRIV; 388 current->flags |= PF_SUPERPRIV;
379 return true; 389 return true;
380 } 390 }
@@ -383,18 +393,20 @@ bool ns_capable(struct user_namespace *ns, int cap)
383EXPORT_SYMBOL(ns_capable); 393EXPORT_SYMBOL(ns_capable);
384 394
385/** 395/**
386 * task_ns_capable - Determine whether current task has a superior 396 * capable - Determine if the current task has a superior capability in effect
387 * capability targeted at a specific task's user namespace. 397 * @cap: The capability to be tested for
388 * @t: The task whose user namespace is targeted. 398 *
389 * @cap: The capability in question. 399 * Return true if the current task has the given superior capability currently
400 * available for use, false if not.
390 * 401 *
391 * Return true if it does, false otherwise. 402 * This sets PF_SUPERPRIV on the task if the capability is available on the
403 * assumption that it's about to be used.
392 */ 404 */
393bool task_ns_capable(struct task_struct *t, int cap) 405bool capable(int cap)
394{ 406{
395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap); 407 return ns_capable(&init_user_ns, cap);
396} 408}
397EXPORT_SYMBOL(task_ns_capable); 409EXPORT_SYMBOL(capable);
398 410
399/** 411/**
400 * nsown_capable - Check superior capability to one's own user_ns 412 * nsown_capable - Check superior capability to one's own user_ns
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a184470cf9b5..a5d3b5325f77 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it.
69 *
70 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
71 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
72 * release_agent_path and so on. Modifying requires both cgroup_mutex and
73 * cgroup_root_mutex. Readers can acquire either of the two. This is to
74 * break the following locking order cycle.
75 *
76 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
77 * B. namespace_sem -> cgroup_mutex
78 *
79 * B happens only through cgroup_show_options() and using cgroup_root_mutex
80 * breaks it.
81 */
66static DEFINE_MUTEX(cgroup_mutex); 82static DEFINE_MUTEX(cgroup_mutex);
83static DEFINE_MUTEX(cgroup_root_mutex);
67 84
68/* 85/*
69 * Generate an array of cgroup subsystem pointers. At boot time, this is 86 * Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
760 * -> cgroup_mkdir. 777 * -> cgroup_mkdir.
761 */ 778 */
762 779
763static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 780static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
764static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 781static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
765static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 782static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
766static int cgroup_populate_dir(struct cgroup *cgrp); 783static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
775static int alloc_css_id(struct cgroup_subsys *ss, 792static int alloc_css_id(struct cgroup_subsys *ss,
776 struct cgroup *parent, struct cgroup *child); 793 struct cgroup *parent, struct cgroup *child);
777 794
778static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 795static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
779{ 796{
780 struct inode *inode = new_inode(sb); 797 struct inode *inode = new_inode(sb);
781 798
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
921 * 938 *
922 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 939 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
923 */ 940 */
924DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 941static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
925 942
926static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 943static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
927{ 944{
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
953 int i; 970 int i;
954 971
955 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 972 BUG_ON(!mutex_is_locked(&cgroup_mutex));
973 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
956 974
957 removed_bits = root->actual_subsys_bits & ~final_bits; 975 removed_bits = root->actual_subsys_bits & ~final_bits;
958 added_bits = final_bits & ~root->actual_subsys_bits; 976 added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1038 return 0; 1056 return 0;
1039} 1057}
1040 1058
1041static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 1059static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1042{ 1060{
1043 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 1061 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1044 struct cgroup_subsys *ss; 1062 struct cgroup_subsys *ss;
1045 1063
1046 mutex_lock(&cgroup_mutex); 1064 mutex_lock(&cgroup_root_mutex);
1047 for_each_subsys(root, ss) 1065 for_each_subsys(root, ss)
1048 seq_printf(seq, ",%s", ss->name); 1066 seq_printf(seq, ",%s", ss->name);
1049 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1067 if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1054 seq_puts(seq, ",clone_children"); 1072 seq_puts(seq, ",clone_children");
1055 if (strlen(root->name)) 1073 if (strlen(root->name))
1056 seq_printf(seq, ",name=%s", root->name); 1074 seq_printf(seq, ",name=%s", root->name);
1057 mutex_unlock(&cgroup_mutex); 1075 mutex_unlock(&cgroup_root_mutex);
1058 return 0; 1076 return 0;
1059} 1077}
1060 1078
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1175 1193
1176 /* 1194 /*
1177 * If the 'all' option was specified select all the subsystems, 1195 * If the 'all' option was specified select all the subsystems,
1178 * otherwise 'all, 'none' and a subsystem name options were not 1196 * otherwise if 'none', 'name=' and a subsystem name options
1179 * specified, let's default to 'all' 1197 * were not specified, let's default to 'all'
1180 */ 1198 */
1181 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1199 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1182 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1200 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1183 struct cgroup_subsys *ss = subsys[i]; 1201 struct cgroup_subsys *ss = subsys[i];
1184 if (ss == NULL) 1202 if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1269 1287
1270 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1288 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1271 mutex_lock(&cgroup_mutex); 1289 mutex_lock(&cgroup_mutex);
1290 mutex_lock(&cgroup_root_mutex);
1272 1291
1273 /* See what subsystems are wanted */ 1292 /* See what subsystems are wanted */
1274 ret = parse_cgroupfs_options(data, &opts); 1293 ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1297 out_unlock: 1316 out_unlock:
1298 kfree(opts.release_agent); 1317 kfree(opts.release_agent);
1299 kfree(opts.name); 1318 kfree(opts.name);
1319 mutex_unlock(&cgroup_root_mutex);
1300 mutex_unlock(&cgroup_mutex); 1320 mutex_unlock(&cgroup_mutex);
1301 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1321 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1302 return ret; 1322 return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1481 int ret = 0; 1501 int ret = 0;
1482 struct super_block *sb; 1502 struct super_block *sb;
1483 struct cgroupfs_root *new_root; 1503 struct cgroupfs_root *new_root;
1504 struct inode *inode;
1484 1505
1485 /* First find the desired set of subsystems */ 1506 /* First find the desired set of subsystems */
1486 mutex_lock(&cgroup_mutex); 1507 mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 /* We used the new root structure, so this is a new hierarchy */ 1535 /* We used the new root structure, so this is a new hierarchy */
1515 struct list_head tmp_cg_links; 1536 struct list_head tmp_cg_links;
1516 struct cgroup *root_cgrp = &root->top_cgroup; 1537 struct cgroup *root_cgrp = &root->top_cgroup;
1517 struct inode *inode;
1518 struct cgroupfs_root *existing_root; 1538 struct cgroupfs_root *existing_root;
1519 const struct cred *cred; 1539 const struct cred *cred;
1520 int i; 1540 int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1528 1548
1529 mutex_lock(&inode->i_mutex); 1549 mutex_lock(&inode->i_mutex);
1530 mutex_lock(&cgroup_mutex); 1550 mutex_lock(&cgroup_mutex);
1551 mutex_lock(&cgroup_root_mutex);
1531 1552
1532 if (strlen(root->name)) { 1553 /* Check for name clashes with existing mounts */
1533 /* Check for name clashes with existing mounts */ 1554 ret = -EBUSY;
1534 for_each_active_root(existing_root) { 1555 if (strlen(root->name))
1535 if (!strcmp(existing_root->name, root->name)) { 1556 for_each_active_root(existing_root)
1536 ret = -EBUSY; 1557 if (!strcmp(existing_root->name, root->name))
1537 mutex_unlock(&cgroup_mutex); 1558 goto unlock_drop;
1538 mutex_unlock(&inode->i_mutex);
1539 goto drop_new_super;
1540 }
1541 }
1542 }
1543 1559
1544 /* 1560 /*
1545 * We're accessing css_set_count without locking 1561 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1549 * have some link structures left over 1565 * have some link structures left over
1550 */ 1566 */
1551 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1567 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1552 if (ret) { 1568 if (ret)
1553 mutex_unlock(&cgroup_mutex); 1569 goto unlock_drop;
1554 mutex_unlock(&inode->i_mutex);
1555 goto drop_new_super;
1556 }
1557 1570
1558 ret = rebind_subsystems(root, root->subsys_bits); 1571 ret = rebind_subsystems(root, root->subsys_bits);
1559 if (ret == -EBUSY) { 1572 if (ret == -EBUSY) {
1560 mutex_unlock(&cgroup_mutex);
1561 mutex_unlock(&inode->i_mutex);
1562 free_cg_links(&tmp_cg_links); 1573 free_cg_links(&tmp_cg_links);
1563 goto drop_new_super; 1574 goto unlock_drop;
1564 } 1575 }
1565 /* 1576 /*
1566 * There must be no failure case after here, since rebinding 1577 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1599 cred = override_creds(&init_cred); 1610 cred = override_creds(&init_cred);
1600 cgroup_populate_dir(root_cgrp); 1611 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred); 1612 revert_creds(cred);
1613 mutex_unlock(&cgroup_root_mutex);
1602 mutex_unlock(&cgroup_mutex); 1614 mutex_unlock(&cgroup_mutex);
1603 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1604 } else { 1616 } else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1615 kfree(opts.name); 1627 kfree(opts.name);
1616 return dget(sb->s_root); 1628 return dget(sb->s_root);
1617 1629
1630 unlock_drop:
1631 mutex_unlock(&cgroup_root_mutex);
1632 mutex_unlock(&cgroup_mutex);
1633 mutex_unlock(&inode->i_mutex);
1618 drop_new_super: 1634 drop_new_super:
1619 deactivate_locked_super(sb); 1635 deactivate_locked_super(sb);
1620 drop_modules: 1636 drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1639 BUG_ON(!list_empty(&cgrp->sibling)); 1655 BUG_ON(!list_empty(&cgrp->sibling));
1640 1656
1641 mutex_lock(&cgroup_mutex); 1657 mutex_lock(&cgroup_mutex);
1658 mutex_lock(&cgroup_root_mutex);
1642 1659
1643 /* Rebind all subsystems back to the default hierarchy */ 1660 /* Rebind all subsystems back to the default hierarchy */
1644 ret = rebind_subsystems(root, 0); 1661 ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1664 root_count--; 1681 root_count--;
1665 } 1682 }
1666 1683
1684 mutex_unlock(&cgroup_root_mutex);
1667 mutex_unlock(&cgroup_mutex); 1685 mutex_unlock(&cgroup_mutex);
1668 1686
1669 kill_litter_super(sb); 1687 kill_litter_super(sb);
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1740EXPORT_SYMBOL_GPL(cgroup_path); 1758EXPORT_SYMBOL_GPL(cgroup_path);
1741 1759
1742/* 1760/*
1761 * Control Group taskset
1762 */
1763struct task_and_cgroup {
1764 struct task_struct *task;
1765 struct cgroup *cgrp;
1766};
1767
1768struct cgroup_taskset {
1769 struct task_and_cgroup single;
1770 struct flex_array *tc_array;
1771 int tc_array_len;
1772 int idx;
1773 struct cgroup *cur_cgrp;
1774};
1775
1776/**
1777 * cgroup_taskset_first - reset taskset and return the first task
1778 * @tset: taskset of interest
1779 *
1780 * @tset iteration is initialized and the first task is returned.
1781 */
1782struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1783{
1784 if (tset->tc_array) {
1785 tset->idx = 0;
1786 return cgroup_taskset_next(tset);
1787 } else {
1788 tset->cur_cgrp = tset->single.cgrp;
1789 return tset->single.task;
1790 }
1791}
1792EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1793
1794/**
1795 * cgroup_taskset_next - iterate to the next task in taskset
1796 * @tset: taskset of interest
1797 *
1798 * Return the next task in @tset. Iteration must have been initialized
1799 * with cgroup_taskset_first().
1800 */
1801struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1802{
1803 struct task_and_cgroup *tc;
1804
1805 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1806 return NULL;
1807
1808 tc = flex_array_get(tset->tc_array, tset->idx++);
1809 tset->cur_cgrp = tc->cgrp;
1810 return tc->task;
1811}
1812EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1813
1814/**
1815 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1816 * @tset: taskset of interest
1817 *
1818 * Return the cgroup for the current (last returned) task of @tset. This
1819 * function must be preceded by either cgroup_taskset_first() or
1820 * cgroup_taskset_next().
1821 */
1822struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1823{
1824 return tset->cur_cgrp;
1825}
1826EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1827
1828/**
1829 * cgroup_taskset_size - return the number of tasks in taskset
1830 * @tset: taskset of interest
1831 */
1832int cgroup_taskset_size(struct cgroup_taskset *tset)
1833{
1834 return tset->tc_array ? tset->tc_array_len : 1;
1835}
1836EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1837
1838
1839/*
1743 * cgroup_task_migrate - move a task from one cgroup to another. 1840 * cgroup_task_migrate - move a task from one cgroup to another.
1744 * 1841 *
1745 * 'guarantee' is set if the caller promises that a new css_set for the task 1842 * 'guarantee' is set if the caller promises that a new css_set for the task
1746 * will already exist. If not set, this function might sleep, and can fail with 1843 * will already exist. If not set, this function might sleep, and can fail with
1747 * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1748 */ 1845 */
1749static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1750 struct task_struct *tsk, bool guarantee) 1847 struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1753 struct css_set *newcg; 1850 struct css_set *newcg;
1754 1851
1755 /* 1852 /*
1756 * get old css_set. we need to take task_lock and refcount it, because 1853 * We are synchronized through threadgroup_lock() against PF_EXITING
1757 * an exiting task can change its css_set to init_css_set and drop its 1854 * setting such that we can't race against cgroup_exit() changing the
1758 * old one without taking cgroup_mutex. 1855 * css_set to init_css_set and dropping the old one.
1759 */ 1856 */
1760 task_lock(tsk); 1857 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1761 oldcg = tsk->cgroups; 1858 oldcg = tsk->cgroups;
1762 get_css_set(oldcg);
1763 task_unlock(tsk);
1764 1859
1765 /* locate or allocate a new css_set for this task. */ 1860 /* locate or allocate a new css_set for this task. */
1766 if (guarantee) { 1861 if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1775 might_sleep(); 1870 might_sleep();
1776 /* find_css_set will give us newcg already referenced. */ 1871 /* find_css_set will give us newcg already referenced. */
1777 newcg = find_css_set(oldcg, cgrp); 1872 newcg = find_css_set(oldcg, cgrp);
1778 if (!newcg) { 1873 if (!newcg)
1779 put_css_set(oldcg);
1780 return -ENOMEM; 1874 return -ENOMEM;
1781 }
1782 } 1875 }
1783 put_css_set(oldcg);
1784 1876
1785 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1786 task_lock(tsk); 1877 task_lock(tsk);
1787 if (tsk->flags & PF_EXITING) {
1788 task_unlock(tsk);
1789 put_css_set(newcg);
1790 return -ESRCH;
1791 }
1792 rcu_assign_pointer(tsk->cgroups, newcg); 1878 rcu_assign_pointer(tsk->cgroups, newcg);
1793 task_unlock(tsk); 1879 task_unlock(tsk);
1794 1880
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1814 * @cgrp: the cgroup the task is attaching to 1900 * @cgrp: the cgroup the task is attaching to
1815 * @tsk: the task to be attached 1901 * @tsk: the task to be attached
1816 * 1902 *
1817 * Call holding cgroup_mutex. May take task_lock of 1903 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1818 * the task 'tsk' during call. 1904 * @tsk during call.
1819 */ 1905 */
1820int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1906int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1821{ 1907{
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1823 struct cgroup_subsys *ss, *failed_ss = NULL; 1909 struct cgroup_subsys *ss, *failed_ss = NULL;
1824 struct cgroup *oldcgrp; 1910 struct cgroup *oldcgrp;
1825 struct cgroupfs_root *root = cgrp->root; 1911 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { };
1913
1914 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING)
1916 return -ESRCH;
1826 1917
1827 /* Nothing to do if the task is already in that cgroup */ 1918 /* Nothing to do if the task is already in that cgroup */
1828 oldcgrp = task_cgroup_from_root(tsk, root); 1919 oldcgrp = task_cgroup_from_root(tsk, root);
1829 if (cgrp == oldcgrp) 1920 if (cgrp == oldcgrp)
1830 return 0; 1921 return 0;
1831 1922
1923 tset.single.task = tsk;
1924 tset.single.cgrp = oldcgrp;
1925
1832 for_each_subsys(root, ss) { 1926 for_each_subsys(root, ss) {
1833 if (ss->can_attach) { 1927 if (ss->can_attach) {
1834 retval = ss->can_attach(ss, cgrp, tsk); 1928 retval = ss->can_attach(ss, cgrp, &tset);
1835 if (retval) { 1929 if (retval) {
1836 /* 1930 /*
1837 * Remember on which subsystem the can_attach() 1931 * Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1843 goto out; 1937 goto out;
1844 } 1938 }
1845 } 1939 }
1846 if (ss->can_attach_task) {
1847 retval = ss->can_attach_task(cgrp, tsk);
1848 if (retval) {
1849 failed_ss = ss;
1850 goto out;
1851 }
1852 }
1853 } 1940 }
1854 1941
1855 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1857 goto out; 1944 goto out;
1858 1945
1859 for_each_subsys(root, ss) { 1946 for_each_subsys(root, ss) {
1860 if (ss->pre_attach)
1861 ss->pre_attach(cgrp);
1862 if (ss->attach_task)
1863 ss->attach_task(cgrp, tsk);
1864 if (ss->attach) 1947 if (ss->attach)
1865 ss->attach(ss, cgrp, oldcgrp, tsk); 1948 ss->attach(ss, cgrp, &tset);
1866 } 1949 }
1867 1950
1868 synchronize_rcu(); 1951 synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
1884 */ 1967 */
1885 break; 1968 break;
1886 if (ss->cancel_attach) 1969 if (ss->cancel_attach)
1887 ss->cancel_attach(ss, cgrp, tsk); 1970 ss->cancel_attach(ss, cgrp, &tset);
1888 } 1971 }
1889 } 1972 }
1890 return retval; 1973 return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
1935 2018
1936 read_lock(&css_set_lock); 2019 read_lock(&css_set_lock);
1937 newcg = find_existing_css_set(cg, cgrp, template); 2020 newcg = find_existing_css_set(cg, cgrp, template);
1938 if (newcg)
1939 get_css_set(newcg);
1940 read_unlock(&css_set_lock); 2021 read_unlock(&css_set_lock);
1941 2022
1942 /* doesn't exist at all? */ 2023 /* doesn't exist at all? */
1943 if (!newcg) 2024 if (!newcg)
1944 return false; 2025 return false;
1945 /* see if it's already in the list */ 2026 /* see if it's already in the list */
1946 list_for_each_entry(cg_entry, newcg_list, links) { 2027 list_for_each_entry(cg_entry, newcg_list, links)
1947 if (cg_entry->cg == newcg) { 2028 if (cg_entry->cg == newcg)
1948 put_css_set(newcg);
1949 return true; 2029 return true;
1950 }
1951 }
1952 2030
1953 /* not found */ 2031 /* not found */
1954 put_css_set(newcg);
1955 return false; 2032 return false;
1956} 2033}
1957 2034
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1985 * @cgrp: the cgroup to attach to 2062 * @cgrp: the cgroup to attach to
1986 * @leader: the threadgroup leader task_struct of the group to be attached 2063 * @leader: the threadgroup leader task_struct of the group to be attached
1987 * 2064 *
1988 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 2065 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
1989 * take task_lock of each thread in leader's threadgroup individually in turn. 2066 * task_lock of each thread in leader's threadgroup individually in turn.
1990 */ 2067 */
1991int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2068static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1992{ 2069{
1993 int retval, i, group_size; 2070 int retval, i, group_size;
1994 struct cgroup_subsys *ss, *failed_ss = NULL; 2071 struct cgroup_subsys *ss, *failed_ss = NULL;
1995 bool cancel_failed_ss = false;
1996 /* guaranteed to be initialized later, but the compiler needs this */ 2072 /* guaranteed to be initialized later, but the compiler needs this */
1997 struct cgroup *oldcgrp = NULL;
1998 struct css_set *oldcg; 2073 struct css_set *oldcg;
1999 struct cgroupfs_root *root = cgrp->root; 2074 struct cgroupfs_root *root = cgrp->root;
2000 /* threadgroup list cursor and array */ 2075 /* threadgroup list cursor and array */
2001 struct task_struct *tsk; 2076 struct task_struct *tsk;
2077 struct task_and_cgroup *tc;
2002 struct flex_array *group; 2078 struct flex_array *group;
2079 struct cgroup_taskset tset = { };
2003 /* 2080 /*
2004 * we need to make sure we have css_sets for all the tasks we're 2081 * we need to make sure we have css_sets for all the tasks we're
2005 * going to move -before- we actually start moving them, so that in 2082 * going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2012 * step 0: in order to do expensive, possibly blocking operations for 2089 * step 0: in order to do expensive, possibly blocking operations for
2013 * every thread, we cannot iterate the thread group list, since it needs 2090 * every thread, we cannot iterate the thread group list, since it needs
2014 * rcu or tasklist locked. instead, build an array of all threads in the 2091 * rcu or tasklist locked. instead, build an array of all threads in the
2015 * group - threadgroup_fork_lock prevents new threads from appearing, 2092 * group - group_rwsem prevents new threads from appearing, and if
2016 * and if threads exit, this will just be an over-estimate. 2093 * threads exit, this will just be an over-estimate.
2017 */ 2094 */
2018 group_size = get_nr_threads(leader); 2095 group_size = get_nr_threads(leader);
2019 /* flex_array supports very large thread-groups better than kmalloc. */ 2096 /* flex_array supports very large thread-groups better than kmalloc. */
2020 group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2097 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2021 GFP_KERNEL);
2022 if (!group) 2098 if (!group)
2023 return -ENOMEM; 2099 return -ENOMEM;
2024 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2100 /* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2040 retval = -EAGAIN; 2116 retval = -EAGAIN;
2041 goto out_free_group_list; 2117 goto out_free_group_list;
2042 } 2118 }
2043 /* take a reference on each task in the group to go in the array. */ 2119
2044 tsk = leader; 2120 tsk = leader;
2045 i = 0; 2121 i = 0;
2046 do { 2122 do {
2123 struct task_and_cgroup ent;
2124
2125 /* @tsk either already exited or can't exit until the end */
2126 if (tsk->flags & PF_EXITING)
2127 continue;
2128
2047 /* as per above, nr_threads may decrease, but not increase. */ 2129 /* as per above, nr_threads may decrease, but not increase. */
2048 BUG_ON(i >= group_size); 2130 BUG_ON(i >= group_size);
2049 get_task_struct(tsk);
2050 /* 2131 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc 2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations. 2133 * earlier, but it's good form to communicate our expectations.
2053 */ 2134 */
2054 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2135 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp)
2139 continue;
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2055 BUG_ON(retval != 0); 2141 BUG_ON(retval != 0);
2056 i++; 2142 i++;
2057 } while_each_thread(leader, tsk); 2143 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */ 2144 /* remember the number of threads in the array for later. */
2059 group_size = i; 2145 group_size = i;
2146 tset.tc_array = group;
2147 tset.tc_array_len = group_size;
2060 read_unlock(&tasklist_lock); 2148 read_unlock(&tasklist_lock);
2061 2149
2150 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0;
2152 if (!group_size)
2153 goto out_free_group_list;
2154
2062 /* 2155 /*
2063 * step 1: check that we can legitimately attach to the cgroup. 2156 * step 1: check that we can legitimately attach to the cgroup.
2064 */ 2157 */
2065 for_each_subsys(root, ss) { 2158 for_each_subsys(root, ss) {
2066 if (ss->can_attach) { 2159 if (ss->can_attach) {
2067 retval = ss->can_attach(ss, cgrp, leader); 2160 retval = ss->can_attach(ss, cgrp, &tset);
2068 if (retval) { 2161 if (retval) {
2069 failed_ss = ss; 2162 failed_ss = ss;
2070 goto out_cancel_attach; 2163 goto out_cancel_attach;
2071 } 2164 }
2072 } 2165 }
2073 /* a callback to be run on every thread in the threadgroup. */
2074 if (ss->can_attach_task) {
2075 /* run on each task in the threadgroup. */
2076 for (i = 0; i < group_size; i++) {
2077 tsk = flex_array_get_ptr(group, i);
2078 retval = ss->can_attach_task(cgrp, tsk);
2079 if (retval) {
2080 failed_ss = ss;
2081 cancel_failed_ss = true;
2082 goto out_cancel_attach;
2083 }
2084 }
2085 }
2086 } 2166 }
2087 2167
2088 /* 2168 /*
@@ -2091,67 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2091 */ 2171 */
2092 INIT_LIST_HEAD(&newcg_list); 2172 INIT_LIST_HEAD(&newcg_list);
2093 for (i = 0; i < group_size; i++) { 2173 for (i = 0; i < group_size; i++) {
2094 tsk = flex_array_get_ptr(group, i); 2174 tc = flex_array_get(group, i);
2095 /* nothing to do if this task is already in the cgroup */ 2175 oldcg = tc->task->cgroups;
2096 oldcgrp = task_cgroup_from_root(tsk, root); 2176
2097 if (cgrp == oldcgrp) 2177 /* if we don't already have it in the list get a new one */
2098 continue; 2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg,
2099 /* get old css_set pointer */ 2179 &newcg_list)) {
2100 task_lock(tsk);
2101 oldcg = tsk->cgroups;
2102 get_css_set(oldcg);
2103 task_unlock(tsk);
2104 /* see if the new one for us is already in the list? */
2105 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2106 /* was already there, nothing to do. */
2107 put_css_set(oldcg);
2108 } else {
2109 /* we don't already have it. get new one. */
2110 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2111 put_css_set(oldcg);
2112 if (retval) 2181 if (retval)
2113 goto out_list_teardown; 2182 goto out_list_teardown;
2114 } 2183 }
2115 } 2184 }
2116 2185
2117 /* 2186 /*
2118 * step 3: now that we're guaranteed success wrt the css_sets, proceed 2187 * step 3: now that we're guaranteed success wrt the css_sets,
2119 * to move all tasks to the new cgroup, calling ss->attach_task for each 2188 * proceed to move all tasks to the new cgroup. There are no
2120 * one along the way. there are no failure cases after here, so this is 2189 * failure cases after here, so this is the commit point.
2121 * the commit point.
2122 */ 2190 */
2123 for_each_subsys(root, ss) {
2124 if (ss->pre_attach)
2125 ss->pre_attach(cgrp);
2126 }
2127 for (i = 0; i < group_size; i++) { 2191 for (i = 0; i < group_size; i++) {
2128 tsk = flex_array_get_ptr(group, i); 2192 tc = flex_array_get(group, i);
2129 /* leave current thread as it is if it's already there */ 2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
2130 oldcgrp = task_cgroup_from_root(tsk, root); 2194 BUG_ON(retval);
2131 if (cgrp == oldcgrp)
2132 continue;
2133 /* if the thread is PF_EXITING, it can just get skipped. */
2134 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2135 if (retval == 0) {
2136 /* attach each task to each subsystem */
2137 for_each_subsys(root, ss) {
2138 if (ss->attach_task)
2139 ss->attach_task(cgrp, tsk);
2140 }
2141 } else {
2142 BUG_ON(retval != -ESRCH);
2143 }
2144 } 2195 }
2145 /* nothing is sensitive to fork() after this point. */ 2196 /* nothing is sensitive to fork() after this point. */
2146 2197
2147 /* 2198 /*
2148 * step 4: do expensive, non-thread-specific subsystem callbacks. 2199 * step 4: do subsystem attach callbacks.
2149 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2150 * being moved, this call will need to be reworked to communicate that.
2151 */ 2200 */
2152 for_each_subsys(root, ss) { 2201 for_each_subsys(root, ss) {
2153 if (ss->attach) 2202 if (ss->attach)
2154 ss->attach(ss, cgrp, oldcgrp, leader); 2203 ss->attach(ss, cgrp, &tset);
2155 } 2204 }
2156 2205
2157 /* 2206 /*
@@ -2171,20 +2220,12 @@ out_cancel_attach:
2171 /* same deal as in cgroup_attach_task */ 2220 /* same deal as in cgroup_attach_task */
2172 if (retval) { 2221 if (retval) {
2173 for_each_subsys(root, ss) { 2222 for_each_subsys(root, ss) {
2174 if (ss == failed_ss) { 2223 if (ss == failed_ss)
2175 if (cancel_failed_ss && ss->cancel_attach)
2176 ss->cancel_attach(ss, cgrp, leader);
2177 break; 2224 break;
2178 }
2179 if (ss->cancel_attach) 2225 if (ss->cancel_attach)
2180 ss->cancel_attach(ss, cgrp, leader); 2226 ss->cancel_attach(ss, cgrp, &tset);
2181 } 2227 }
2182 } 2228 }
2183 /* clean up the array of referenced threads in the group. */
2184 for (i = 0; i < group_size; i++) {
2185 tsk = flex_array_get_ptr(group, i);
2186 put_task_struct(tsk);
2187 }
2188out_free_group_list: 2229out_free_group_list:
2189 flex_array_free(group); 2230 flex_array_free(group);
2190 return retval; 2231 return retval;
@@ -2192,8 +2233,8 @@ out_free_group_list:
2192 2233
2193/* 2234/*
2194 * Find the task_struct of the task to attach by vpid and pass it along to the 2235 * Find the task_struct of the task to attach by vpid and pass it along to the
2195 * function to attach either it or all tasks in its threadgroup. Will take 2236 * function to attach either it or all tasks in its threadgroup. Will lock
2196 * cgroup_mutex; may take task_lock of task. 2237 * cgroup_mutex and threadgroup; may take task_lock of task.
2197 */ 2238 */
2198static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2239static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2199{ 2240{
@@ -2220,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2220 * detect it later. 2261 * detect it later.
2221 */ 2262 */
2222 tsk = tsk->group_leader; 2263 tsk = tsk->group_leader;
2223 } else if (tsk->flags & PF_EXITING) {
2224 /* optimization for the single-task-only case */
2225 rcu_read_unlock();
2226 cgroup_unlock();
2227 return -ESRCH;
2228 } 2264 }
2229
2230 /* 2265 /*
2231 * even if we're attaching all tasks in the thread group, we 2266 * even if we're attaching all tasks in the thread group, we
2232 * only need to check permissions on one of them. 2267 * only need to check permissions on one of them.
@@ -2249,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2249 get_task_struct(tsk); 2284 get_task_struct(tsk);
2250 } 2285 }
2251 2286
2252 if (threadgroup) { 2287 threadgroup_lock(tsk);
2253 threadgroup_fork_write_lock(tsk); 2288
2289 if (threadgroup)
2254 ret = cgroup_attach_proc(cgrp, tsk); 2290 ret = cgroup_attach_proc(cgrp, tsk);
2255 threadgroup_fork_write_unlock(tsk); 2291 else
2256 } else {
2257 ret = cgroup_attach_task(cgrp, tsk); 2292 ret = cgroup_attach_task(cgrp, tsk);
2258 } 2293
2294 threadgroup_unlock(tsk);
2295
2259 put_task_struct(tsk); 2296 put_task_struct(tsk);
2260 cgroup_unlock(); 2297 cgroup_unlock();
2261 return ret; 2298 return ret;
@@ -2306,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2306 return -EINVAL; 2343 return -EINVAL;
2307 if (!cgroup_lock_live_group(cgrp)) 2344 if (!cgroup_lock_live_group(cgrp))
2308 return -ENODEV; 2345 return -ENODEV;
2346 mutex_lock(&cgroup_root_mutex);
2309 strcpy(cgrp->root->release_agent_path, buffer); 2347 strcpy(cgrp->root->release_agent_path, buffer);
2348 mutex_unlock(&cgroup_root_mutex);
2310 cgroup_unlock(); 2349 cgroup_unlock();
2311 return 0; 2350 return 0;
2312} 2351}
@@ -2585,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file)
2585 return __d_cft(file->f_dentry); 2624 return __d_cft(file->f_dentry);
2586} 2625}
2587 2626
2588static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2627static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2589 struct super_block *sb) 2628 struct super_block *sb)
2590{ 2629{
2591 struct inode *inode; 2630 struct inode *inode;
@@ -2626,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2626 * @mode: mode to set on new directory. 2665 * @mode: mode to set on new directory.
2627 */ 2666 */
2628static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 2667static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2629 mode_t mode) 2668 umode_t mode)
2630{ 2669{
2631 struct dentry *parent; 2670 struct dentry *parent;
2632 int error = 0; 2671 int error = 0;
@@ -2653,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2653 * returns S_IRUGO if it has only a read handler 2692 * returns S_IRUGO if it has only a read handler
2654 * returns S_IWUSR if it has only a write hander 2693 * returns S_IWUSR if it has only a write hander
2655 */ 2694 */
2656static mode_t cgroup_file_mode(const struct cftype *cft) 2695static umode_t cgroup_file_mode(const struct cftype *cft)
2657{ 2696{
2658 mode_t mode = 0; 2697 umode_t mode = 0;
2659 2698
2660 if (cft->mode) 2699 if (cft->mode)
2661 return cft->mode; 2700 return cft->mode;
@@ -2678,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2678 struct dentry *dir = cgrp->dentry; 2717 struct dentry *dir = cgrp->dentry;
2679 struct dentry *dentry; 2718 struct dentry *dentry;
2680 int error; 2719 int error;
2681 mode_t mode; 2720 umode_t mode;
2682 2721
2683 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2684 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2723 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -2789,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
2789} 2828}
2790 2829
2791void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2830void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2831 __acquires(css_set_lock)
2792{ 2832{
2793 /* 2833 /*
2794 * The first time anyone tries to iterate across a cgroup, 2834 * The first time anyone tries to iterate across a cgroup,
@@ -2828,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2828} 2868}
2829 2869
2830void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2870void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2871 __releases(css_set_lock)
2831{ 2872{
2832 read_unlock(&css_set_lock); 2873 read_unlock(&css_set_lock);
2833} 2874}
@@ -3752,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3752 * Must be called with the mutex on the parent inode held 3793 * Must be called with the mutex on the parent inode held
3753 */ 3794 */
3754static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3795static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3755 mode_t mode) 3796 umode_t mode)
3756{ 3797{
3757 struct cgroup *cgrp; 3798 struct cgroup *cgrp;
3758 struct cgroupfs_root *root = parent->root; 3799 struct cgroupfs_root *root = parent->root;
@@ -3846,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3846 return err; 3887 return err;
3847} 3888}
3848 3889
3849static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 3890static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3850{ 3891{
3851 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3892 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
3852 3893
@@ -4491,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
4491 * 4532 *
4492 * A pointer to the shared css_set was automatically copied in 4533 * A pointer to the shared css_set was automatically copied in
4493 * fork.c by dup_task_struct(). However, we ignore that copy, since 4534 * fork.c by dup_task_struct(). However, we ignore that copy, since
4494 * it was not made under the protection of RCU or cgroup_mutex, so 4535 * it was not made under the protection of RCU, cgroup_mutex or
4495 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4536 * threadgroup_change_begin(), so it might no longer be a valid
4496 * have already changed current->cgroups, allowing the previously 4537 * cgroup pointer. cgroup_attach_task() might have already changed
4497 * referenced cgroup group to be removed and freed. 4538 * current->cgroups, allowing the previously referenced cgroup
4539 * group to be removed and freed.
4540 *
4541 * Outside the pointer validity we also need to process the css_set
4542 * inheritance between threadgoup_change_begin() and
4543 * threadgoup_change_end(), this way there is no leak in any process
4544 * wide migration performed by cgroup_attach_proc() that could otherwise
4545 * miss a thread because it is too early or too late in the fork stage.
4498 * 4546 *
4499 * At the point that cgroup_fork() is called, 'current' is the parent 4547 * At the point that cgroup_fork() is called, 'current' is the parent
4500 * task, and the passed argument 'child' points to the child task. 4548 * task, and the passed argument 'child' points to the child task.
4501 */ 4549 */
4502void cgroup_fork(struct task_struct *child) 4550void cgroup_fork(struct task_struct *child)
4503{ 4551{
4504 task_lock(current); 4552 /*
4553 * We don't need to task_lock() current because current->cgroups
4554 * can't be changed concurrently here. The parent obviously hasn't
4555 * exited and called cgroup_exit(), and we are synchronized against
4556 * cgroup migration through threadgroup_change_begin().
4557 */
4505 child->cgroups = current->cgroups; 4558 child->cgroups = current->cgroups;
4506 get_css_set(child->cgroups); 4559 get_css_set(child->cgroups);
4507 task_unlock(current);
4508 INIT_LIST_HEAD(&child->cg_list); 4560 INIT_LIST_HEAD(&child->cg_list);
4509} 4561}
4510 4562
@@ -4546,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
4546{ 4598{
4547 if (use_task_css_set_links) { 4599 if (use_task_css_set_links) {
4548 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4549 task_lock(child); 4601 if (list_empty(&child->cg_list)) {
4550 if (list_empty(&child->cg_list)) 4602 /*
4603 * It's safe to use child->cgroups without task_lock()
4604 * here because we are protected through
4605 * threadgroup_change_begin() against concurrent
4606 * css_set change in cgroup_task_migrate(). Also
4607 * the task can't exit at that point until
4608 * wake_up_new_task() is called, so we are protected
4609 * against cgroup_exit() setting child->cgroup to
4610 * init_css_set.
4611 */
4551 list_add(&child->cg_list, &child->cgroups->tasks); 4612 list_add(&child->cg_list, &child->cgroups->tasks);
4552 task_unlock(child); 4613 }
4553 write_unlock(&css_set_lock); 4614 write_unlock(&css_set_lock);
4554 } 4615 }
4555} 4616}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 213c0351dad8..fc0646b78a64 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51static inline int __cgroup_freezing_or_frozen(struct task_struct *task) 51bool cgroup_freezing(struct task_struct *task)
52{ 52{
53 enum freezer_state state = task_freezer(task)->state; 53 enum freezer_state state;
54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); 54 bool ret;
55}
56 55
57int cgroup_freezing_or_frozen(struct task_struct *task) 56 rcu_read_lock();
58{ 57 state = task_freezer(task)->state;
59 int result; 58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
60 task_lock(task); 59 rcu_read_unlock();
61 result = __cgroup_freezing_or_frozen(task); 60
62 task_unlock(task); 61 return ret;
63 return result;
64} 62}
65 63
66/* 64/*
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys;
102 * freezer_can_attach(): 100 * freezer_can_attach():
103 * cgroup_mutex (held by caller of can_attach) 101 * cgroup_mutex (held by caller of can_attach)
104 * 102 *
105 * cgroup_freezing_or_frozen():
106 * task->alloc_lock (to get task's cgroup)
107 *
108 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
109 * freezer->lock 104 * freezer->lock
110 * sighand->siglock (if the cgroup is freezing) 105 * sighand->siglock (if the cgroup is freezing)
@@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys;
130 * write_lock css_set_lock (cgroup iterator start) 125 * write_lock css_set_lock (cgroup iterator start)
131 * task->alloc_lock 126 * task->alloc_lock
132 * read_lock css_set_lock (cgroup iterator start) 127 * read_lock css_set_lock (cgroup iterator start)
133 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
134 * sighand->siglock 129 * sighand->siglock
135 */ 130 */
136static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -150,7 +145,11 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
150static void freezer_destroy(struct cgroup_subsys *ss, 145static void freezer_destroy(struct cgroup_subsys *ss,
151 struct cgroup *cgroup) 146 struct cgroup *cgroup)
152{ 147{
153 kfree(cgroup_freezer(cgroup)); 148 struct freezer *freezer = cgroup_freezer(cgroup);
149
150 if (freezer->state != CGROUP_THAWED)
151 atomic_dec(&system_freezing_cnt);
152 kfree(freezer);
154} 153}
155 154
156/* task is frozen or will freeze immediately when next it gets woken */ 155/* task is frozen or will freeze immediately when next it gets woken */
@@ -167,13 +166,17 @@ static bool is_task_frozen_enough(struct task_struct *task)
167 */ 166 */
168static int freezer_can_attach(struct cgroup_subsys *ss, 167static int freezer_can_attach(struct cgroup_subsys *ss,
169 struct cgroup *new_cgroup, 168 struct cgroup *new_cgroup,
170 struct task_struct *task) 169 struct cgroup_taskset *tset)
171{ 170{
172 struct freezer *freezer; 171 struct freezer *freezer;
172 struct task_struct *task;
173 173
174 /* 174 /*
175 * Anything frozen can't move or be moved to/from. 175 * Anything frozen can't move or be moved to/from.
176 */ 176 */
177 cgroup_taskset_for_each(task, new_cgroup, tset)
178 if (cgroup_freezing(task))
179 return -EBUSY;
177 180
178 freezer = cgroup_freezer(new_cgroup); 181 freezer = cgroup_freezer(new_cgroup);
179 if (freezer->state != CGROUP_THAWED) 182 if (freezer->state != CGROUP_THAWED)
@@ -182,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
182 return 0; 185 return 0;
183} 186}
184 187
185static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
186{
187 rcu_read_lock();
188 if (__cgroup_freezing_or_frozen(tsk)) {
189 rcu_read_unlock();
190 return -EBUSY;
191 }
192 rcu_read_unlock();
193 return 0;
194}
195
196static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
197{ 189{
198 struct freezer *freezer; 190 struct freezer *freezer;
@@ -220,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
220 212
221 /* Locking avoids race with FREEZING -> THAWED transitions. */ 213 /* Locking avoids race with FREEZING -> THAWED transitions. */
222 if (freezer->state == CGROUP_FREEZING) 214 if (freezer->state == CGROUP_FREEZING)
223 freeze_task(task, true); 215 freeze_task(task);
224 spin_unlock_irq(&freezer->lock); 216 spin_unlock_irq(&freezer->lock);
225} 217}
226 218
@@ -238,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup,
238 cgroup_iter_start(cgroup, &it); 230 cgroup_iter_start(cgroup, &it);
239 while ((task = cgroup_iter_next(cgroup, &it))) { 231 while ((task = cgroup_iter_next(cgroup, &it))) {
240 ntotal++; 232 ntotal++;
241 if (is_task_frozen_enough(task)) 233 if (freezing(task) && is_task_frozen_enough(task))
242 nfrozen++; 234 nfrozen++;
243 } 235 }
244 236
@@ -286,10 +278,9 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
286 struct task_struct *task; 278 struct task_struct *task;
287 unsigned int num_cant_freeze_now = 0; 279 unsigned int num_cant_freeze_now = 0;
288 280
289 freezer->state = CGROUP_FREEZING;
290 cgroup_iter_start(cgroup, &it); 281 cgroup_iter_start(cgroup, &it);
291 while ((task = cgroup_iter_next(cgroup, &it))) { 282 while ((task = cgroup_iter_next(cgroup, &it))) {
292 if (!freeze_task(task, true)) 283 if (!freeze_task(task))
293 continue; 284 continue;
294 if (is_task_frozen_enough(task)) 285 if (is_task_frozen_enough(task))
295 continue; 286 continue;
@@ -307,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
307 struct task_struct *task; 298 struct task_struct *task;
308 299
309 cgroup_iter_start(cgroup, &it); 300 cgroup_iter_start(cgroup, &it);
310 while ((task = cgroup_iter_next(cgroup, &it))) { 301 while ((task = cgroup_iter_next(cgroup, &it)))
311 thaw_process(task); 302 __thaw_task(task);
312 }
313 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
314
315 freezer->state = CGROUP_THAWED;
316} 304}
317 305
318static int freezer_change_state(struct cgroup *cgroup, 306static int freezer_change_state(struct cgroup *cgroup,
@@ -326,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup,
326 spin_lock_irq(&freezer->lock); 314 spin_lock_irq(&freezer->lock);
327 315
328 update_if_frozen(cgroup, freezer); 316 update_if_frozen(cgroup, freezer);
329 if (goal_state == freezer->state)
330 goto out;
331 317
332 switch (goal_state) { 318 switch (goal_state) {
333 case CGROUP_THAWED: 319 case CGROUP_THAWED:
320 if (freezer->state != CGROUP_THAWED)
321 atomic_dec(&system_freezing_cnt);
322 freezer->state = CGROUP_THAWED;
334 unfreeze_cgroup(cgroup, freezer); 323 unfreeze_cgroup(cgroup, freezer);
335 break; 324 break;
336 case CGROUP_FROZEN: 325 case CGROUP_FROZEN:
326 if (freezer->state == CGROUP_THAWED)
327 atomic_inc(&system_freezing_cnt);
328 freezer->state = CGROUP_FREEZING;
337 retval = try_to_freeze_cgroup(cgroup, freezer); 329 retval = try_to_freeze_cgroup(cgroup, freezer);
338 break; 330 break;
339 default: 331 default:
340 BUG(); 332 BUG();
341 } 333 }
342out: 334
343 spin_unlock_irq(&freezer->lock); 335 spin_unlock_irq(&freezer->lock);
344 336
345 return retval; 337 return retval;
@@ -388,10 +380,5 @@ struct cgroup_subsys freezer_subsys = {
388 .populate = freezer_populate, 380 .populate = freezer_populate,
389 .subsys_id = freezer_subsys_id, 381 .subsys_id = freezer_subsys_id,
390 .can_attach = freezer_can_attach, 382 .can_attach = freezer_can_attach,
391 .can_attach_task = freezer_can_attach_task,
392 .pre_attach = NULL,
393 .attach_task = NULL,
394 .attach = NULL,
395 .fork = freezer_fork, 383 .fork = freezer_fork,
396 .exit = NULL,
397}; 384};
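The can_attach conversion above follows the new cgroup_taskset API, which hands each callback the whole set of migrating tasks instead of invoking a per-task hook. For a subsystem author the resulting shape is roughly the following; this is an illustrative sketch with a hypothetical subsystem and predicate, not code from this patch:

static int mysubsys_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			       struct cgroup_taskset *tset)
{
	struct task_struct *task;

	/* veto the whole migration if any task in the set is unsuitable */
	cgroup_taskset_for_each(task, cgrp, tset) {
		if (task_is_unsuitable(task))	/* hypothetical check */
			return -EBUSY;
	}
	return 0;
}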
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5ca38d5d238a..2060c6e57027 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -470,7 +470,7 @@ out:
470 cpu_maps_update_done(); 470 cpu_maps_update_done();
471} 471}
472 472
473static int alloc_frozen_cpus(void) 473static int __init alloc_frozen_cpus(void)
474{ 474{
475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) 475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
476 return -ENOMEM; 476 return -ENOMEM;
@@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
543} 543}
544 544
545 545
546int cpu_hotplug_pm_sync_init(void) 546static int __init cpu_hotplug_pm_sync_init(void)
547{ 547{
548 pm_notifier(cpu_hotplug_pm_callback, 0); 548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0; 549 return 0;
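Both functions above run exactly once from boot-time initcalls, so marking them __init lets their text be discarded with the rest of .init after boot. The usual pairing, sketched with a hypothetical function name:

static int __init my_pm_sync_init(void)
{
	/* one-shot setup; this code is freed once boot completes */
	return 0;
}
core_initcall(my_pm_sync_init);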
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0b1712dba587..a09ac2b9a661 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1389,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp)
1389 return val; 1389 return val;
1390} 1390}
1391 1391
1392/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1393static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1394 struct task_struct *tsk)
1395{
1396 struct cpuset *cs = cgroup_cs(cont);
1397
1398 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1399 return -ENOSPC;
1400
1401 /*
1402 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1403 * cannot change their cpu affinity and isolating such threads by their
1404 * set of allowed nodes is unnecessary. Thus, cpusets are not
1405 * applicable for such threads. This prevents checking for success of
1406 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1407 * be changed.
1408 */
1409 if (tsk->flags & PF_THREAD_BOUND)
1410 return -EINVAL;
1411
1412 return 0;
1413}
1414
1415static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1416{
1417 return security_task_setscheduler(task);
1418}
1419
1420/* 1392/*
1421 * Protected by cgroup_lock. The nodemasks must be stored globally because 1393 * Protected by cgroup_lock. The nodemasks must be stored globally because
1422 * dynamically allocating them is not allowed in pre_attach, and they must 1394 * dynamically allocating them is not allowed in can_attach, and they must
1423 * persist among pre_attach, attach_task, and attach. 1395 * persist until attach.
1424 */ 1396 */
1425static cpumask_var_t cpus_attach; 1397static cpumask_var_t cpus_attach;
1426static nodemask_t cpuset_attach_nodemask_from; 1398static nodemask_t cpuset_attach_nodemask_from;
1427static nodemask_t cpuset_attach_nodemask_to; 1399static nodemask_t cpuset_attach_nodemask_to;
1428 1400
1429/* Set-up work for before attaching each task. */ 1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1430static void cpuset_pre_attach(struct cgroup *cont) 1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1403 struct cgroup_taskset *tset)
1431{ 1404{
1432 struct cpuset *cs = cgroup_cs(cont); 1405 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task;
1407 int ret;
1408
1409 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1410 return -ENOSPC;
1411
1412 cgroup_taskset_for_each(task, cgrp, tset) {
1413 /*
1414 * Kthreads bound to specific cpus cannot be moved to a new
1415 * cpuset; we cannot change their cpu affinity and
1416 * isolating such threads by their set of allowed nodes is
1417 * unnecessary. Thus, cpusets are not applicable for such
1418 * threads. This prevents checking for success of
1419 * set_cpus_allowed_ptr() on all attached tasks before
1420 * cpus_allowed may be changed.
1421 */
1422 if (task->flags & PF_THREAD_BOUND)
1423 return -EINVAL;
1424 if ((ret = security_task_setscheduler(task)))
1425 return ret;
1426 }
1433 1427
1428 /* prepare for attach */
1434 if (cs == &top_cpuset) 1429 if (cs == &top_cpuset)
1435 cpumask_copy(cpus_attach, cpu_possible_mask); 1430 cpumask_copy(cpus_attach, cpu_possible_mask);
1436 else 1431 else
1437 guarantee_online_cpus(cs, cpus_attach); 1432 guarantee_online_cpus(cs, cpus_attach);
1438 1433
1439 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1434 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1440}
1441
1442/* Per-thread attachment work. */
1443static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1444{
1445 int err;
1446 struct cpuset *cs = cgroup_cs(cont);
1447 1435
1448 /* 1436 return 0;
1449 * can_attach beforehand should guarantee that this doesn't fail.
1450 * TODO: have a better way to handle failure here
1451 */
1452 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1453 WARN_ON_ONCE(err);
1454
1455 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1456 cpuset_update_task_spread_flag(cs, tsk);
1457} 1437}
1458 1438
1459static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1460 struct cgroup *oldcont, struct task_struct *tsk) 1440 struct cgroup_taskset *tset)
1461{ 1441{
1462 struct mm_struct *mm; 1442 struct mm_struct *mm;
1463 struct cpuset *cs = cgroup_cs(cont); 1443 struct task_struct *task;
1464 struct cpuset *oldcs = cgroup_cs(oldcont); 1444 struct task_struct *leader = cgroup_taskset_first(tset);
1445 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1446 struct cpuset *cs = cgroup_cs(cgrp);
1447 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1448
1449 cgroup_taskset_for_each(task, cgrp, tset) {
1450 /*
1451 * can_attach beforehand should guarantee that this doesn't
1452 * fail. TODO: have a better way to handle failure here
1453 */
1454 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1455
1456 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1457 cpuset_update_task_spread_flag(cs, task);
1458 }
1465 1459
1466 /* 1460 /*
1467 * Change mm, possibly for multiple threads in a threadgroup. This is 1461 * Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1469,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1469 */ 1463 */
1470 cpuset_attach_nodemask_from = oldcs->mems_allowed; 1464 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1471 cpuset_attach_nodemask_to = cs->mems_allowed; 1465 cpuset_attach_nodemask_to = cs->mems_allowed;
1472 mm = get_task_mm(tsk); 1466 mm = get_task_mm(leader);
1473 if (mm) { 1467 if (mm) {
1474 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1468 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1475 if (is_memory_migrate(cs)) 1469 if (is_memory_migrate(cs))
@@ -1925,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = {
1925 .create = cpuset_create, 1919 .create = cpuset_create,
1926 .destroy = cpuset_destroy, 1920 .destroy = cpuset_destroy,
1927 .can_attach = cpuset_can_attach, 1921 .can_attach = cpuset_can_attach,
1928 .can_attach_task = cpuset_can_attach_task,
1929 .pre_attach = cpuset_pre_attach,
1930 .attach_task = cpuset_attach_task,
1931 .attach = cpuset_attach, 1922 .attach = cpuset_attach,
1932 .populate = cpuset_populate, 1923 .populate = cpuset_populate,
1933 .post_clone = cpuset_post_clone, 1924 .post_clone = cpuset_post_clone,
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 63786e71a3cd..e2ae7349437f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
1982 kdb_printf("%-20s%8u 0x%p ", mod->name, 1982 kdb_printf("%-20s%8u 0x%p ", mod->name,
1983 mod->core_size, (void *)mod); 1983 mod->core_size, (void *)mod);
1984#ifdef CONFIG_MODULE_UNLOAD 1984#ifdef CONFIG_MODULE_UNLOAD
1985 kdb_printf("%4d ", module_refcount(mod)); 1985 kdb_printf("%4ld ", module_refcount(mod));
1986#endif 1986#endif
1987 if (mod->state == MODULE_STATE_GOING) 1987 if (mod->state == MODULE_STATE_GOING)
1988 kdb_printf(" (Unloading)"); 1988 kdb_printf(" (Unloading)");
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 890eb02c2f21..a8f4ac001a00 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -6941,10 +6941,13 @@ static int __perf_cgroup_move(void *info)
6941 return 0; 6941 return 0;
6942} 6942}
6943 6943
6944static void 6944static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
6945perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) 6945 struct cgroup_taskset *tset)
6946{ 6946{
6947 task_function_call(task, __perf_cgroup_move, task); 6947 struct task_struct *task;
6948
6949 cgroup_taskset_for_each(task, cgrp, tset)
6950 task_function_call(task, __perf_cgroup_move, task);
6948} 6951}
6949 6952
6950static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 6953static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -6958,7 +6961,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
6958 if (!(task->flags & PF_EXITING)) 6961 if (!(task->flags & PF_EXITING))
6959 return; 6962 return;
6960 6963
6961 perf_cgroup_attach_task(cgrp, task); 6964 task_function_call(task, __perf_cgroup_move, task);
6962} 6965}
6963 6966
6964struct cgroup_subsys perf_subsys = { 6967struct cgroup_subsys perf_subsys = {
@@ -6967,6 +6970,6 @@ struct cgroup_subsys perf_subsys = {
6967 .create = perf_cgroup_create, 6970 .create = perf_cgroup_create,
6968 .destroy = perf_cgroup_destroy, 6971 .destroy = perf_cgroup_destroy,
6969 .exit = perf_cgroup_exit, 6972 .exit = perf_cgroup_exit,
6970 .attach_task = perf_cgroup_attach_task, 6973 .attach = perf_cgroup_attach,
6971}; 6974};
6972#endif /* CONFIG_CGROUP_PERF */ 6975#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 7f3011c6b57f..6ddaba43fb7a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
diff --git a/kernel/exit.c b/kernel/exit.c
index d579a459309d..294b1709170d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -679,8 +680,6 @@ static void exit_mm(struct task_struct * tsk)
679 tsk->mm = NULL; 680 tsk->mm = NULL;
680 up_read(&mm->mmap_sem); 681 up_read(&mm->mmap_sem);
681 enter_lazy_tlb(mm, current); 682 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk);
684 task_unlock(tsk); 683 task_unlock(tsk);
685 mm_update_next_owner(mm); 684 mm_update_next_owner(mm);
686 mmput(mm); 685 mmput(mm);
@@ -888,7 +887,7 @@ static void check_stack_usage(void)
888static inline void check_stack_usage(void) {} 887static inline void check_stack_usage(void) {}
889#endif 888#endif
890 889
891NORET_TYPE void do_exit(long code) 890void do_exit(long code)
892{ 891{
893 struct task_struct *tsk = current; 892 struct task_struct *tsk = current;
894 int group_dead; 893 int group_dead;
@@ -965,8 +964,7 @@ NORET_TYPE void do_exit(long code)
965 acct_collect(code, group_dead); 964 acct_collect(code, group_dead);
966 if (group_dead) 965 if (group_dead)
967 tty_audit_exit(); 966 tty_audit_exit();
968 if (unlikely(tsk->audit_context)) 967 audit_free(tsk);
969 audit_free(tsk);
970 968
971 tsk->exit_code = code; 969 tsk->exit_code = code;
972 taskstats_exit(tsk, group_dead); 970 taskstats_exit(tsk, group_dead);
@@ -1037,9 +1035,12 @@ NORET_TYPE void do_exit(long code)
1037 validate_creds_for_do_exit(tsk); 1035 validate_creds_for_do_exit(tsk);
1038 1036
1039 preempt_disable(); 1037 preempt_disable();
1038 if (tsk->nr_dirtied)
1039 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
1040 exit_rcu(); 1040 exit_rcu();
1041 /* causes final put_task_struct in finish_task_switch(). */ 1041 /* causes final put_task_struct in finish_task_switch(). */
1042 tsk->state = TASK_DEAD; 1042 tsk->state = TASK_DEAD;
1043 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
1043 schedule(); 1044 schedule();
1044 BUG(); 1045 BUG();
1045 /* Avoid "noreturn function does return". */ 1046 /* Avoid "noreturn function does return". */
@@ -1049,7 +1050,7 @@ NORET_TYPE void do_exit(long code)
1049 1050
1050EXPORT_SYMBOL_GPL(do_exit); 1051EXPORT_SYMBOL_GPL(do_exit);
1051 1052
1052NORET_TYPE void complete_and_exit(struct completion *comp, long code) 1053void complete_and_exit(struct completion *comp, long code)
1053{ 1054{
1054 if (comp) 1055 if (comp)
1055 complete(comp); 1056 complete(comp);
@@ -1068,7 +1069,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
1068 * Take down every thread in the group. This is called by fatal signals 1069 * Take down every thread in the group. This is called by fatal signals
1069 * as well as by sys_exit_group (below). 1070 * as well as by sys_exit_group (below).
1070 */ 1071 */
1071NORET_TYPE void 1072void
1072do_group_exit(int exit_code) 1073do_group_exit(int exit_code)
1073{ 1074{
1074 struct signal_struct *sig = current->signal; 1075 struct signal_struct *sig = current->signal;
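Dropping NORET_TYPE from the definitions above does not lose the noreturn property; the treewide cleanup in this merge expresses it on the prototypes instead, roughly in this style (shown for illustration, matching the corresponding sched.h declarations):

extern void do_exit(long error_code) __noreturn;
extern void do_group_exit(int exit_code) __noreturn;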
diff --git a/kernel/fork.c b/kernel/fork.c
index b058c5820ecd..051f090d40c1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -76,6 +76,9 @@
76 76
77#include <trace/events/sched.h> 77#include <trace/events/sched.h>
78 78
79#define CREATE_TRACE_POINTS
80#include <trace/events/task.h>
81
79/* 82/*
80 * Protected counters by write_lock_irq(&tasklist_lock) 83 * Protected counters by write_lock_irq(&tasklist_lock)
81 */ 84 */
@@ -870,6 +873,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
870{ 873{
871#ifdef CONFIG_BLOCK 874#ifdef CONFIG_BLOCK
872 struct io_context *ioc = current->io_context; 875 struct io_context *ioc = current->io_context;
876 struct io_context *new_ioc;
873 877
874 if (!ioc) 878 if (!ioc)
875 return 0; 879 return 0;
@@ -881,11 +885,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
881 if (unlikely(!tsk->io_context)) 885 if (unlikely(!tsk->io_context))
882 return -ENOMEM; 886 return -ENOMEM;
883 } else if (ioprio_valid(ioc->ioprio)) { 887 } else if (ioprio_valid(ioc->ioprio)) {
884 tsk->io_context = alloc_io_context(GFP_KERNEL, -1); 888 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
885 if (unlikely(!tsk->io_context)) 889 if (unlikely(!new_ioc))
886 return -ENOMEM; 890 return -ENOMEM;
887 891
888 tsk->io_context->ioprio = ioc->ioprio; 892 new_ioc->ioprio = ioc->ioprio;
893 put_io_context(new_ioc, NULL);
889 } 894 }
890#endif 895#endif
891 return 0; 896 return 0;
@@ -972,7 +977,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
972 sched_autogroup_fork(sig); 977 sched_autogroup_fork(sig);
973 978
974#ifdef CONFIG_CGROUPS 979#ifdef CONFIG_CGROUPS
975 init_rwsem(&sig->threadgroup_fork_lock); 980 init_rwsem(&sig->group_rwsem);
976#endif 981#endif
977 982
978 sig->oom_adj = current->signal->oom_adj; 983 sig->oom_adj = current->signal->oom_adj;
@@ -992,7 +997,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
992 new_flags |= PF_FORKNOEXEC; 997 new_flags |= PF_FORKNOEXEC;
993 new_flags |= PF_STARTING; 998 new_flags |= PF_STARTING;
994 p->flags = new_flags; 999 p->flags = new_flags;
995 clear_freeze_flag(p);
996} 1000}
997 1001
998SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1002SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1154,7 +1158,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1154 p->io_context = NULL; 1158 p->io_context = NULL;
1155 p->audit_context = NULL; 1159 p->audit_context = NULL;
1156 if (clone_flags & CLONE_THREAD) 1160 if (clone_flags & CLONE_THREAD)
1157 threadgroup_fork_read_lock(current); 1161 threadgroup_change_begin(current);
1158 cgroup_fork(p); 1162 cgroup_fork(p);
1159#ifdef CONFIG_NUMA 1163#ifdef CONFIG_NUMA
1160 p->mempolicy = mpol_dup(p->mempolicy); 1164 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1292,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1292 1296
1293 p->nr_dirtied = 0; 1297 p->nr_dirtied = 0;
1294 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1298 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1299 p->dirty_paused_when = 0;
1295 1300
1296 /* 1301 /*
1297 * Ok, make it visible to the rest of the system. 1302 * Ok, make it visible to the rest of the system.
@@ -1369,8 +1374,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1369 proc_fork_connector(p); 1374 proc_fork_connector(p);
1370 cgroup_post_fork(p); 1375 cgroup_post_fork(p);
1371 if (clone_flags & CLONE_THREAD) 1376 if (clone_flags & CLONE_THREAD)
1372 threadgroup_fork_read_unlock(current); 1377 threadgroup_change_end(current);
1373 perf_event_fork(p); 1378 perf_event_fork(p);
1379
1380 trace_task_newtask(p, clone_flags);
1381
1374 return p; 1382 return p;
1375 1383
1376bad_fork_free_pid: 1384bad_fork_free_pid:
@@ -1404,7 +1412,7 @@ bad_fork_cleanup_policy:
1404bad_fork_cleanup_cgroup: 1412bad_fork_cleanup_cgroup:
1405#endif 1413#endif
1406 if (clone_flags & CLONE_THREAD) 1414 if (clone_flags & CLONE_THREAD)
1407 threadgroup_fork_read_unlock(current); 1415 threadgroup_change_end(current);
1408 cgroup_exit(p, cgroup_callbacks_done); 1416 cgroup_exit(p, cgroup_callbacks_done);
1409 delayacct_tsk_free(p); 1417 delayacct_tsk_free(p);
1410 module_put(task_thread_info(p)->exec_domain->module); 1418 module_put(task_thread_info(p)->exec_domain->module);
@@ -1519,8 +1527,6 @@ long do_fork(unsigned long clone_flags,
1519 init_completion(&vfork); 1527 init_completion(&vfork);
1520 } 1528 }
1521 1529
1522 audit_finish_fork(p);
1523
1524 /* 1530 /*
1525 * We set PF_STARTING at creation in case tracing wants to 1531 * We set PF_STARTING at creation in case tracing wants to
1526 * use this to distinguish a fully live task from one that 1532 * use this to distinguish a fully live task from one that
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7be56c534397..9815b8d1eed5 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -9,101 +9,114 @@
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12#include <linux/kthread.h>
12 13
13/* 14/* total number of freezing conditions in effect */
14 * freezing is complete, mark current process as frozen 15atomic_t system_freezing_cnt = ATOMIC_INIT(0);
16EXPORT_SYMBOL(system_freezing_cnt);
17
18/* indicate whether PM freezing is in effect, protected by pm_mutex */
19bool pm_freezing;
20bool pm_nosig_freezing;
21
22/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock);
24
25/**
26 * freezing_slow_path - slow path for testing whether a task needs to be frozen
27 * @p: task to be tested
28 *
29 * This function is called by freezing() if system_freezing_cnt isn't zero
30 * and tests whether @p needs to enter and stay in frozen state. Can be
31 * called under any context. The freezers are responsible for ensuring the
32 * target tasks see the updated state.
15 */ 33 */
16static inline void frozen_process(void) 34bool freezing_slow_path(struct task_struct *p)
17{ 35{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 36 if (p->flags & PF_NOFREEZE)
19 current->flags |= PF_FROZEN; 37 return false;
20 smp_wmb(); 38
21 } 39 if (pm_nosig_freezing || cgroup_freezing(p))
22 clear_freeze_flag(current); 40 return true;
41
42 if (pm_freezing && !(p->flags & PF_KTHREAD))
43 return true;
44
45 return false;
23} 46}
47EXPORT_SYMBOL(freezing_slow_path);
24 48
25/* Refrigerator is place where frozen processes are stored :-). */ 49/* Refrigerator is place where frozen processes are stored :-). */
26void refrigerator(void) 50bool __refrigerator(bool check_kthr_stop)
27{ 51{
28 /* Hmm, should we be allowed to suspend when there are realtime 52 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */ 53 processes around? */
30 long save; 54 bool was_frozen = false;
55 long save = current->state;
31 56
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 57 pr_debug("%s entered refrigerator\n", current->comm);
42 58
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
50 for (;;) { 59 for (;;) {
51 set_current_state(TASK_UNINTERRUPTIBLE); 60 set_current_state(TASK_UNINTERRUPTIBLE);
52 if (!frozen(current)) 61
62 spin_lock_irq(&freezer_lock);
63 current->flags |= PF_FROZEN;
64 if (!freezing(current) ||
65 (check_kthr_stop && kthread_should_stop()))
66 current->flags &= ~PF_FROZEN;
67 spin_unlock_irq(&freezer_lock);
68
69 if (!(current->flags & PF_FROZEN))
53 break; 70 break;
71 was_frozen = true;
54 schedule(); 72 schedule();
55 } 73 }
56 74
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
60 pr_debug("%s left refrigerator\n", current->comm); 75 pr_debug("%s left refrigerator\n", current->comm);
61 __set_current_state(save); 76
77 /*
78 * Restore saved task state before returning. The mb'd version
79 * needs to be used; otherwise, it might silently break
80 * synchronization which depends on ordered task state change.
81 */
82 set_current_state(save);
83
84 return was_frozen;
62} 85}
63EXPORT_SYMBOL(refrigerator); 86EXPORT_SYMBOL(__refrigerator);
64 87
65static void fake_signal_wake_up(struct task_struct *p) 88static void fake_signal_wake_up(struct task_struct *p)
66{ 89{
67 unsigned long flags; 90 unsigned long flags;
68 91
69 spin_lock_irqsave(&p->sighand->siglock, flags); 92 if (lock_task_sighand(p, &flags)) {
70 signal_wake_up(p, 0); 93 signal_wake_up(p, 0);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags); 94 unlock_task_sighand(p, &flags);
95 }
72} 96}
73 97
74/** 98/**
75 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
76 * @p: task to send the request to 100 * @p: task to send the request to
77 * @sig_only: if set, the request will only be sent if the task has the 101 *
78 * PF_FREEZER_NOSIG flag unset 102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
79 * Return value: 'false', if @sig_only is set and the task has 103 * flag and either sending a fake signal to it or waking it up, depending
80 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise 104 * on whether it has %PF_FREEZER_NOSIG set.
81 * 105 *
82 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and 106 * RETURNS:
83 * either sending a fake signal to it or waking it up, depending on whether 107 * %false, if @p is not freezing or already frozen; %true, otherwise
84 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
85 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
86 * TIF_FREEZE flag will not be set.
87 */ 108 */
88bool freeze_task(struct task_struct *p, bool sig_only) 109bool freeze_task(struct task_struct *p)
89{ 110{
90 /* 111 unsigned long flags;
91 * We first check if the task is freezing and next if it has already 112
92 * been frozen to avoid the race with frozen_process() which first marks 113 spin_lock_irqsave(&freezer_lock, flags);
93 * the task as frozen and next clears its TIF_FREEZE. 114 if (!freezing(p) || frozen(p)) {
94 */ 115 spin_unlock_irqrestore(&freezer_lock, flags);
95 if (!freezing(p)) { 116 return false;
96 smp_rmb();
97 if (frozen(p))
98 return false;
99
100 if (!sig_only || should_send_signal(p))
101 set_freeze_flag(p);
102 else
103 return false;
104 } 117 }
105 118
106 if (should_send_signal(p)) { 119 if (!(p->flags & PF_KTHREAD)) {
107 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
108 /* 121 /*
109 * fake_signal_wake_up() goes through p's scheduler 122 * fake_signal_wake_up() goes through p's scheduler
@@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only)
111 * TASK_RUNNING transition can't race with task state 124 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks(). 125 * testing in try_to_freeze_tasks().
113 */ 126 */
114 } else if (sig_only) {
115 return false;
116 } else { 127 } else {
117 wake_up_state(p, TASK_INTERRUPTIBLE); 128 wake_up_state(p, TASK_INTERRUPTIBLE);
118 } 129 }
119 130
131 spin_unlock_irqrestore(&freezer_lock, flags);
120 return true; 132 return true;
121} 133}
122 134
123void cancel_freezing(struct task_struct *p) 135void __thaw_task(struct task_struct *p)
124{ 136{
125 unsigned long flags; 137 unsigned long flags;
126 138
127 if (freezing(p)) { 139 /*
128 pr_debug(" clean up: %s\n", p->comm); 140 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
129 clear_freeze_flag(p); 141 * be visible to @p as waking up implies wmb. Waking up inside
130 spin_lock_irqsave(&p->sighand->siglock, flags); 142 * freezer_lock also prevents wakeups from leaking outside
131 recalc_sigpending_and_wake(p); 143 * refrigerator.
132 spin_unlock_irqrestore(&p->sighand->siglock, flags); 144 */
133 } 145 spin_lock_irqsave(&freezer_lock, flags);
134} 146 if (frozen(p))
135 147 wake_up_process(p);
136static int __thaw_process(struct task_struct *p) 148 spin_unlock_irqrestore(&freezer_lock, flags);
137{
138 if (frozen(p)) {
139 p->flags &= ~PF_FROZEN;
140 return 1;
141 }
142 clear_freeze_flag(p);
143 return 0;
144} 149}
145 150
146/* 151/**
147 * Wake up a frozen process 152 * set_freezable - make %current freezable
148 * 153 *
149 * task_lock() is needed to prevent the race with refrigerator() which may 154 * Mark %current freezable and enter refrigerator if necessary.
150 * occur if the freezing of tasks fails. Namely, without the lock, if the
151 * freezing of tasks failed, thaw_tasks() might have run before a task in
152 * refrigerator() could call frozen_process(), in which case the task would be
153 * frozen and no one would thaw it.
154 */ 155 */
155int thaw_process(struct task_struct *p) 156bool set_freezable(void)
156{ 157{
157 task_lock(p); 158 might_sleep();
158 if (__thaw_process(p) == 1) { 159
159 task_unlock(p); 160 /*
160 wake_up_process(p); 161 * Modify flags while holding freezer_lock. This ensures the
161 return 1; 162 * freezer notices that we aren't frozen yet or the freezing
162 } 163 * condition is visible to try_to_freeze() below.
163 task_unlock(p); 164 */
164 return 0; 165 spin_lock_irq(&freezer_lock);
166 current->flags &= ~PF_NOFREEZE;
167 spin_unlock_irq(&freezer_lock);
168
169 return try_to_freeze();
165} 170}
166EXPORT_SYMBOL(thaw_process); 171EXPORT_SYMBOL(set_freezable);
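freezing_slow_path() is only reached when at least one freezing condition exists; the common case is short-circuited by the new system_freezing_cnt counter. The fast-path inline from this series' freezer.h, reproduced here for illustration:

static inline bool freezing(struct task_struct *p)
{
	if (likely(!atomic_read(&system_freezing_cnt)))
		return false;
	return freezing_slow_path(p);
}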
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a73dd6c7372d..b7952316016a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,7 +15,7 @@
15 15
16#define istate core_internal_state__do_not_mess_with_it 16#define istate core_internal_state__do_not_mess_with_it
17 17
18extern int noirqdebug; 18extern bool noirqdebug;
19 19
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c585..1f9e26526b69 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
135 return -EINVAL; 135 return -EINVAL;
136 if (intsize < 1) 136 if (intsize < 1)
137 return -EINVAL; 137 return -EINVAL;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
139 (intspec[0] >= d->hwirq_base + d->nr_irq)))
140 return -EINVAL;
138 141
139 *out_hwirq = intspec[0]; 142 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE; 143 *out_type = IRQ_TYPE_NONE;
@@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
143 return 0; 146 return 0;
144} 147}
145 148
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/** 149/**
152 * irq_domain_create_simple() - Set up a 'simple' translation range 150 * irq_domain_create_simple() - Set up a 'simple' translation range
153 */ 151 */
@@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
182} 180}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 181EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */ 182#endif /* CONFIG_OF_IRQ */
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1da999f5e746..a9a9dbe49fea 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1292,7 +1292,7 @@ EXPORT_SYMBOL(free_irq);
1292 * and to set up the interrupt handler in the right order. 1292 * and to set up the interrupt handler in the right order.
1293 * 1293 *
1294 * If you want to set up a threaded irq handler for your device 1294 * If you want to set up a threaded irq handler for your device
1295 * then you need to supply @handler and @thread_fn. @handler ist 1295 * then you need to supply @handler and @thread_fn. @handler is
1296 * still called in hard interrupt context and has to check 1296 * still called in hard interrupt context and has to check
1297 * whether the interrupt originates from the device. If yes it 1297 * whether the interrupt originates from the device. If yes it
1298 * needs to disable the interrupt on the device and return 1298 * needs to disable the interrupt on the device and return
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dc813a948be2..611cd6003c45 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
325 desc->irqs_unhandled = 0; 325 desc->irqs_unhandled = 0;
326} 326}
327 327
328int noirqdebug __read_mostly; 328bool noirqdebug __read_mostly;
329 329
330int noirqdebug_setup(char *str) 330int noirqdebug_setup(char *str)
331{ 331{
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 30c3c7708132..01d3b70fc98a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -71,6 +71,7 @@ void jump_label_inc(struct jump_label_key *key)
71 atomic_inc(&key->enabled); 71 atomic_inc(&key->enabled);
72 jump_label_unlock(); 72 jump_label_unlock();
73} 73}
74EXPORT_SYMBOL_GPL(jump_label_inc);
74 75
75static void __jump_label_dec(struct jump_label_key *key, 76static void __jump_label_dec(struct jump_label_key *key,
76 unsigned long rate_limit, struct delayed_work *work) 77 unsigned long rate_limit, struct delayed_work *work)
@@ -86,6 +87,7 @@ static void __jump_label_dec(struct jump_label_key *key,
86 87
87 jump_label_unlock(); 88 jump_label_unlock();
88} 89}
90EXPORT_SYMBOL_GPL(jump_label_dec);
89 91
90static void jump_label_update_timeout(struct work_struct *work) 92static void jump_label_update_timeout(struct work_struct *work)
91{ 93{
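The two new exports make jump labels usable from module code. A minimal consumer, using the jump_label_key API as it stands at this point (hypothetical key and helpers, illustrative only):

static struct jump_label_key my_key;	/* branch starts disabled */

static void my_fast_path(void)
{
	if (static_branch(&my_key))	/* a NOP until the key is enabled */
		handle_rare_case();	/* hypothetical slow path */
}

static void my_feature_set(bool on)
{
	if (on)
		jump_label_inc(&my_key);	/* patch branch sites in */
	else
		jump_label_dec(&my_key);
}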
diff --git a/kernel/kexec.c b/kernel/kexec.c
index dc7bc0829286..7b0886786701 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h> 35#include <linux/syscore_ops.h>
37 36
38#include <asm/page.h> 37#include <asm/page.h>
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
1094 if (kexec_crash_image) { 1093 if (kexec_crash_image) {
1095 struct pt_regs fixed_regs; 1094 struct pt_regs fixed_regs;
1096 1095
1097 kmsg_dump(KMSG_DUMP_KEXEC);
1098
1099 crash_setup_regs(&fixed_regs, regs); 1096 crash_setup_regs(&fixed_regs, regs);
1100 crash_save_vmcoreinfo(); 1097 crash_save_vmcoreinfo();
1101 machine_crash_shutdown(&fixed_regs); 1098 machine_crash_shutdown(&fixed_regs);
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
1132{ 1129{
1133 int ret = 0; 1130 int ret = 0;
1134 unsigned long start, end; 1131 unsigned long start, end;
1132 unsigned long old_size;
1133 struct resource *ram_res;
1135 1134
1136 mutex_lock(&kexec_mutex); 1135 mutex_lock(&kexec_mutex);
1137 1136
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size)
1141 } 1140 }
1142 start = crashk_res.start; 1141 start = crashk_res.start;
1143 end = crashk_res.end; 1142 end = crashk_res.end;
1143 old_size = (end == 0) ? 0 : end - start + 1;
1144 if (new_size >= old_size) {
1145 ret = (new_size == old_size) ? 0 : -EINVAL;
1146 goto unlock;
1147 }
1144 1148
1145 if (new_size >= end - start + 1) { 1149 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1146 ret = -EINVAL; 1150 if (!ram_res) {
1147 if (new_size == end - start + 1) 1151 ret = -ENOMEM;
1148 ret = 0;
1149 goto unlock; 1152 goto unlock;
1150 } 1153 }
1151 1154
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size)
1157 1160
1158 if ((start == end) && (crashk_res.parent != NULL)) 1161 if ((start == end) && (crashk_res.parent != NULL))
1159 release_resource(&crashk_res); 1162 release_resource(&crashk_res);
1163
1164 ram_res->start = end;
1165 ram_res->end = crashk_res.end;
1166 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1167 ram_res->name = "System RAM";
1168
1160 crashk_res.end = end - 1; 1169 crashk_res.end = end - 1;
1170
1171 insert_resource(&iomem_resource, ram_res);
1161 crash_unmap_reserved_pages(); 1172 crash_unmap_reserved_pages();
1162 1173
1163unlock: 1174unlock:
@@ -1523,7 +1534,7 @@ int kernel_kexec(void)
1523 1534
1524#ifdef CONFIG_KEXEC_JUMP 1535#ifdef CONFIG_KEXEC_JUMP
1525 if (kexec_image->preserve_context) { 1536 if (kexec_image->preserve_context) {
1526 mutex_lock(&pm_mutex); 1537 lock_system_sleep();
1527 pm_prepare_console(); 1538 pm_prepare_console();
1528 error = freeze_processes(); 1539 error = freeze_processes();
1529 if (error) { 1540 if (error) {
@@ -1576,7 +1587,7 @@ int kernel_kexec(void)
1576 thaw_processes(); 1587 thaw_processes();
1577 Restore_console: 1588 Restore_console:
1578 pm_restore_console(); 1589 pm_restore_console();
1579 mutex_unlock(&pm_mutex); 1590 unlock_system_sleep();
1580 } 1591 }
1581#endif 1592#endif
1582 1593
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a4bea97c75b6..a0a88543934e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,7 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h>
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
41#include <trace/events/module.h> 42#include <trace/events/module.h>
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq;
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; 51static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; 52static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock); 53static DEFINE_SPINLOCK(umh_sysctl_lock);
54static DECLARE_RWSEM(umhelper_sem);
53 55
54#ifdef CONFIG_MODULES 56#ifdef CONFIG_MODULES
55 57
@@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work)
275 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 277 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
276 * (used for preventing user land processes from being created after the user 278 * (used for preventing user land processes from being created after the user
277 * land has been frozen during a system-wide hibernation or suspend operation). 279 * land has been frozen during a system-wide hibernation or suspend operation).
280 * Should always be manipulated under umhelper_sem acquired for write.
278 */ 281 */
279static int usermodehelper_disabled = 1; 282static int usermodehelper_disabled = 1;
280 283
@@ -282,17 +285,29 @@ static int usermodehelper_disabled = 1;
282static atomic_t running_helpers = ATOMIC_INIT(0); 285static atomic_t running_helpers = ATOMIC_INIT(0);
283 286
284/* 287/*
285 * Wait queue head used by usermodehelper_pm_callback() to wait for all running 288 * Wait queue head used by usermodehelper_disable() to wait for all running
286 * helpers to finish. 289 * helpers to finish.
287 */ 290 */
288static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 291static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
289 292
290/* 293/*
291 * Time to wait for running_helpers to become zero before the setting of 294 * Time to wait for running_helpers to become zero before the setting of
292 * usermodehelper_disabled in usermodehelper_pm_callback() fails 295 * usermodehelper_disabled in usermodehelper_disable() fails
293 */ 296 */
294#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 297#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
295 298
299void read_lock_usermodehelper(void)
300{
301 down_read(&umhelper_sem);
302}
303EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
304
305void read_unlock_usermodehelper(void)
306{
307 up_read(&umhelper_sem);
308}
309EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
310
296/** 311/**
297 * usermodehelper_disable - prevent new helpers from being started 312 * usermodehelper_disable - prevent new helpers from being started
298 */ 313 */
@@ -300,8 +315,10 @@ int usermodehelper_disable(void)
300{ 315{
301 long retval; 316 long retval;
302 317
318 down_write(&umhelper_sem);
303 usermodehelper_disabled = 1; 319 usermodehelper_disabled = 1;
304 smp_mb(); 320 up_write(&umhelper_sem);
321
305 /* 322 /*
306 * From now on call_usermodehelper_exec() won't start any new 323 * From now on call_usermodehelper_exec() won't start any new
307 * helpers, so it is sufficient if running_helpers turns out to 324 * helpers, so it is sufficient if running_helpers turns out to
@@ -314,7 +331,9 @@ int usermodehelper_disable(void)
314 if (retval) 331 if (retval)
315 return 0; 332 return 0;
316 333
334 down_write(&umhelper_sem);
317 usermodehelper_disabled = 0; 335 usermodehelper_disabled = 0;
336 up_write(&umhelper_sem);
318 return -EAGAIN; 337 return -EAGAIN;
319} 338}
320 339
@@ -323,7 +342,9 @@ int usermodehelper_disable(void)
323 */ 342 */
324void usermodehelper_enable(void) 343void usermodehelper_enable(void)
325{ 344{
345 down_write(&umhelper_sem);
326 usermodehelper_disabled = 0; 346 usermodehelper_disabled = 0;
347 up_write(&umhelper_sem);
327} 348}
328 349
329/** 350/**
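The new read_lock_usermodehelper()/read_unlock_usermodehelper() pair lets a caller test usermodehelper_is_disabled() and issue its request without racing against usermodehelper_disable(); the firmware loader is the intended user. A sketch of the caller pattern (the helper invocation is hypothetical):

static int request_helper_safely(void)
{
	int err;

	read_lock_usermodehelper();
	if (usermodehelper_is_disabled())
		err = -EBUSY;		/* fail fast instead of hanging */
	else
		err = run_my_helper();	/* hypothetical UMH-backed call */
	read_unlock_usermodehelper();

	return err;
}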
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5d84644823b..95dd7212e610 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2198 const char __user *user_buf, size_t count, loff_t *ppos) 2198 const char __user *user_buf, size_t count, loff_t *ppos)
2199{ 2199{
2200 char buf[32]; 2200 char buf[32];
2201 int buf_size; 2201 size_t buf_size;
2202 2202
2203 buf_size = min(count, (sizeof(buf)-1)); 2203 buf_size = min(count, (sizeof(buf)-1));
2204 if (copy_from_user(buf, user_buf, buf_size)) 2204 if (copy_from_user(buf, user_buf, buf_size))
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b6d216a92639..3d3de633702e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -59,6 +59,31 @@ int kthread_should_stop(void)
59EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
60 60
61/** 61/**
62 * kthread_freezable_should_stop - should this freezable kthread return now?
63 * @was_frozen: optional out parameter, indicates whether %current was frozen
64 *
65 * kthread_should_stop() for freezable kthreads, which will enter
66 * refrigerator if necessary. This function is safe from kthread_stop() /
67 * freezer deadlock and freezable kthreads should use this function instead
68 * of calling try_to_freeze() directly.
69 */
70bool kthread_freezable_should_stop(bool *was_frozen)
71{
72 bool frozen = false;
73
74 might_sleep();
75
76 if (unlikely(freezing(current)))
77 frozen = __refrigerator(true);
78
79 if (was_frozen)
80 *was_frozen = frozen;
81
82 return kthread_should_stop();
83}
84EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
85
86/**
62 * kthread_data - return data value specified on kthread creation 87 * kthread_data - return data value specified on kthread creation
63 * @task: kthread task in question 88 * @task: kthread task in question
64 * 89 *
@@ -257,7 +282,7 @@ int kthreadd(void *unused)
257 set_cpus_allowed_ptr(tsk, cpu_all_mask); 282 set_cpus_allowed_ptr(tsk, cpu_all_mask);
258 set_mems_allowed(node_states[N_HIGH_MEMORY]); 283 set_mems_allowed(node_states[N_HIGH_MEMORY]);
259 284
260 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 285 current->flags |= PF_NOFREEZE;
261 286
262 for (;;) { 287 for (;;) {
263 set_current_state(TASK_INTERRUPTIBLE); 288 set_current_state(TASK_INTERRUPTIBLE);
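Together with set_freezable() from kernel/freezer.c above, the new helper gives freezable kthreads a single loop condition that handles both freezing and stopping. The expected main-loop shape (illustrative sketch; names are hypothetical):

static int my_worker(void *unused)
{
	set_freezable();	/* kthreads start PF_NOFREEZE; opt in here */

	while (!kthread_freezable_should_stop(NULL)) {
		/* ... one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}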
diff --git a/kernel/module.c b/kernel/module.c
index 178333c48d1e..2c932760fd33 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -62,12 +62,6 @@
62#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
63#include <trace/events/module.h> 63#include <trace/events/module.h>
64 64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(fmt , a...)
69#endif
70
71#ifndef ARCH_SHF_SMALL 65#ifndef ARCH_SHF_SMALL
72#define ARCH_SHF_SMALL 0 66#define ARCH_SHF_SMALL 0
73#endif 67#endif
@@ -138,7 +132,6 @@ struct load_info {
138 unsigned long len; 132 unsigned long len;
139 Elf_Shdr *sechdrs; 133 Elf_Shdr *sechdrs;
140 char *secstrings, *strtab; 134 char *secstrings, *strtab;
141 unsigned long *strmap;
142 unsigned long symoffs, stroffs; 135 unsigned long symoffs, stroffs;
143 struct _ddebug *debug; 136 struct _ddebug *debug;
144 unsigned int num_debug; 137 unsigned int num_debug;
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name,
410 return fsa.sym; 403 return fsa.sym;
411 } 404 }
412 405
413 DEBUGP("Failed to find symbol %s\n", name); 406 pr_debug("Failed to find symbol %s\n", name);
414 return NULL; 407 return NULL;
415} 408}
416EXPORT_SYMBOL_GPL(find_symbol); 409EXPORT_SYMBOL_GPL(find_symbol);
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b)
600 593
601 list_for_each_entry(use, &b->source_list, source_list) { 594 list_for_each_entry(use, &b->source_list, source_list) {
602 if (use->source == a) { 595 if (use->source == a) {
603 DEBUGP("%s uses %s!\n", a->name, b->name); 596 pr_debug("%s uses %s!\n", a->name, b->name);
604 return 1; 597 return 1;
605 } 598 }
606 } 599 }
607 DEBUGP("%s does not use %s!\n", a->name, b->name); 600 pr_debug("%s does not use %s!\n", a->name, b->name);
608 return 0; 601 return 0;
609} 602}
610 603
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b)
619{ 612{
620 struct module_use *use; 613 struct module_use *use;
621 614
622 DEBUGP("Allocating new usage for %s.\n", a->name); 615 pr_debug("Allocating new usage for %s.\n", a->name);
623 use = kmalloc(sizeof(*use), GFP_ATOMIC); 616 use = kmalloc(sizeof(*use), GFP_ATOMIC);
624 if (!use) { 617 if (!use) {
625 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 618 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod)
663 mutex_lock(&module_mutex); 656 mutex_lock(&module_mutex);
664 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { 657 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
665 struct module *i = use->target; 658 struct module *i = use->target;
666 DEBUGP("%s unusing %s\n", mod->name, i->name); 659 pr_debug("%s unusing %s\n", mod->name, i->name);
667 module_put(i); 660 module_put(i);
668 list_del(&use->source_list); 661 list_del(&use->source_list);
669 list_del(&use->target_list); 662 list_del(&use->target_list);
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
726 } 719 }
727} 720}
728 721
729unsigned int module_refcount(struct module *mod) 722unsigned long module_refcount(struct module *mod)
730{ 723{
731 unsigned int incs = 0, decs = 0; 724 unsigned long incs = 0, decs = 0;
732 int cpu; 725 int cpu;
733 726
734 for_each_possible_cpu(cpu) 727 for_each_possible_cpu(cpu)
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod)
761 /* Since we might sleep for some time, release the mutex first */ 754 /* Since we might sleep for some time, release the mutex first */
762 mutex_unlock(&module_mutex); 755 mutex_unlock(&module_mutex);
763 for (;;) { 756 for (;;) {
764 DEBUGP("Looking at refcount...\n"); 757 pr_debug("Looking at refcount...\n");
765 set_current_state(TASK_UNINTERRUPTIBLE); 758 set_current_state(TASK_UNINTERRUPTIBLE);
766 if (module_refcount(mod) == 0) 759 if (module_refcount(mod) == 0)
767 break; 760 break;
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
804 if (mod->state != MODULE_STATE_LIVE) { 797 if (mod->state != MODULE_STATE_LIVE) {
805 /* FIXME: if (force), slam module count and wake up 798 /* FIXME: if (force), slam module count and wake up
806 waiter --RR */ 799 waiter --RR */
807 DEBUGP("%s already dying\n", mod->name); 800 pr_debug("%s already dying\n", mod->name);
808 ret = -EBUSY; 801 ret = -EBUSY;
809 goto out; 802 goto out;
810 } 803 }
@@ -854,7 +847,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
854 struct module_use *use; 847 struct module_use *use;
855 int printed_something = 0; 848 int printed_something = 0;
856 849
857 seq_printf(m, " %u ", module_refcount(mod)); 850 seq_printf(m, " %lu ", module_refcount(mod));
858 851
859 /* Always include a trailing , so userspace can differentiate 852 /* Always include a trailing , so userspace can differentiate
860 between this and the old multi-field proc format. */ 853 between this and the old multi-field proc format. */
@@ -904,13 +897,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
904static ssize_t show_refcnt(struct module_attribute *mattr, 897static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module_kobject *mk, char *buffer) 898 struct module_kobject *mk, char *buffer)
906{ 899{
907 return sprintf(buffer, "%u\n", module_refcount(mk->mod)); 900 return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
908} 901}
909 902
910static struct module_attribute refcnt = { 903static struct module_attribute modinfo_refcnt =
911 .attr = { .name = "refcnt", .mode = 0444 }, 904 __ATTR(refcnt, 0444, show_refcnt, NULL);
912 .show = show_refcnt,
913};
914 905
915void module_put(struct module *module) 906void module_put(struct module *module)
916{ 907{
@@ -951,6 +942,26 @@ static inline int module_unload_init(struct module *mod)
951} 942}
952#endif /* CONFIG_MODULE_UNLOAD */ 943#endif /* CONFIG_MODULE_UNLOAD */
953 944
945static size_t module_flags_taint(struct module *mod, char *buf)
946{
947 size_t l = 0;
948
949 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
950 buf[l++] = 'P';
951 if (mod->taints & (1 << TAINT_OOT_MODULE))
952 buf[l++] = 'O';
953 if (mod->taints & (1 << TAINT_FORCED_MODULE))
954 buf[l++] = 'F';
955 if (mod->taints & (1 << TAINT_CRAP))
956 buf[l++] = 'C';
957 /*
958 * TAINT_FORCED_RMMOD: could be added.
959 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
960 * apply to modules.
961 */
962 return l;
963}
964
954static ssize_t show_initstate(struct module_attribute *mattr, 965static ssize_t show_initstate(struct module_attribute *mattr,
955 struct module_kobject *mk, char *buffer) 966 struct module_kobject *mk, char *buffer)
956{ 967{
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
970 return sprintf(buffer, "%s\n", state); 981 return sprintf(buffer, "%s\n", state);
971} 982}
972 983
973static struct module_attribute initstate = { 984static struct module_attribute modinfo_initstate =
974 .attr = { .name = "initstate", .mode = 0444 }, 985 __ATTR(initstate, 0444, show_initstate, NULL);
975 .show = show_initstate,
976};
977 986
978static ssize_t store_uevent(struct module_attribute *mattr, 987static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk, 988 struct module_kobject *mk,
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr,
986 return count; 995 return count;
987} 996}
988 997
989struct module_attribute module_uevent = { 998struct module_attribute module_uevent =
990 .attr = { .name = "uevent", .mode = 0200 }, 999 __ATTR(uevent, 0200, NULL, store_uevent);
991 .store = store_uevent, 1000
992}; 1001static ssize_t show_coresize(struct module_attribute *mattr,
1002 struct module_kobject *mk, char *buffer)
1003{
1004 return sprintf(buffer, "%u\n", mk->mod->core_size);
1005}
1006
1007static struct module_attribute modinfo_coresize =
1008 __ATTR(coresize, 0444, show_coresize, NULL);
1009
1010static ssize_t show_initsize(struct module_attribute *mattr,
1011 struct module_kobject *mk, char *buffer)
1012{
1013 return sprintf(buffer, "%u\n", mk->mod->init_size);
1014}
1015
1016static struct module_attribute modinfo_initsize =
1017 __ATTR(initsize, 0444, show_initsize, NULL);
1018
1019static ssize_t show_taint(struct module_attribute *mattr,
1020 struct module_kobject *mk, char *buffer)
1021{
1022 size_t l;
1023
1024 l = module_flags_taint(mk->mod, buffer);
1025 buffer[l++] = '\n';
1026 return l;
1027}
1028
1029static struct module_attribute modinfo_taint =
1030 __ATTR(taint, 0444, show_taint, NULL);
993 1031
994static struct module_attribute *modinfo_attrs[] = { 1032static struct module_attribute *modinfo_attrs[] = {
1033 &module_uevent,
995 &modinfo_version, 1034 &modinfo_version,
996 &modinfo_srcversion, 1035 &modinfo_srcversion,
997 &initstate, 1036 &modinfo_initstate,
998 &module_uevent, 1037 &modinfo_coresize,
1038 &modinfo_initsize,
1039 &modinfo_taint,
999#ifdef CONFIG_MODULE_UNLOAD 1040#ifdef CONFIG_MODULE_UNLOAD
1000 &refcnt, 1041 &modinfo_refcnt,
1001#endif 1042#endif
1002 NULL, 1043 NULL,
1003}; 1044};
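
After this hunk every loaded module exposes coresize, initsize and taint under /sys/module/<name>/, alongside the existing refcnt and initstate. A small userspace sketch (hypothetical helper, not part of this patch) that reads the new coresize attribute:

        #include <stdio.h>

        /* Print the core allocation size of a loaded module. */
        static int print_coresize(const char *mod)
        {
                char path[128];
                unsigned long size;
                FILE *f;

                snprintf(path, sizeof(path), "/sys/module/%s/coresize", mod);
                f = fopen(path, "r");
                if (!f)
                        return -1;
                if (fscanf(f, "%lu", &size) == 1)
                        printf("%s: core %lu bytes\n", mod, size);
                fclose(f);
                return 0;
        }

        int main(void)
        {
                return print_coresize("ext4");  /* "ext4" is just an example */
        }
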
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs,
1057 1098
1058 if (versions[i].crc == maybe_relocated(*crc, crc_owner)) 1099 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 1100 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 1101 pr_debug("Found checksum %lX vs module %lX\n",
1061 maybe_relocated(*crc, crc_owner), versions[i].crc); 1102 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 1103 goto bad_version;
1063 } 1104 }
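
From here on the patch converts module.c's private DEBUGP() macro to pr_debug(), which compiles away unless DEBUG is defined and can be toggled at runtime under CONFIG_DYNAMIC_DEBUG. The conventional companion, sketched below, is a pr_fmt() definition before the first #include so every message is prefixed consistently (crc_a/crc_b stand in for the real arguments):

        #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
        #include <linux/kernel.h>

        pr_debug("Found checksum %lX vs module %lX\n", crc_a, crc_b);

        /* With dynamic debug, messages from this file can be enabled at
         * runtime (assuming debugfs is mounted in the usual place):
         *   echo 'file module.c +p' > /sys/kernel/debug/dynamic_debug/control
         */
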
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1834 case SHN_COMMON: 1875 case SHN_COMMON:
1835 /* We compiled with -fno-common. These are not 1876 /* We compiled with -fno-common. These are not
1836 supposed to happen. */ 1877 supposed to happen. */
1837 DEBUGP("Common symbol: %s\n", name); 1878 pr_debug("Common symbol: %s\n", name);
1838 printk("%s: please compile with -fno-common\n", 1879 printk("%s: please compile with -fno-common\n",
1839 mod->name); 1880 mod->name);
1840 ret = -ENOEXEC; 1881 ret = -ENOEXEC;
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1842 1883
1843 case SHN_ABS: 1884 case SHN_ABS:
1844 /* Don't need to do anything */ 1885 /* Don't need to do anything */
1845 DEBUGP("Absolute symbol: 0x%08lx\n", 1886 pr_debug("Absolute symbol: 0x%08lx\n",
1846 (long)sym[i].st_value); 1887 (long)sym[i].st_value);
1847 break; 1888 break;
1848 1889
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1966 for (i = 0; i < info->hdr->e_shnum; i++) 2007 for (i = 0; i < info->hdr->e_shnum; i++)
1967 info->sechdrs[i].sh_entsize = ~0UL; 2008 info->sechdrs[i].sh_entsize = ~0UL;
1968 2009
1969 DEBUGP("Core section allocation order:\n"); 2010 pr_debug("Core section allocation order:\n");
1970 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2011 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1971 for (i = 0; i < info->hdr->e_shnum; ++i) { 2012 for (i = 0; i < info->hdr->e_shnum; ++i) {
1972 Elf_Shdr *s = &info->sechdrs[i]; 2013 Elf_Shdr *s = &info->sechdrs[i];
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1978 || strstarts(sname, ".init")) 2019 || strstarts(sname, ".init"))
1979 continue; 2020 continue;
1980 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 2021 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1981 DEBUGP("\t%s\n", name); 2022 pr_debug("\t%s\n", sname);
1982 } 2023 }
1983 switch (m) { 2024 switch (m) {
1984 case 0: /* executable */ 2025 case 0: /* executable */
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1995 } 2036 }
1996 } 2037 }
1997 2038
1998 DEBUGP("Init section allocation order:\n"); 2039 pr_debug("Init section allocation order:\n");
1999 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2040 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2000 for (i = 0; i < info->hdr->e_shnum; ++i) { 2041 for (i = 0; i < info->hdr->e_shnum; ++i) {
2001 Elf_Shdr *s = &info->sechdrs[i]; 2042 Elf_Shdr *s = &info->sechdrs[i];
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2008 continue; 2049 continue;
2009 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 2050 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
2010 | INIT_OFFSET_MASK); 2051 | INIT_OFFSET_MASK);
2011 DEBUGP("\t%s\n", sname); 2052 pr_debug("\t%s\n", sname);
2012 } 2053 }
2013 switch (m) { 2054 switch (m) {
2014 case 0: /* executable */ 2055 case 0: /* executable */
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2178 return true; 2219 return true;
2179} 2220}
2180 2221
2222/*
2223 * We only allocate and copy the strings needed by the parts of symtab
2224 * we keep. This is simple, but has the effect of making multiple
2225 * copies of duplicates. We could be more sophisticated, see
2226 * linux-kernel thread starting with
2227 * <73defb5e4bca04a6431392cc341112b1@localhost>.
2228 */
2181static void layout_symtab(struct module *mod, struct load_info *info) 2229static void layout_symtab(struct module *mod, struct load_info *info)
2182{ 2230{
2183 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2231 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2184 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2232 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2185 const Elf_Sym *src; 2233 const Elf_Sym *src;
2186 unsigned int i, nsrc, ndst; 2234 unsigned int i, nsrc, ndst, strtab_size;
2187 2235
2188 /* Put symbol section at end of init part of module. */ 2236 /* Put symbol section at end of init part of module. */
2189 symsect->sh_flags |= SHF_ALLOC; 2237 symsect->sh_flags |= SHF_ALLOC;
2190 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 2238 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
2191 info->index.sym) | INIT_OFFSET_MASK; 2239 info->index.sym) | INIT_OFFSET_MASK;
2192 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); 2240 pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
2193 2241
2194 src = (void *)info->hdr + symsect->sh_offset; 2242 src = (void *)info->hdr + symsect->sh_offset;
2195 nsrc = symsect->sh_size / sizeof(*src); 2243 nsrc = symsect->sh_size / sizeof(*src);
2196 for (ndst = i = 1; i < nsrc; ++i, ++src)
2197 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2198 unsigned int j = src->st_name;
2199 2244
2200 while (!__test_and_set_bit(j, info->strmap) 2245 /* Compute total space required for the core symbols' strtab. */
2201 && info->strtab[j]) 2246 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
2202 ++j; 2247 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2203 ++ndst; 2248 strtab_size += strlen(&info->strtab[src->st_name]) + 1;
2249 ndst++;
2204 } 2250 }
2205 2251
2206 /* Append room for core symbols at end of core part. */ 2252 /* Append room for core symbols at end of core part. */
2207 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2253 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
2208 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); 2254 info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
2255 mod->core_size += strtab_size;
2209 2256
2210 /* Put string table section at end of init part of module. */ 2257 /* Put string table section at end of init part of module. */
2211 strsect->sh_flags |= SHF_ALLOC; 2258 strsect->sh_flags |= SHF_ALLOC;
2212 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 2259 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
2213 info->index.str) | INIT_OFFSET_MASK; 2260 info->index.str) | INIT_OFFSET_MASK;
2214 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); 2261 pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
2215
2216 /* Append room for core symbols' strings at end of core part. */
2217 info->stroffs = mod->core_size;
2218 __set_bit(0, info->strmap);
2219 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
2220} 2262}
2221 2263
2222static void add_kallsyms(struct module *mod, const struct load_info *info) 2264static void add_kallsyms(struct module *mod, const struct load_info *info)
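
The rewritten layout_symtab() drops the strmap bitmap that previously tracked which string-table bytes were referenced. It now sizes the core string table directly by summing the lengths of the strings of the symbols it keeps; as the new comment notes, duplicate names are stored more than once in exchange for the simpler bookkeeping. The sizing logic in isolation (sketch):

        /* Index 0 of an ELF string table is the mandatory empty string,
         * hence the initial size of 1.
         */
        for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
                if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
                        strtab_size += strlen(&info->strtab[src->st_name]) + 1;
                        ndst++;
                }
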
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2237 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); 2279 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2238 2280
2239 mod->core_symtab = dst = mod->module_core + info->symoffs; 2281 mod->core_symtab = dst = mod->module_core + info->symoffs;
2282 mod->core_strtab = s = mod->module_core + info->stroffs;
2240 src = mod->symtab; 2283 src = mod->symtab;
2241 *dst = *src; 2284 *dst = *src;
2285 *s++ = 0;
2242 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2286 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2243 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2287 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2244 continue; 2288 continue;
2289
2245 dst[ndst] = *src; 2290 dst[ndst] = *src;
2246 dst[ndst].st_name = bitmap_weight(info->strmap, 2291 dst[ndst++].st_name = s - mod->core_strtab;
2247 dst[ndst].st_name); 2292 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
2248 ++ndst;
2249 } 2293 }
2250 mod->core_num_syms = ndst; 2294 mod->core_num_syms = ndst;
2251
2252 mod->core_strtab = s = mod->module_core + info->stroffs;
2253 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2254 if (test_bit(i, info->strmap))
2255 *++s = mod->strtab[i];
2256} 2295}
2257#else 2296#else
2258static inline void layout_symtab(struct module *mod, struct load_info *info) 2297static inline void layout_symtab(struct module *mod, struct load_info *info)
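
add_kallsyms() mirrors the change: instead of replaying the bitmap to compact the string table, it copies each kept symbol's name with strlcpy() and uses the return value (the length of the source string) to advance the write cursor past the copy and its terminating NUL. The string handling of the copy loop, reduced to a sketch:

        s = mod->core_strtab;
        *s++ = 0;                                   /* index 0: empty string */

        /* per kept symbol: */
        dst[ndst].st_name = s - mod->core_strtab;   /* offset of the new copy */
        s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;

st_name offsets are assigned as the strings are laid down, so the old bitmap_weight() recomputation disappears as well.
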
@@ -2621,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info)
2621 mod->module_init = ptr; 2660 mod->module_init = ptr;
2622 2661
2623 /* Transfer each section which specifies SHF_ALLOC */ 2662 /* Transfer each section which specifies SHF_ALLOC */
2624 DEBUGP("final section addresses:\n"); 2663 pr_debug("final section addresses:\n");
2625 for (i = 0; i < info->hdr->e_shnum; i++) { 2664 for (i = 0; i < info->hdr->e_shnum; i++) {
2626 void *dest; 2665 void *dest;
2627 Elf_Shdr *shdr = &info->sechdrs[i]; 2666 Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2639,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info)
2639 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); 2678 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2640 /* Update sh_addr to point to copy in image. */ 2679 /* Update sh_addr to point to copy in image. */
2641 shdr->sh_addr = (unsigned long)dest; 2680 shdr->sh_addr = (unsigned long)dest;
2642 DEBUGP("\t0x%lx %s\n", 2681 pr_debug("\t0x%lx %s\n",
2643 shdr->sh_addr, info->secstrings + shdr->sh_name); 2682 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
2644 } 2683 }
2645 2684
2646 return 0; 2685 return 0;
@@ -2742,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info)
2742 this is done generically; there doesn't appear to be any 2781 this is done generically; there doesn't appear to be any
2743 special cases for the architectures. */ 2782 special cases for the architectures. */
2744 layout_sections(mod, info); 2783 layout_sections(mod, info);
2745
2746 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2747 * sizeof(long), GFP_KERNEL);
2748 if (!info->strmap) {
2749 err = -ENOMEM;
2750 goto free_percpu;
2751 }
2752 layout_symtab(mod, info); 2784 layout_symtab(mod, info);
2753 2785
2754 /* Allocate and move to the final place */ 2786 /* Allocate and move to the final place */
2755 err = move_module(mod, info); 2787 err = move_module(mod, info);
2756 if (err) 2788 if (err)
2757 goto free_strmap; 2789 goto free_percpu;
2758 2790
2759 /* Module has been copied to its final place now: return it. */ 2791 /* Module has been copied to its final place now: return it. */
2760 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2792 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2761 kmemleak_load_module(mod, info); 2793 kmemleak_load_module(mod, info);
2762 return mod; 2794 return mod;
2763 2795
2764free_strmap:
2765 kfree(info->strmap);
2766free_percpu: 2796free_percpu:
2767 percpu_modfree(mod); 2797 percpu_modfree(mod);
2768out: 2798out:
@@ -2772,7 +2802,6 @@ out:
2772/* mod is no longer valid after this! */ 2802/* mod is no longer valid after this! */
2773static void module_deallocate(struct module *mod, struct load_info *info) 2803static void module_deallocate(struct module *mod, struct load_info *info)
2774{ 2804{
2775 kfree(info->strmap);
2776 percpu_modfree(mod); 2805 percpu_modfree(mod);
2777 module_free(mod, mod->module_init); 2806 module_free(mod, mod->module_init);
2778 module_free(mod, mod->module_core); 2807 module_free(mod, mod->module_core);
@@ -2811,7 +2840,7 @@ static struct module *load_module(void __user *umod,
2811 struct module *mod; 2840 struct module *mod;
2812 long err; 2841 long err;
2813 2842
2814 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2843 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
2815 umod, len, uargs); 2844 umod, len, uargs);
2816 2845
2817 /* Copy in the blobs from userspace, check they are vaguely sane. */ 2846 /* Copy in the blobs from userspace, check they are vaguely sane. */
@@ -2902,8 +2931,7 @@ static struct module *load_module(void __user *umod,
2902 if (err < 0) 2931 if (err < 0)
2903 goto unlink; 2932 goto unlink;
2904 2933
2905 /* Get rid of temporary copy and strmap. */ 2934 /* Get rid of temporary copy. */
2906 kfree(info.strmap);
2907 free_copy(&info); 2935 free_copy(&info);
2908 2936
2909 /* Done! */ 2937 /* Done! */
@@ -3256,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf)
3256 mod->state == MODULE_STATE_GOING || 3284 mod->state == MODULE_STATE_GOING ||
3257 mod->state == MODULE_STATE_COMING) { 3285 mod->state == MODULE_STATE_COMING) {
3258 buf[bx++] = '('; 3286 buf[bx++] = '(';
3259 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) 3287 bx += module_flags_taint(mod, buf + bx);
3260 buf[bx++] = 'P';
3261 else if (mod->taints & (1 << TAINT_OOT_MODULE))
3262 buf[bx++] = 'O';
3263 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3264 buf[bx++] = 'F';
3265 if (mod->taints & (1 << TAINT_CRAP))
3266 buf[bx++] = 'C';
3267 /*
3268 * TAINT_FORCED_RMMOD: could be added.
3269 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
3270 * apply to modules.
3271 */
3272
3273 /* Show a - for module-is-being-unloaded */ 3288 /* Show a - for module-is-being-unloaded */
3274 if (mod->state == MODULE_STATE_GOING) 3289 if (mod->state == MODULE_STATE_GOING)
3275 buf[bx++] = '-'; 3290 buf[bx++] = '-';
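
Beyond the deduplication, routing /proc/modules through module_flags_taint() carries a subtle behavioral fix. Compare the two versions (sketch):

        /* Old /proc/modules logic: */
        if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
                buf[bx++] = 'P';
        else if (mod->taints & (1 << TAINT_OOT_MODULE))
                buf[bx++] = 'O';

        /* module_flags_taint() tests each bit independently, so a module
         * that is both proprietary and out-of-tree now reads "PO"
         * instead of just "P".
         */
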
diff --git a/kernel/panic.c b/kernel/panic.c
index 3458469eb7c3..80aed44e345a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
49long (*panic_blink)(int state); 49long (*panic_blink)(int state);
50EXPORT_SYMBOL(panic_blink); 50EXPORT_SYMBOL(panic_blink);
51 51
52/*
53 * Stop ourselves in panic -- architecture code may override this
54 */
55void __weak panic_smp_self_stop(void)
56{
57 while (1)
58 cpu_relax();
59}
60
52/** 61/**
53 * panic - halt the system 62 * panic - halt the system
54 * @fmt: The text string to print 63 * @fmt: The text string to print
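
panic_smp_self_stop() is a __weak symbol, so an architecture can replace the default busy-wait simply by providing a non-weak definition; the linker prefers the strong symbol. A hedged sketch of what such an override might look like (arch_halt_cpu() is a stand-in for whatever low-power wait the architecture provides):

        /* arch/<arch>/kernel/process.c -- hypothetical override */
        void panic_smp_self_stop(void)
        {
                local_irq_disable();
                while (1)
                        arch_halt_cpu();    /* stand-in low-power wait */
        }
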
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
57 * 66 *
58 * This function never returns. 67 * This function never returns.
59 */ 68 */
60NORET_TYPE void panic(const char * fmt, ...) 69void panic(const char *fmt, ...)
61{ 70{
71 static DEFINE_SPINLOCK(panic_lock);
62 static char buf[1024]; 72 static char buf[1024];
63 va_list args; 73 va_list args;
64 long i, i_next = 0; 74 long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
68 * It's possible to come here directly from a panic-assertion and 78 * It's possible to come here directly from a panic-assertion and
69 * not have preempt disabled. Some functions called from here want 79 * not have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though... 80 * preempt to be disabled. No point enabling it later though...
81 *
82 * Only one CPU is allowed to execute the panic code from here. For
83 * multiple parallel invocations of panic, all other CPUs either
84 * stop themselves or wait until they are stopped by the first CPU
85 * with smp_send_stop().
71 */ 86 */
72 preempt_disable(); 87 if (!spin_trylock(&panic_lock))
88 panic_smp_self_stop();
73 89
74 console_verbose(); 90 console_verbose();
75 bust_spinlocks(1); 91 bust_spinlocks(1);
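
The panic_lock trylock is the classic first-CPU-wins gate: the spinlock is taken once and deliberately never released, so any other CPU that panics concurrently fails the trylock and parks itself in panic_smp_self_stop() instead of interleaving a second panic over the first. Reduced to its skeleton:

        static DEFINE_SPINLOCK(panic_lock);

        if (!spin_trylock(&panic_lock))
                panic_smp_self_stop();      /* losing CPUs stop here forever */
        /* winner proceeds; the lock is intentionally never unlocked */
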
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)
78 va_end(args); 94 va_end(args);
79 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 95 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
80#ifdef CONFIG_DEBUG_BUGVERBOSE 96#ifdef CONFIG_DEBUG_BUGVERBOSE
81 dump_stack(); 97 /*
98 * Avoid nested stack-dumping if a panic occurs during oops processing
99 */
100 if (!oops_in_progress)
101 dump_stack();
82#endif 102#endif
83 103
84 /* 104 /*
diff --git a/kernel/params.c b/kernel/params.c
index 65aae11eb93f..32ee04308285 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,12 +25,6 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27 27
28#if 0
29#define DEBUGP printk
30#else
31#define DEBUGP(fmt, a...)
32#endif
33
34/* Protects all parameters, and incidentally kmalloced_param list. */ 28/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock); 29static DEFINE_MUTEX(param_lock);
36 30
@@ -105,7 +99,7 @@ static int parse_one(char *param,
105 /* No one handled NULL, so do it here. */ 99 /* No one handled NULL, so do it here. */
106 if (!val && params[i].ops->set != param_set_bool) 100 if (!val && params[i].ops->set != param_set_bool)
107 return -EINVAL; 101 return -EINVAL;
108 DEBUGP("They are equal! Calling %p\n", 102 pr_debug("They are equal! Calling %p\n",
109 params[i].ops->set); 103 params[i].ops->set);
110 mutex_lock(&param_lock); 104 mutex_lock(&param_lock);
111 err = params[i].ops->set(val, &params[i]); 105 err = params[i].ops->set(val, &params[i]);
@@ -115,11 +109,11 @@ static int parse_one(char *param,
115 } 109 }
116 110
117 if (handle_unknown) { 111 if (handle_unknown) {
118 DEBUGP("Unknown argument: calling %p\n", handle_unknown); 112 pr_debug("Unknown argument: calling %p\n", handle_unknown);
119 return handle_unknown(param, val); 113 return handle_unknown(param, val);
120 } 114 }
121 115
122 DEBUGP("Unknown argument `%s'\n", param); 116 pr_debug("Unknown argument `%s'\n", param);
123 return -ENOENT; 117 return -ENOENT;
124} 118}
125 119
@@ -184,7 +178,7 @@ int parse_args(const char *name,
184{ 178{
185 char *param, *val; 179 char *param, *val;
186 180
187 DEBUGP("Parsing ARGS: %s\n", args); 181 pr_debug("Parsing ARGS: %s\n", args);
188 182
189 /* Chew leading spaces */ 183 /* Chew leading spaces */
190 args = skip_spaces(args); 184 args = skip_spaces(args);
@@ -369,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = {
369}; 363};
370EXPORT_SYMBOL(param_ops_invbool); 364EXPORT_SYMBOL(param_ops_invbool);
371 365
366int param_set_bint(const char *val, const struct kernel_param *kp)
367{
368 struct kernel_param boolkp;
369 bool v;
370 int ret;
371
372 /* Match bool exactly, by re-using it. */
373 boolkp = *kp;
374 boolkp.arg = &v;
375 boolkp.flags |= KPARAM_ISBOOL;
376
377 ret = param_set_bool(val, &boolkp);
378 if (ret == 0)
379 *(int *)kp->arg = v;
380 return ret;
381}
382EXPORT_SYMBOL(param_set_bint);
383
384struct kernel_param_ops param_ops_bint = {
385 .set = param_set_bint,
386 .get = param_get_int,
387};
388EXPORT_SYMBOL(param_ops_bint);
389
372/* We break the rule and mangle the string. */ 390/* We break the rule and mangle the string. */
373static int param_array(const char *name, 391static int param_array(const char *name,
374 const char *val, 392 const char *val,
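
The new 'bint' parameter type parses like a bool (it reuses param_set_bool(), so it accepts y/n/Y/N/1/0 and rejects anything else) but stores the result in a plain int, letting existing int-typed flags gain strict bool syntax without a type change. Usage sketch (hypothetical module parameter):

        static int use_feature = 1;

        /* Parsed with bool semantics, stored as an int. */
        module_param(use_feature, bint, 0444);
        MODULE_PARM_DESC(use_feature, "enable the feature");
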
diff --git a/kernel/pid.c b/kernel/pid.c
index fa5f72227e5f..ce8e00deaccb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
137} 137}
138 138
139/* 139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid. 140 * We might be racing with someone else trying to set pid_ns->last_pid
141 * at the pid allocation time (there's also a sysctl for this, but racing
142 * with this one is OK, see comment in kernel/pid_namespace.c about it).
141 * We want the winner to have the "later" value, because if the 143 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately. 144 * "earlier" value prevails, then a pid may get reused immediately.
143 * 145 *
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca6..a8968396046d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
191 return; 191 return;
192} 192}
193 193
194static int pid_ns_ctl_handler(struct ctl_table *table, int write,
195 void __user *buffer, size_t *lenp, loff_t *ppos)
196{
197 struct ctl_table tmp = *table;
198
199 if (write && !capable(CAP_SYS_ADMIN))
200 return -EPERM;
201
202 /*
203 * Writing directly to ns' last_pid field is OK, since this field
204 * is volatile in a living namespace anyway and code writing to
205 * it should synchronize its usage with external means.
206 */
207
208 tmp.data = &current->nsproxy->pid_ns->last_pid;
209 return proc_dointvec(&tmp, write, buffer, lenp, ppos);
210}
211
212static struct ctl_table pid_ns_ctl_table[] = {
213 {
214 .procname = "ns_last_pid",
215 .maxlen = sizeof(int),
216 .mode = 0666, /* permissions are checked in the handler */
217 .proc_handler = pid_ns_ctl_handler,
218 },
219 { }
220};
221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223
194static __init int pid_namespaces_init(void) 224static __init int pid_namespaces_init(void)
195{ 225{
196 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
227 register_sysctl_paths(kern_path, pid_ns_ctl_table);
197 return 0; 228 return 0;
198} 229}
199 230
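
The resulting /proc/sys/kernel/ns_last_pid file reads and writes the pid namespace of the calling process, which is what checkpoint/restore tools need to recreate tasks with predetermined pids: writing N makes the next fork() in that namespace try pid N+1, subject to the race noted in kernel/pid.c above. A userspace sketch, assuming CAP_SYS_ADMIN in the target namespace:

        #include <stdio.h>
        #include <stdlib.h>

        /* Steer the current pid namespace's allocator. */
        static int set_ns_last_pid(int last)
        {
                FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");

                if (!f)
                        return -1;
                fprintf(f, "%d", last);
                return fclose(f);
        }

        int main(int argc, char **argv)
        {
                return (argc > 1) ? set_ns_last_pid(atoi(argv[1])) : -1;
        }
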
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a6b0503574ee..6d6d28870335 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -43,8 +43,6 @@ int in_suspend __nosavedata;
43enum { 43enum {
44 HIBERNATION_INVALID, 44 HIBERNATION_INVALID,
45 HIBERNATION_PLATFORM, 45 HIBERNATION_PLATFORM,
46 HIBERNATION_TEST,
47 HIBERNATION_TESTPROC,
48 HIBERNATION_SHUTDOWN, 46 HIBERNATION_SHUTDOWN,
49 HIBERNATION_REBOOT, 47 HIBERNATION_REBOOT,
50 /* keep last */ 48 /* keep last */
@@ -55,7 +53,7 @@ enum {
55 53
56static int hibernation_mode = HIBERNATION_SHUTDOWN; 54static int hibernation_mode = HIBERNATION_SHUTDOWN;
57 55
58static bool freezer_test_done; 56bool freezer_test_done;
59 57
60static const struct platform_hibernation_ops *hibernation_ops; 58static const struct platform_hibernation_ops *hibernation_ops;
61 59
@@ -71,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
71 WARN_ON(1); 69 WARN_ON(1);
72 return; 70 return;
73 } 71 }
74 mutex_lock(&pm_mutex); 72 lock_system_sleep();
75 hibernation_ops = ops; 73 hibernation_ops = ops;
76 if (ops) 74 if (ops)
77 hibernation_mode = HIBERNATION_PLATFORM; 75 hibernation_mode = HIBERNATION_PLATFORM;
78 else if (hibernation_mode == HIBERNATION_PLATFORM) 76 else if (hibernation_mode == HIBERNATION_PLATFORM)
79 hibernation_mode = HIBERNATION_SHUTDOWN; 77 hibernation_mode = HIBERNATION_SHUTDOWN;
80 78
81 mutex_unlock(&pm_mutex); 79 unlock_system_sleep();
82} 80}
83 81
84static bool entering_platform_hibernation; 82static bool entering_platform_hibernation;
@@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void)
96 mdelay(5000); 94 mdelay(5000);
97} 95}
98 96
99static int hibernation_testmode(int mode)
100{
101 if (hibernation_mode == mode) {
102 hibernation_debug_sleep();
103 return 1;
104 }
105 return 0;
106}
107
108static int hibernation_test(int level) 97static int hibernation_test(int level)
109{ 98{
110 if (pm_test_level == level) { 99 if (pm_test_level == level) {
@@ -114,7 +103,6 @@ static int hibernation_test(int level)
114 return 0; 103 return 0;
115} 104}
116#else /* !CONFIG_PM_DEBUG */ 105#else /* !CONFIG_PM_DEBUG */
117static int hibernation_testmode(int mode) { return 0; }
118static int hibernation_test(int level) { return 0; } 106static int hibernation_test(int level) { return 0; }
119#endif /* !CONFIG_PM_DEBUG */ 107#endif /* !CONFIG_PM_DEBUG */
120 108
@@ -278,8 +266,7 @@ static int create_image(int platform_mode)
278 goto Platform_finish; 266 goto Platform_finish;
279 267
280 error = disable_nonboot_cpus(); 268 error = disable_nonboot_cpus();
281 if (error || hibernation_test(TEST_CPUS) 269 if (error || hibernation_test(TEST_CPUS))
282 || hibernation_testmode(HIBERNATION_TEST))
283 goto Enable_cpus; 270 goto Enable_cpus;
284 271
285 local_irq_disable(); 272 local_irq_disable();
@@ -333,7 +320,7 @@ static int create_image(int platform_mode)
333 */ 320 */
334int hibernation_snapshot(int platform_mode) 321int hibernation_snapshot(int platform_mode)
335{ 322{
336 pm_message_t msg = PMSG_RECOVER; 323 pm_message_t msg;
337 int error; 324 int error;
338 325
339 error = platform_begin(platform_mode); 326 error = platform_begin(platform_mode);
@@ -349,8 +336,7 @@ int hibernation_snapshot(int platform_mode)
349 if (error) 336 if (error)
350 goto Cleanup; 337 goto Cleanup;
351 338
352 if (hibernation_test(TEST_FREEZER) || 339 if (hibernation_test(TEST_FREEZER)) {
353 hibernation_testmode(HIBERNATION_TESTPROC)) {
354 340
355 /* 341 /*
356 * Indicate to the caller that we are returning due to a 342 * Indicate to the caller that we are returning due to a
@@ -362,26 +348,26 @@ int hibernation_snapshot(int platform_mode)
362 348
363 error = dpm_prepare(PMSG_FREEZE); 349 error = dpm_prepare(PMSG_FREEZE);
364 if (error) { 350 if (error) {
365 dpm_complete(msg); 351 dpm_complete(PMSG_RECOVER);
366 goto Cleanup; 352 goto Cleanup;
367 } 353 }
368 354
369 suspend_console(); 355 suspend_console();
370 pm_restrict_gfp_mask(); 356 pm_restrict_gfp_mask();
357
371 error = dpm_suspend(PMSG_FREEZE); 358 error = dpm_suspend(PMSG_FREEZE);
372 if (error)
373 goto Recover_platform;
374 359
375 if (hibernation_test(TEST_DEVICES)) 360 if (error || hibernation_test(TEST_DEVICES))
376 goto Recover_platform; 361 platform_recover(platform_mode);
362 else
363 error = create_image(platform_mode);
377 364
378 error = create_image(platform_mode);
379 /* 365 /*
380 * Control returns here (1) after the image has been created or the 366 * In the case that we call create_image() above, the control
367 * returns here (1) after the image has been created or the
381 * image creation has failed and (2) after a successful restore. 368 * image creation has failed and (2) after a successful restore.
382 */ 369 */
383 370
384 Resume_devices:
385 /* We may need to release the preallocated image pages here. */ 371 /* We may need to release the preallocated image pages here. */
386 if (error || !in_suspend) 372 if (error || !in_suspend)
387 swsusp_free(); 373 swsusp_free();
@@ -399,10 +385,6 @@ int hibernation_snapshot(int platform_mode)
399 platform_end(platform_mode); 385 platform_end(platform_mode);
400 return error; 386 return error;
401 387
402 Recover_platform:
403 platform_recover(platform_mode);
404 goto Resume_devices;
405
406 Cleanup: 388 Cleanup:
407 swsusp_free(); 389 swsusp_free();
408 goto Close; 390 goto Close;
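
The net effect on hibernation_snapshot()'s error handling: the Recover_platform label and its backward goto disappear, and the device-test and suspend-failure cases fold into one forward-flowing branch. The resulting shape, as a sketch:

        error = dpm_suspend(PMSG_FREEZE);
        if (error || hibernation_test(TEST_DEVICES))
                platform_recover(platform_mode);    /* old Recover_platform */
        else
                error = create_image(platform_mode);
        /* both paths fall through to resuming devices */
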
@@ -590,9 +572,6 @@ int hibernation_platform_enter(void)
590static void power_down(void) 572static void power_down(void)
591{ 573{
592 switch (hibernation_mode) { 574 switch (hibernation_mode) {
593 case HIBERNATION_TEST:
594 case HIBERNATION_TESTPROC:
595 break;
596 case HIBERNATION_REBOOT: 575 case HIBERNATION_REBOOT:
597 kernel_restart(NULL); 576 kernel_restart(NULL);
598 break; 577 break;
@@ -611,17 +590,6 @@ static void power_down(void)
611 while(1); 590 while(1);
612} 591}
613 592
614static int prepare_processes(void)
615{
616 int error = 0;
617
618 if (freeze_processes()) {
619 error = -EBUSY;
620 thaw_processes();
621 }
622 return error;
623}
624
625/** 593/**
626 * hibernate - Carry out system hibernation, including saving the image. 594 * hibernate - Carry out system hibernation, including saving the image.
627 */ 595 */
@@ -629,7 +597,7 @@ int hibernate(void)
629{ 597{
630 int error; 598 int error;
631 599
632 mutex_lock(&pm_mutex); 600 lock_system_sleep();
633 /* The snapshot device should not be opened while we're running */ 601 /* The snapshot device should not be opened while we're running */
634 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 602 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
635 error = -EBUSY; 603 error = -EBUSY;
@@ -654,7 +622,7 @@ int hibernate(void)
654 sys_sync(); 622 sys_sync();
655 printk("done.\n"); 623 printk("done.\n");
656 624
657 error = prepare_processes(); 625 error = freeze_processes();
658 if (error) 626 if (error)
659 goto Finish; 627 goto Finish;
660 628
@@ -697,7 +665,7 @@ int hibernate(void)
697 pm_restore_console(); 665 pm_restore_console();
698 atomic_inc(&snapshot_device_available); 666 atomic_inc(&snapshot_device_available);
699 Unlock: 667 Unlock:
700 mutex_unlock(&pm_mutex); 668 unlock_system_sleep();
701 return error; 669 return error;
702} 670}
703 671
@@ -811,11 +779,13 @@ static int software_resume(void)
811 goto close_finish; 779 goto close_finish;
812 780
813 error = create_basic_memory_bitmaps(); 781 error = create_basic_memory_bitmaps();
814 if (error) 782 if (error) {
783 usermodehelper_enable();
815 goto close_finish; 784 goto close_finish;
785 }
816 786
817 pr_debug("PM: Preparing processes for restore.\n"); 787 pr_debug("PM: Preparing processes for restore.\n");
818 error = prepare_processes(); 788 error = freeze_processes();
819 if (error) { 789 if (error) {
820 swsusp_close(FMODE_READ); 790 swsusp_close(FMODE_READ);
821 goto Done; 791 goto Done;
@@ -855,8 +825,6 @@ static const char * const hibernation_modes[] = {
855 [HIBERNATION_PLATFORM] = "platform", 825 [HIBERNATION_PLATFORM] = "platform",
856 [HIBERNATION_SHUTDOWN] = "shutdown", 826 [HIBERNATION_SHUTDOWN] = "shutdown",
857 [HIBERNATION_REBOOT] = "reboot", 827 [HIBERNATION_REBOOT] = "reboot",
858 [HIBERNATION_TEST] = "test",
859 [HIBERNATION_TESTPROC] = "testproc",
860}; 828};
861 829
862/* 830/*
@@ -865,17 +833,15 @@ static const char * const hibernation_modes[] = {
865 * Hibernation can be handled in several ways. There are a few different ways 833 * Hibernation can be handled in several ways. There are a few different ways
866 * to put the system into the sleep state: using the platform driver (e.g. ACPI 834 * to put the system into the sleep state: using the platform driver (e.g. ACPI
867 * or other hibernation_ops), powering it off or rebooting it (for testing 835 * or other hibernation_ops), powering it off or rebooting it (for testing
868 * mostly), or using one of the two available test modes. 836 * mostly).
869 * 837 *
870 * The sysfs file /sys/power/disk provides an interface for selecting the 838 * The sysfs file /sys/power/disk provides an interface for selecting the
871 * hibernation mode to use. Reading from this file causes the available modes 839 * hibernation mode to use. Reading from this file causes the available modes
872 * to be printed. There are 5 modes that can be supported: 840 * to be printed. There are 3 modes that can be supported:
873 * 841 *
874 * 'platform' 842 * 'platform'
875 * 'shutdown' 843 * 'shutdown'
876 * 'reboot' 844 * 'reboot'
877 * 'test'
878 * 'testproc'
879 * 845 *
880 * If a platform hibernation driver is in use, 'platform' will be supported 846 * If a platform hibernation driver is in use, 'platform' will be supported
881 * and will be used by default. Otherwise, 'shutdown' will be used by default. 847 * and will be used by default. Otherwise, 'shutdown' will be used by default.
@@ -899,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
899 switch (i) { 865 switch (i) {
900 case HIBERNATION_SHUTDOWN: 866 case HIBERNATION_SHUTDOWN:
901 case HIBERNATION_REBOOT: 867 case HIBERNATION_REBOOT:
902 case HIBERNATION_TEST:
903 case HIBERNATION_TESTPROC:
904 break; 868 break;
905 case HIBERNATION_PLATFORM: 869 case HIBERNATION_PLATFORM:
906 if (hibernation_ops) 870 if (hibernation_ops)
@@ -929,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
929 p = memchr(buf, '\n', n); 893 p = memchr(buf, '\n', n);
930 len = p ? p - buf : n; 894 len = p ? p - buf : n;
931 895
932 mutex_lock(&pm_mutex); 896 lock_system_sleep();
933 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 897 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
934 if (len == strlen(hibernation_modes[i]) 898 if (len == strlen(hibernation_modes[i])
935 && !strncmp(buf, hibernation_modes[i], len)) { 899 && !strncmp(buf, hibernation_modes[i], len)) {
@@ -941,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
941 switch (mode) { 905 switch (mode) {
942 case HIBERNATION_SHUTDOWN: 906 case HIBERNATION_SHUTDOWN:
943 case HIBERNATION_REBOOT: 907 case HIBERNATION_REBOOT:
944 case HIBERNATION_TEST:
945 case HIBERNATION_TESTPROC:
946 hibernation_mode = mode; 908 hibernation_mode = mode;
947 break; 909 break;
948 case HIBERNATION_PLATFORM: 910 case HIBERNATION_PLATFORM:
@@ -957,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
957 if (!error) 919 if (!error)
958 pr_debug("PM: Hibernation mode set to '%s'\n", 920 pr_debug("PM: Hibernation mode set to '%s'\n",
959 hibernation_modes[mode]); 921 hibernation_modes[mode]);
960 mutex_unlock(&pm_mutex); 922 unlock_system_sleep();
961 return error ? error : n; 923 return error ? error : n;
962} 924}
963 925
@@ -984,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
984 if (maj != MAJOR(res) || min != MINOR(res)) 946 if (maj != MAJOR(res) || min != MINOR(res))
985 goto out; 947 goto out;
986 948
987 mutex_lock(&pm_mutex); 949 lock_system_sleep();
988 swsusp_resume_device = res; 950 swsusp_resume_device = res;
989 mutex_unlock(&pm_mutex); 951 unlock_system_sleep();
990 printk(KERN_INFO "PM: Starting manual resume from disk\n"); 952 printk(KERN_INFO "PM: Starting manual resume from disk\n");
991 noresume = 0; 953 noresume = 0;
992 software_resume(); 954 software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 36e0f0903c32..9824b41e5a18 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * 6 *
7 * This file is released under the GPLv2 7 * This file is released under the GPLv2
8 * 8 *
9 */ 9 */
@@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
116 p = memchr(buf, '\n', n); 116 p = memchr(buf, '\n', n);
117 len = p ? p - buf : n; 117 len = p ? p - buf : n;
118 118
119 mutex_lock(&pm_mutex); 119 lock_system_sleep();
120 120
121 level = TEST_FIRST; 121 level = TEST_FIRST;
122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) 122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
126 break; 126 break;
127 } 127 }
128 128
129 mutex_unlock(&pm_mutex); 129 unlock_system_sleep();
130 130
131 return error ? error : n; 131 return error ? error : n;
132} 132}
@@ -240,7 +240,7 @@ struct kobject *power_kobj;
240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
241 * 'disk' (Suspend-to-Disk). 241 * 'disk' (Suspend-to-Disk).
242 * 242 *
243 * store() accepts one of those strings, translates it into the 243 * store() accepts one of those strings, translates it into the
244 * proper enumerated value, and initiates a suspend transition. 244 * proper enumerated value, and initiates a suspend transition.
245 */ 245 */
246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
282 /* First, check if we are requested to hibernate */ 282 /* First, check if we are requested to hibernate */
283 if (len == 4 && !strncmp(buf, "disk", len)) { 283 if (len == 4 && !strncmp(buf, "disk", len)) {
284 error = hibernate(); 284 error = hibernate();
285 goto Exit; 285 goto Exit;
286 } 286 }
287 287
288#ifdef CONFIG_SUSPEND 288#ifdef CONFIG_SUSPEND
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 23a2db1ec442..0c4defe6d3b8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
51 51
52/* kernel/power/hibernate.c */ 52/* kernel/power/hibernate.c */
53extern bool freezer_test_done;
54
53extern int hibernation_snapshot(int platform_mode); 55extern int hibernation_snapshot(int platform_mode);
54extern int hibernation_restore(int platform_mode); 56extern int hibernation_restore(int platform_mode);
55extern int hibernation_platform_enter(void); 57extern int hibernation_platform_enter(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index addbbe5531bc..77274c9ba2f1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,16 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezable(struct task_struct * p) 25static int try_to_freeze_tasks(bool user_only)
26{
27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) ||
29 (p->exit_state != 0))
30 return 0;
31 return 1;
32}
33
34static int try_to_freeze_tasks(bool sig_only)
35{ 26{
36 struct task_struct *g, *p; 27 struct task_struct *g, *p;
37 unsigned long end_time; 28 unsigned long end_time;
@@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only)
46 37
47 end_time = jiffies + TIMEOUT; 38 end_time = jiffies + TIMEOUT;
48 39
49 if (!sig_only) 40 if (!user_only)
50 freeze_workqueues_begin(); 41 freeze_workqueues_begin();
51 42
52 while (true) { 43 while (true) {
53 todo = 0; 44 todo = 0;
54 read_lock(&tasklist_lock); 45 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 46 do_each_thread(g, p) {
56 if (frozen(p) || !freezable(p)) 47 if (p == current || !freeze_task(p))
57 continue;
58
59 if (!freeze_task(p, sig_only))
60 continue; 48 continue;
61 49
62 /* 50 /*
@@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only)
77 } while_each_thread(g, p); 65 } while_each_thread(g, p);
78 read_unlock(&tasklist_lock); 66 read_unlock(&tasklist_lock);
79 67
80 if (!sig_only) { 68 if (!user_only) {
81 wq_busy = freeze_workqueues_busy(); 69 wq_busy = freeze_workqueues_busy();
82 todo += wq_busy; 70 todo += wq_busy;
83 } 71 }
@@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only)
103 elapsed_csecs = elapsed_csecs64; 91 elapsed_csecs = elapsed_csecs64;
104 92
105 if (todo) { 93 if (todo) {
106 /* This does not unfreeze processes that are already frozen
107 * (we have slightly ugly calling convention in that respect,
108 * and caller must call thaw_processes() if something fails),
109 * but it cleans up leftover PF_FREEZE requests.
110 */
111 printk("\n"); 94 printk("\n");
112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 95 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
113 "(%d tasks refusing to freeze, wq_busy=%d):\n", 96 "(%d tasks refusing to freeze, wq_busy=%d):\n",
@@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only)
115 elapsed_csecs / 100, elapsed_csecs % 100, 98 elapsed_csecs / 100, elapsed_csecs % 100,
116 todo - wq_busy, wq_busy); 99 todo - wq_busy, wq_busy);
117 100
118 thaw_workqueues();
119
120 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
121 do_each_thread(g, p) { 102 do_each_thread(g, p) {
122 task_lock(p); 103 if (!wakeup && !freezer_should_skip(p) &&
123 if (!wakeup && freezing(p) && !freezer_should_skip(p)) 104 p != current && freezing(p) && !frozen(p))
124 sched_show_task(p); 105 sched_show_task(p);
125 cancel_freezing(p);
126 task_unlock(p);
127 } while_each_thread(g, p); 106 } while_each_thread(g, p);
128 read_unlock(&tasklist_lock); 107 read_unlock(&tasklist_lock);
129 } else { 108 } else {
@@ -136,12 +115,18 @@ static int try_to_freeze_tasks(bool sig_only)
136 115
137/** 116/**
138 * freeze_processes - Signal user space processes to enter the refrigerator. 117 * freeze_processes - Signal user space processes to enter the refrigerator.
118 *
119 * On success, returns 0. On failure, -errno and system is fully thawed.
139 */ 120 */
140int freeze_processes(void) 121int freeze_processes(void)
141{ 122{
142 int error; 123 int error;
143 124
125 if (!pm_freezing)
126 atomic_inc(&system_freezing_cnt);
127
144 printk("Freezing user space processes ... "); 128 printk("Freezing user space processes ... ");
129 pm_freezing = true;
145 error = try_to_freeze_tasks(true); 130 error = try_to_freeze_tasks(true);
146 if (!error) { 131 if (!error) {
147 printk("done."); 132 printk("done.");
@@ -150,17 +135,22 @@ int freeze_processes(void)
150 printk("\n"); 135 printk("\n");
151 BUG_ON(in_atomic()); 136 BUG_ON(in_atomic());
152 137
138 if (error)
139 thaw_processes();
153 return error; 140 return error;
154} 141}
155 142
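
The key contract change, stated in the new kernel-doc: on failure both freeze_processes() and freeze_kernel_threads() leave the system fully thawed. That is what allowed hibernate() and software_resume() above to drop their prepare_processes() wrapper; callers now reduce to:

        error = freeze_processes();
        if (error)
                goto Finish;    /* nothing to undo: everything was thawed */
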
156/** 143/**
157 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. 144 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
145 *
146 * On success, returns 0. On failure, -errno and system is fully thawed.
158 */ 147 */
159int freeze_kernel_threads(void) 148int freeze_kernel_threads(void)
160{ 149{
161 int error; 150 int error;
162 151
163 printk("Freezing remaining freezable tasks ... "); 152 printk("Freezing remaining freezable tasks ... ");
153 pm_nosig_freezing = true;
164 error = try_to_freeze_tasks(false); 154 error = try_to_freeze_tasks(false);
165 if (!error) 155 if (!error)
166 printk("done."); 156 printk("done.");
@@ -168,37 +158,32 @@ int freeze_kernel_threads(void)
168 printk("\n"); 158 printk("\n");
169 BUG_ON(in_atomic()); 159 BUG_ON(in_atomic());
170 160
161 if (error)
162 thaw_processes();
171 return error; 163 return error;
172} 164}
173 165
174static void thaw_tasks(bool nosig_only) 166void thaw_processes(void)
175{ 167{
176 struct task_struct *g, *p; 168 struct task_struct *g, *p;
177 169
178 read_lock(&tasklist_lock); 170 if (pm_freezing)
179 do_each_thread(g, p) { 171 atomic_dec(&system_freezing_cnt);
180 if (!freezable(p)) 172 pm_freezing = false;
181 continue; 173 pm_nosig_freezing = false;
182 174
183 if (nosig_only && should_send_signal(p)) 175 oom_killer_enable();
184 continue; 176
177 printk("Restarting tasks ... ");
185 178
186 if (cgroup_freezing_or_frozen(p)) 179 thaw_workqueues();
187 continue;
188 180
189 thaw_process(p); 181 read_lock(&tasklist_lock);
182 do_each_thread(g, p) {
183 __thaw_task(p);
190 } while_each_thread(g, p); 184 } while_each_thread(g, p);
191 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
192}
193 186
194void thaw_processes(void)
195{
196 oom_killer_enable();
197
198 printk("Restarting tasks ... ");
199 thaw_workqueues();
200 thaw_tasks(true);
201 thaw_tasks(false);
202 schedule(); 187 schedule();
203 printk("done.\n"); 188 printk("done.\n");
204} 189}
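
thaw_processes() can now walk every task unconditionally because the per-task decisions moved into the freezer core: __thaw_task() is a no-op for tasks that were never frozen, and the pm_freezing/pm_nosig_freezing globals plus system_freezing_cnt gate whether freezing(p) reports true at all. The fast path, roughly as it reads in include/linux/freezer.h after this series (sketch):

        static inline bool freezing(struct task_struct *p)
        {
                if (likely(!atomic_read(&system_freezing_cnt)))
                        return false;           /* nobody is freezing */
                return freezing_slow_path(p);   /* consults pm_freezing etc. */
        }
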
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cbe2c1441392..1cf88900ec4f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
858 PageReserved(page)) 858 PageReserved(page))
859 return NULL; 859 return NULL;
860 860
861 if (page_is_guard(page))
862 return NULL;
863
861 return page; 864 return page;
862} 865}
863 866
@@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
920 && (!kernel_page_present(page) || pfn_is_nosave(pfn))) 923 && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
921 return NULL; 924 return NULL;
922 925
926 if (page_is_guard(page))
927 return NULL;
928
923 return page; 929 return page;
924} 930}
925 931
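
Guard pages are the unmapped filler pages inserted by CONFIG_DEBUG_PAGEALLOC's debug-guardpage feature; reading one while copying the hibernation image would fault, so both the highmem and lowmem saveable-page checks now skip them. The predicate itself is roughly (sketch; the exact form depends on the mm debug configuration):

        static inline bool page_is_guard(struct page *page)
        {
                return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
        }
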
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4953dc054c53..4fd51beed879 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops;
42 */ 42 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 44{
45 mutex_lock(&pm_mutex); 45 lock_system_sleep();
46 suspend_ops = ops; 46 suspend_ops = ops;
47 mutex_unlock(&pm_mutex); 47 unlock_system_sleep();
48} 48}
49EXPORT_SYMBOL_GPL(suspend_set_ops); 49EXPORT_SYMBOL_GPL(suspend_set_ops);
50 50
@@ -106,13 +106,11 @@ static int suspend_prepare(void)
106 goto Finish; 106 goto Finish;
107 107
108 error = suspend_freeze_processes(); 108 error = suspend_freeze_processes();
109 if (error) { 109 if (!error)
110 suspend_stats.failed_freeze++;
111 dpm_save_failed_step(SUSPEND_FREEZE);
112 } else
113 return 0; 110 return 0;
114 111
115 suspend_thaw_processes(); 112 suspend_stats.failed_freeze++;
113 dpm_save_failed_step(SUSPEND_FREEZE);
116 usermodehelper_enable(); 114 usermodehelper_enable();
117 Finish: 115 Finish:
118 pm_notifier_call_chain(PM_POST_SUSPEND); 116 pm_notifier_call_chain(PM_POST_SUSPEND);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11a594c4ba25..8742fd013a94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -18,7 +18,6 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/buffer_head.h>
22#include <linux/bio.h> 21#include <linux/bio.h>
23#include <linux/blkdev.h> 22#include <linux/blkdev.h>
24#include <linux/swap.h> 23#include <linux/swap.h>
@@ -774,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
774 773
775 pr_debug("PM: Free swap pages: %u\n", free_swap); 774 pr_debug("PM: Free swap pages: %u\n", free_swap);
776 775
777 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? 776 required = PAGES_FOR_IO + nr_pages;
778 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
779 return free_swap > required; 777 return free_swap > required;
780} 778}
781 779
@@ -803,10 +801,12 @@ int swsusp_write(unsigned int flags)
803 printk(KERN_ERR "PM: Cannot get swap writer\n"); 801 printk(KERN_ERR "PM: Cannot get swap writer\n");
804 return error; 802 return error;
805 } 803 }
806 if (!enough_swap(pages, flags)) { 804 if (flags & SF_NOCOMPRESS_MODE) {
807 printk(KERN_ERR "PM: Not enough free swap\n"); 805 if (!enough_swap(pages, flags)) {
808 error = -ENOSPC; 806 printk(KERN_ERR "PM: Not enough free swap\n");
809 goto out_finish; 807 error = -ENOSPC;
808 goto out_finish;
809 }
810 } 810 }
811 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 811 memset(&snapshot, 0, sizeof(struct snapshot_handle));
812 error = snapshot_read_next(&snapshot); 812 error = snapshot_read_next(&snapshot);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6d8f535c2b88..6b1ab7a88522 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -21,6 +21,7 @@
21#include <linux/swapops.h> 21#include <linux/swapops.h>
22#include <linux/pm.h> 22#include <linux/pm.h>
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/compat.h>
24#include <linux/console.h> 25#include <linux/console.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -30,28 +31,6 @@
30 31
31#include "power.h" 32#include "power.h"
32 33
33/*
34 * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
35 * will be removed in the future. They are only preserved here for
36 * compatibility with existing userland utilities.
37 */
38#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
39#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
40
41#define PMOPS_PREPARE 1
42#define PMOPS_ENTER 2
43#define PMOPS_FINISH 3
44
45/*
46 * NOTE: The following ioctl definitions are wrong and have been replaced with
47 * correct ones. They are only preserved here for compatibility with existing
48 * userland utilities and will be removed in the future.
49 */
50#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
51#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
52#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
53#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
54
55 34
56#define SNAPSHOT_MINOR 231 35#define SNAPSHOT_MINOR 231
57 36
@@ -71,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
71 struct snapshot_data *data; 50 struct snapshot_data *data;
72 int error; 51 int error;
73 52
74 mutex_lock(&pm_mutex); 53 lock_system_sleep();
75 54
76 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 55 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
77 error = -EBUSY; 56 error = -EBUSY;
@@ -123,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
123 data->platform_support = 0; 102 data->platform_support = 0;
124 103
125 Unlock: 104 Unlock:
126 mutex_unlock(&pm_mutex); 105 unlock_system_sleep();
127 106
128 return error; 107 return error;
129} 108}
@@ -132,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
132{ 111{
133 struct snapshot_data *data; 112 struct snapshot_data *data;
134 113
135 mutex_lock(&pm_mutex); 114 lock_system_sleep();
136 115
137 swsusp_free(); 116 swsusp_free();
138 free_basic_memory_bitmaps(); 117 free_basic_memory_bitmaps();
@@ -146,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
146 PM_POST_HIBERNATION : PM_POST_RESTORE); 125 PM_POST_HIBERNATION : PM_POST_RESTORE);
147 atomic_inc(&snapshot_device_available); 126 atomic_inc(&snapshot_device_available);
148 127
149 mutex_unlock(&pm_mutex); 128 unlock_system_sleep();
150 129
151 return 0; 130 return 0;
152} 131}
@@ -158,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
158 ssize_t res; 137 ssize_t res;
159 loff_t pg_offp = *offp & ~PAGE_MASK; 138 loff_t pg_offp = *offp & ~PAGE_MASK;
160 139
161 mutex_lock(&pm_mutex); 140 lock_system_sleep();
162 141
163 data = filp->private_data; 142 data = filp->private_data;
164 if (!data->ready) { 143 if (!data->ready) {
@@ -179,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
179 *offp += res; 158 *offp += res;
180 159
181 Unlock: 160 Unlock:
182 mutex_unlock(&pm_mutex); 161 unlock_system_sleep();
183 162
184 return res; 163 return res;
185} 164}
@@ -191,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
191 ssize_t res; 170 ssize_t res;
192 loff_t pg_offp = *offp & ~PAGE_MASK; 171 loff_t pg_offp = *offp & ~PAGE_MASK;
193 172
194 mutex_lock(&pm_mutex); 173 lock_system_sleep();
195 174
196 data = filp->private_data; 175 data = filp->private_data;
197 176
@@ -208,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
208 if (res > 0) 187 if (res > 0)
209 *offp += res; 188 *offp += res;
210unlock: 189unlock:
211 mutex_unlock(&pm_mutex); 190 unlock_system_sleep();
212 191
213 return res; 192 return res;
214} 193}
215 194
216static void snapshot_deprecated_ioctl(unsigned int cmd)
217{
218 if (printk_ratelimit())
219 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
220 "be removed soon, update your suspend-to-disk "
221 "utilities\n",
222 __builtin_return_address(0), cmd);
223}
224
225static long snapshot_ioctl(struct file *filp, unsigned int cmd, 195static long snapshot_ioctl(struct file *filp, unsigned int cmd,
226 unsigned long arg) 196 unsigned long arg)
227{ 197{
@@ -257,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
257 break; 227 break;
258 228
259 error = freeze_processes(); 229 error = freeze_processes();
260 if (error) { 230 if (error)
261 thaw_processes();
262 usermodehelper_enable(); 231 usermodehelper_enable();
263 } 232 else
264 if (!error)
265 data->frozen = 1; 233 data->frozen = 1;
266 break; 234 break;
267 235
@@ -274,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
274 data->frozen = 0; 242 data->frozen = 0;
275 break; 243 break;
276 244
277 case SNAPSHOT_ATOMIC_SNAPSHOT:
278 snapshot_deprecated_ioctl(cmd);
279 case SNAPSHOT_CREATE_IMAGE: 245 case SNAPSHOT_CREATE_IMAGE:
280 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 246 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
281 error = -EPERM; 247 error = -EPERM;
@@ -283,10 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
283 } 249 }
284 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
285 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
286 if (!error) 252 if (!error) {
287 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
288 if (!error) 254 if (!error && !freezer_test_done)
289 data->ready = 1; 255 data->ready = 1;
256 if (freezer_test_done) {
257 freezer_test_done = false;
258 thaw_processes();
259 }
260 }
290 break; 261 break;
291 262
292 case SNAPSHOT_ATOMIC_RESTORE: 263 case SNAPSHOT_ATOMIC_RESTORE:
@@ -305,8 +276,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
305 data->ready = 0; 276 data->ready = 0;
306 break; 277 break;
307 278
308 case SNAPSHOT_SET_IMAGE_SIZE:
309 snapshot_deprecated_ioctl(cmd);
310 case SNAPSHOT_PREF_IMAGE_SIZE: 279 case SNAPSHOT_PREF_IMAGE_SIZE:
311 image_size = arg; 280 image_size = arg;
312 break; 281 break;
@@ -321,16 +290,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 error = put_user(size, (loff_t __user *)arg); 290 error = put_user(size, (loff_t __user *)arg);
322 break; 291 break;
323 292
324 case SNAPSHOT_AVAIL_SWAP:
325 snapshot_deprecated_ioctl(cmd);
326 case SNAPSHOT_AVAIL_SWAP_SIZE: 293 case SNAPSHOT_AVAIL_SWAP_SIZE:
327 size = count_swap_pages(data->swap, 1); 294 size = count_swap_pages(data->swap, 1);
328 size <<= PAGE_SHIFT; 295 size <<= PAGE_SHIFT;
329 error = put_user(size, (loff_t __user *)arg); 296 error = put_user(size, (loff_t __user *)arg);
330 break; 297 break;
331 298
332 case SNAPSHOT_GET_SWAP_PAGE:
333 snapshot_deprecated_ioctl(cmd);
334 case SNAPSHOT_ALLOC_SWAP_PAGE: 299 case SNAPSHOT_ALLOC_SWAP_PAGE:
335 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 300 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
336 error = -ENODEV; 301 error = -ENODEV;
@@ -353,27 +318,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
353 free_all_swap_pages(data->swap); 318 free_all_swap_pages(data->swap);
354 break; 319 break;
355 320
356 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
357 snapshot_deprecated_ioctl(cmd);
358 if (!swsusp_swap_in_use()) {
359 /*
360 * User space encodes device types as two-byte values,
361 * so we need to recode them
362 */
363 if (old_decode_dev(arg)) {
364 data->swap = swap_type_of(old_decode_dev(arg),
365 0, NULL);
366 if (data->swap < 0)
367 error = -ENODEV;
368 } else {
369 data->swap = -1;
370 error = -EINVAL;
371 }
372 } else {
373 error = -EPERM;
374 }
375 break;
376
377 case SNAPSHOT_S2RAM: 321 case SNAPSHOT_S2RAM:
378 if (!data->frozen) { 322 if (!data->frozen) {
379 error = -EPERM; 323 error = -EPERM;
@@ -396,33 +340,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
396 error = hibernation_platform_enter(); 340 error = hibernation_platform_enter();
397 break; 341 break;
398 342
399 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
400 snapshot_deprecated_ioctl(cmd);
401 error = -EINVAL;
402
403 switch (arg) {
404
405 case PMOPS_PREPARE:
406 data->platform_support = 1;
407 error = 0;
408 break;
409
410 case PMOPS_ENTER:
411 if (data->platform_support)
412 error = hibernation_platform_enter();
413 break;
414
415 case PMOPS_FINISH:
416 if (data->platform_support)
417 error = 0;
418 break;
419
420 default:
421 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
422
423 }
424 break;
425
426 case SNAPSHOT_SET_SWAP_AREA: 343 case SNAPSHOT_SET_SWAP_AREA:
427 if (swsusp_swap_in_use()) { 344 if (swsusp_swap_in_use()) {
428 error = -EPERM; 345 error = -EPERM;
@@ -464,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
464 return error; 381 return error;
465} 382}
466 383
384#ifdef CONFIG_COMPAT
385
386struct compat_resume_swap_area {
387 compat_loff_t offset;
388 u32 dev;
389} __packed;
390
391static long
392snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
393{
394 BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
395
396 switch (cmd) {
397 case SNAPSHOT_GET_IMAGE_SIZE:
398 case SNAPSHOT_AVAIL_SWAP_SIZE:
399 case SNAPSHOT_ALLOC_SWAP_PAGE: {
400 compat_loff_t __user *uoffset = compat_ptr(arg);
401 loff_t offset;
402 mm_segment_t old_fs;
403 int err;
404
405 old_fs = get_fs();
406 set_fs(KERNEL_DS);
407 err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
408 set_fs(old_fs);
409 if (!err && put_user(offset, uoffset))
410 err = -EFAULT;
411 return err;
412 }
413
414 case SNAPSHOT_CREATE_IMAGE:
415 return snapshot_ioctl(file, cmd,
416 (unsigned long) compat_ptr(arg));
417
418 case SNAPSHOT_SET_SWAP_AREA: {
419 struct compat_resume_swap_area __user *u_swap_area =
420 compat_ptr(arg);
421 struct resume_swap_area swap_area;
422 mm_segment_t old_fs;
423 int err;
424
425 err = get_user(swap_area.offset, &u_swap_area->offset);
426 err |= get_user(swap_area.dev, &u_swap_area->dev);
427 if (err)
428 return -EFAULT;
429 old_fs = get_fs();
430 set_fs(KERNEL_DS);
431 err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
432 (unsigned long) &swap_area);
433 set_fs(old_fs);
434 return err;
435 }
436
437 default:
438 return snapshot_ioctl(file, cmd, arg);
439 }
440}
441
442#endif /* CONFIG_COMPAT */
443
467static const struct file_operations snapshot_fops = { 444static const struct file_operations snapshot_fops = {
468 .open = snapshot_open, 445 .open = snapshot_open,
469 .release = snapshot_release, 446 .release = snapshot_release,
@@ -471,6 +448,9 @@ static const struct file_operations snapshot_fops = {
471 .write = snapshot_write, 448 .write = snapshot_write,
472 .llseek = no_llseek, 449 .llseek = no_llseek,
473 .unlocked_ioctl = snapshot_ioctl, 450 .unlocked_ioctl = snapshot_ioctl,
451#ifdef CONFIG_COMPAT
452 .compat_ioctl = snapshot_compat_ioctl,
453#endif
474}; 454};
475 455
476static struct miscdevice snapshot_device = { 456static struct miscdevice snapshot_device = {
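
The new compat handler lets 32-bit userspace drive the 64-bit /dev/snapshot device: loff_t-sized results are produced into a kernel-side buffer under set_fs(KERNEL_DS) and copied out as compat_loff_t. A minimal sketch of a 32-bit caller, assuming the exported linux/suspend_ioctls.h header and an image already created via SNAPSHOT_CREATE_IMAGE (error handling trimmed):

/*
 * Hypothetical 32-bit user of /dev/snapshot; SNAPSHOT_GET_IMAGE_SIZE
 * only succeeds once a snapshot image exists.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/suspend_ioctls.h>

int main(void)
{
	long long size;	/* wide enough for loff_t via the compat path */
	int fd = open("/dev/snapshot", O_RDONLY);

	if (fd < 0)
		return 1;
	if (ioctl(fd, SNAPSHOT_GET_IMAGE_SIZE, &size) == 0)
		printf("image size: %lld bytes\n", size);
	return 0;
}
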
diff --git a/kernel/printk.c b/kernel/printk.c
index 989e4a52da76..13c0a1143f49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
521 } 521 }
522} 522}
523 523
524static int __read_mostly ignore_loglevel; 524static bool __read_mostly ignore_loglevel;
525 525
526static int __init ignore_loglevel_setup(char *str) 526static int __init ignore_loglevel_setup(char *str)
527{ 527{
@@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str)
532} 532}
533 533
534early_param("ignore_loglevel", ignore_loglevel_setup); 534early_param("ignore_loglevel", ignore_loglevel_setup);
535module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); 535module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to " 536
537 "print all kernel messages to the console."); 537 "print all kernel messages to the console.");
538 538
@@ -696,9 +696,9 @@ static void zap_locks(void)
696} 696}
697 697
698#if defined(CONFIG_PRINTK_TIME) 698#if defined(CONFIG_PRINTK_TIME)
699static int printk_time = 1; 699static bool printk_time = 1;
700#else 700#else
701static int printk_time = 0; 701static bool printk_time = 0;
702#endif 702#endif
703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
704 704
@@ -1098,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1098 return -1; 1098 return -1;
1099} 1099}
1100 1100
1101int console_suspend_enabled = 1; 1101bool console_suspend_enabled = 1;
1102EXPORT_SYMBOL(console_suspend_enabled); 1102EXPORT_SYMBOL(console_suspend_enabled);
1103 1103
1104static int __init console_suspend_disable(char *str) 1104static int __init console_suspend_disable(char *str)
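
These conversions work because module_param() type-checks its variable; once the flag is a real bool, the plain module_param() form suffices and the parser accepts Y/N/1/0. The idiom, sketched with a hypothetical parameter name:

/* Sketch of the bool parameter idiom (hypothetical "verbose" flag). */
#include <linux/module.h>
#include <linux/moduleparam.h>

static bool verbose;
module_param(verbose, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(verbose, "enable verbose output");
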
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 78ab24a7b0e4..00ab2ca5ed11 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -172,6 +172,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
172 return ret; 172 return ret;
173} 173}
174 174
175static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
176{
177 if (mode & PTRACE_MODE_NOAUDIT)
178 return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
179 else
180 return has_ns_capability(current, ns, CAP_SYS_PTRACE);
181}
182
175int __ptrace_may_access(struct task_struct *task, unsigned int mode) 183int __ptrace_may_access(struct task_struct *task, unsigned int mode)
176{ 184{
177 const struct cred *cred = current_cred(), *tcred; 185 const struct cred *cred = current_cred(), *tcred;
@@ -198,7 +206,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
198 cred->gid == tcred->sgid && 206 cred->gid == tcred->sgid &&
199 cred->gid == tcred->gid)) 207 cred->gid == tcred->gid))
200 goto ok; 208 goto ok;
201 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) 209 if (ptrace_has_cap(tcred->user->user_ns, mode))
202 goto ok; 210 goto ok;
203 rcu_read_unlock(); 211 rcu_read_unlock();
204 return -EPERM; 212 return -EPERM;
@@ -207,7 +215,7 @@ ok:
207 smp_rmb(); 215 smp_rmb();
208 if (task->mm) 216 if (task->mm)
209 dumpable = get_dumpable(task->mm); 217 dumpable = get_dumpable(task->mm);
210 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) 218 if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
211 return -EPERM; 219 return -EPERM;
212 220
213 return security_ptrace_access_check(task, mode); 221 return security_ptrace_access_check(task, mode);
@@ -277,7 +285,7 @@ static int ptrace_attach(struct task_struct *task, long request,
277 task->ptrace = PT_PTRACED; 285 task->ptrace = PT_PTRACED;
278 if (seize) 286 if (seize)
279 task->ptrace |= PT_SEIZED; 287 task->ptrace |= PT_SEIZED;
280 if (task_ns_capable(task, CAP_SYS_PTRACE)) 288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
281 task->ptrace |= PT_PTRACE_CAP; 289 task->ptrace |= PT_PTRACE_CAP;
282 290
283 __ptrace_link(task, current); 291 __ptrace_link(task, current);
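
ptrace_has_cap() keeps the CAP_SYS_PTRACE check but routes it through has_ns_capability_noaudit() when the caller passes PTRACE_MODE_NOAUDIT, so speculative permission probes do not flood the audit log with denials. A hedged sketch of such a probe (hypothetical helper; real callers hold task_lock() the way ptrace_may_access() does):

/* Sketch only: a read-side permission probe that must stay quiet in
 * the audit log when it fails. */
static int may_peek_task(struct task_struct *task)
{
	return __ptrace_may_access(task,
				   PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
}
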
diff --git a/kernel/relay.c b/kernel/relay.c
index 226fade4d727..4335e1d7ee2d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
302 */ 302 */
303static struct dentry *create_buf_file_default_callback(const char *filename, 303static struct dentry *create_buf_file_default_callback(const char *filename,
304 struct dentry *parent, 304 struct dentry *parent,
305 int mode, 305 umode_t mode,
306 struct rchan_buf *buf, 306 struct rchan_buf *buf,
307 int *is_global) 307 int *is_global)
308{ 308{
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683efa2cce..6d269cce7aa1 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf,
159 return 0; 159 return 0;
160 } 160 }
161 161
162 /* FIXME - make memparse() take const char* args */ 162 *res = memparse(buf, &end);
163 *res = memparse((char *)buf, &end);
164 if (*end != '\0') 163 if (*end != '\0')
165 return -EINVAL; 164 return -EINVAL;
166 165
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 3d9f31cd79e7..98ec49475460 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> 6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 * 7 *
8 */ 8 */
9#include <linux/device.h>
9#include <linux/kthread.h> 10#include <linux/kthread.h>
10#include <linux/export.h> 11#include <linux/export.h>
11#include <linux/sched.h> 12#include <linux/sched.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
13#include <linux/sysdev.h>
14#include <linux/timer.h> 14#include <linux/timer.h>
15#include <linux/freezer.h> 15#include <linux/freezer.h>
16 16
@@ -27,7 +27,7 @@ struct test_thread_data {
27 int opdata; 27 int opdata;
28 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
29 int event; 29 int event;
30 struct sys_device sysdev; 30 struct device dev;
31}; 31};
32 32
33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; 33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
271 * 271 *
272 * opcode:data 272 * opcode:data
273 */ 273 */
274static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, 274static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
275 const char *buf, size_t count) 275 const char *buf, size_t count)
276{ 276{
277 struct sched_param schedpar; 277 struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
279 char cmdbuf[32]; 279 char cmdbuf[32];
280 int op, dat, tid, ret; 280 int op, dat, tid, ret;
281 281
282 td = container_of(dev, struct test_thread_data, sysdev); 282 td = container_of(dev, struct test_thread_data, dev);
283 tid = td->sysdev.id; 283 tid = td->dev.id;
284 284
285 /* strings from sysfs write are not 0 terminated! */ 285 /* strings from sysfs write are not 0 terminated! */
286 if (count >= sizeof(cmdbuf)) 286 if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
334 * @dev: thread to query 334 * @dev: thread to query
335 * @buf: char buffer to be filled with thread status info 335 * @buf: char buffer to be filled with thread status info
336 */ 336 */
337static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, 337static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
338 char *buf) 338 char *buf)
339{ 339{
340 struct test_thread_data *td; 340 struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
342 char *curr = buf; 342 char *curr = buf;
343 int i; 343 int i;
344 344
345 td = container_of(dev, struct test_thread_data, sysdev); 345 td = container_of(dev, struct test_thread_data, dev);
346 tsk = threads[td->sysdev.id]; 346 tsk = threads[td->dev.id];
347 347
348 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
349 349
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
360 spin_unlock(&rttest_lock); 360 spin_unlock(&rttest_lock);
361 361
362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk, 362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
363 mutexes[td->sysdev.id].owner); 363 mutexes[td->dev.id].owner);
364 364
365 return curr - buf; 365 return curr - buf;
366} 366}
367 367
368static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); 368static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
369static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); 369static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
370 370
371static struct sysdev_class rttest_sysclass = { 371static struct bus_type rttest_subsys = {
372 .name = "rttest", 372 .name = "rttest",
373 .dev_name = "rttest",
373}; 374};
374 375
375static int init_test_thread(int id) 376static int init_test_thread(int id)
376{ 377{
377 thread_data[id].sysdev.cls = &rttest_sysclass; 378 thread_data[id].dev.bus = &rttest_subsys;
378 thread_data[id].sysdev.id = id; 379 thread_data[id].dev.id = id;
379 380
380 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); 381 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
381 if (IS_ERR(threads[id])) 382 if (IS_ERR(threads[id]))
382 return PTR_ERR(threads[id]); 383 return PTR_ERR(threads[id]);
383 384
384 return sysdev_register(&thread_data[id].sysdev); 385 return device_register(&thread_data[id].dev);
385} 386}
386 387
387static int init_rttest(void) 388static int init_rttest(void)
@@ -393,7 +394,7 @@ static int init_rttest(void)
393 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) 394 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
394 rt_mutex_init(&mutexes[i]); 395 rt_mutex_init(&mutexes[i]);
395 396
396 ret = sysdev_class_register(&rttest_sysclass); 397 ret = subsys_system_register(&rttest_subsys, NULL);
397 if (ret) 398 if (ret)
398 return ret; 399 return ret;
399 400
@@ -401,10 +402,10 @@ static int init_rttest(void)
401 ret = init_test_thread(i); 402 ret = init_test_thread(i);
402 if (ret) 403 if (ret)
403 break; 404 break;
404 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); 405 ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
405 if (ret) 406 if (ret)
406 break; 407 break;
407 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); 408 ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
408 if (ret) 409 if (ret)
409 break; 410 break;
410 } 411 }
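
The rtmutex tester is one of several sysdev users converted here to the regular driver core: a struct bus_type registered with subsys_system_register(), plain struct device instances, and DEVICE_ATTR() in place of SYSDEV_ATTR(). The pattern, condensed into a hypothetical "demo" subsystem:

/* Condensed sketch of the sysdev -> struct device conversion pattern
 * (hypothetical "demo" subsystem and attribute). */
#include <linux/device.h>

static ssize_t status_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
	return sprintf(buf, "ok\n");
}
static DEVICE_ATTR(status, 0600, status_show, NULL);

static struct bus_type demo_subsys = {
	.name		= "demo",
	.dev_name	= "demo",
};

static struct device demo_dev = { .id = 0, .bus = &demo_subsys };

static int __init demo_init(void)
{
	int err = subsys_system_register(&demo_subsys, NULL);

	if (!err)
		err = device_register(&demo_dev);
	if (!err)
		err = device_create_file(&demo_dev, &dev_attr_status);
	return err;
}
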
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4dbfd04a2148..df00cb09263e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4330,7 +4330,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4330 goto out_free_cpus_allowed; 4330 goto out_free_cpus_allowed;
4331 } 4331 }
4332 retval = -EPERM; 4332 retval = -EPERM;
4333 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) 4333 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
4334 goto out_unlock; 4334 goto out_unlock;
4335 4335
4336 retval = security_task_setscheduler(p); 4336 retval = security_task_setscheduler(p);
@@ -5176,7 +5176,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
5176static void 5176static void
5177set_table_entry(struct ctl_table *entry, 5177set_table_entry(struct ctl_table *entry,
5178 const char *procname, void *data, int maxlen, 5178 const char *procname, void *data, int maxlen,
5179 mode_t mode, proc_handler *proc_handler) 5179 umode_t mode, proc_handler *proc_handler)
5180{ 5180{
5181 entry->procname = procname; 5181 entry->procname = procname;
5182 entry->data = data; 5182 entry->data = data;
@@ -6675,54 +6675,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6675} 6675}
6676 6676
6677#ifdef CONFIG_SCHED_MC 6677#ifdef CONFIG_SCHED_MC
6678static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 6678static ssize_t sched_mc_power_savings_show(struct device *dev,
6679 struct sysdev_class_attribute *attr, 6679 struct device_attribute *attr,
6680 char *page) 6680 char *buf)
6681{ 6681{
6682 return sprintf(page, "%u\n", sched_mc_power_savings); 6682 return sprintf(buf, "%u\n", sched_mc_power_savings);
6683} 6683}
6684static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 6684static ssize_t sched_mc_power_savings_store(struct device *dev,
6685 struct sysdev_class_attribute *attr, 6685 struct device_attribute *attr,
6686 const char *buf, size_t count) 6686 const char *buf, size_t count)
6687{ 6687{
6688 return sched_power_savings_store(buf, count, 0); 6688 return sched_power_savings_store(buf, count, 0);
6689} 6689}
6690static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 6690static DEVICE_ATTR(sched_mc_power_savings, 0644,
6691 sched_mc_power_savings_show, 6691 sched_mc_power_savings_show,
6692 sched_mc_power_savings_store); 6692 sched_mc_power_savings_store);
6693#endif 6693#endif
6694 6694
6695#ifdef CONFIG_SCHED_SMT 6695#ifdef CONFIG_SCHED_SMT
6696static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 6696static ssize_t sched_smt_power_savings_show(struct device *dev,
6697 struct sysdev_class_attribute *attr, 6697 struct device_attribute *attr,
6698 char *page) 6698 char *buf)
6699{ 6699{
6700 return sprintf(page, "%u\n", sched_smt_power_savings); 6700 return sprintf(buf, "%u\n", sched_smt_power_savings);
6701} 6701}
6702static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 6702static ssize_t sched_smt_power_savings_store(struct device *dev,
6703 struct sysdev_class_attribute *attr, 6703 struct device_attribute *attr,
6704 const char *buf, size_t count) 6704 const char *buf, size_t count)
6705{ 6705{
6706 return sched_power_savings_store(buf, count, 1); 6706 return sched_power_savings_store(buf, count, 1);
6707} 6707}
6708static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 6708static DEVICE_ATTR(sched_smt_power_savings, 0644,
6709 sched_smt_power_savings_show, 6709 sched_smt_power_savings_show,
6710 sched_smt_power_savings_store); 6710 sched_smt_power_savings_store);
6711#endif 6711#endif
6712 6712
6713int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6713int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6714{ 6714{
6715 int err = 0; 6715 int err = 0;
6716 6716
6717#ifdef CONFIG_SCHED_SMT 6717#ifdef CONFIG_SCHED_SMT
6718 if (smt_capable()) 6718 if (smt_capable())
6719 err = sysfs_create_file(&cls->kset.kobj, 6719 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6720 &attr_sched_smt_power_savings.attr);
6721#endif 6720#endif
6722#ifdef CONFIG_SCHED_MC 6721#ifdef CONFIG_SCHED_MC
6723 if (!err && mc_capable()) 6722 if (!err && mc_capable())
6724 err = sysfs_create_file(&cls->kset.kobj, 6723 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6725 &attr_sched_mc_power_savings.attr);
6726#endif 6724#endif
6727 return err; 6725 return err;
6728} 6726}
@@ -7136,10 +7134,6 @@ void set_curr_task(int cpu, struct task_struct *p)
7136 7134
7137#endif 7135#endif
7138 7136
7139#ifdef CONFIG_RT_GROUP_SCHED
7140#else /* !CONFIG_RT_GROUP_SCHED */
7141#endif /* CONFIG_RT_GROUP_SCHED */
7142
7143#ifdef CONFIG_CGROUP_SCHED 7137#ifdef CONFIG_CGROUP_SCHED
7144/* task_group_lock serializes the addition/removal of task groups */ 7138/* task_group_lock serializes the addition/removal of task groups */
7145static DEFINE_SPINLOCK(task_group_lock); 7139static DEFINE_SPINLOCK(task_group_lock);
@@ -7248,9 +7242,6 @@ void sched_move_task(struct task_struct *tsk)
7248} 7242}
7249#endif /* CONFIG_CGROUP_SCHED */ 7243#endif /* CONFIG_CGROUP_SCHED */
7250 7244
7251#ifdef CONFIG_FAIR_GROUP_SCHED
7252#endif
7253
7254#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7245#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7255static unsigned long to_ratio(u64 period, u64 runtime) 7246static unsigned long to_ratio(u64 period, u64 runtime)
7256{ 7247{
@@ -7565,24 +7556,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7565 sched_destroy_group(tg); 7556 sched_destroy_group(tg);
7566} 7557}
7567 7558
7568static int 7559static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7569cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7560 struct cgroup_taskset *tset)
7570{ 7561{
7562 struct task_struct *task;
7563
7564 cgroup_taskset_for_each(task, cgrp, tset) {
7571#ifdef CONFIG_RT_GROUP_SCHED 7565#ifdef CONFIG_RT_GROUP_SCHED
7572 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 7566 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7573 return -EINVAL; 7567 return -EINVAL;
7574#else 7568#else
7575 /* We don't support RT-tasks being in separate groups */ 7569 /* We don't support RT-tasks being in separate groups */
7576 if (tsk->sched_class != &fair_sched_class) 7570 if (task->sched_class != &fair_sched_class)
7577 return -EINVAL; 7571 return -EINVAL;
7578#endif 7572#endif
7573 }
7579 return 0; 7574 return 0;
7580} 7575}
7581 7576
7582static void 7577static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7583cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7578 struct cgroup_taskset *tset)
7584{ 7579{
7585 sched_move_task(tsk); 7580 struct task_struct *task;
7581
7582 cgroup_taskset_for_each(task, cgrp, tset)
7583 sched_move_task(task);
7586} 7584}
7587 7585
7588static void 7586static void
@@ -7917,8 +7915,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7917 .name = "cpu", 7915 .name = "cpu",
7918 .create = cpu_cgroup_create, 7916 .create = cpu_cgroup_create,
7919 .destroy = cpu_cgroup_destroy, 7917 .destroy = cpu_cgroup_destroy,
7920 .can_attach_task = cpu_cgroup_can_attach_task, 7918 .can_attach = cpu_cgroup_can_attach,
7921 .attach_task = cpu_cgroup_attach_task, 7919 .attach = cpu_cgroup_attach,
7922 .exit = cpu_cgroup_exit, 7920 .exit = cpu_cgroup_exit,
7923 .populate = cpu_cgroup_populate, 7921 .populate = cpu_cgroup_populate,
7924 .subsys_id = cpu_cgroup_subsys_id, 7922 .subsys_id = cpu_cgroup_subsys_id,
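
The cgroup core now hands attach callbacks a whole cgroup_taskset instead of invoking per-task hooks, so a controller can veto or migrate the entire set in one pass. A hedged sketch of the new callback shape (hypothetical policy):

/* Sketch of a whole-set ->can_attach (hypothetical kthread policy). */
static int demo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task;

	cgroup_taskset_for_each(task, cgrp, tset)
		if (task->flags & PF_KTHREAD)
			return -EINVAL;	/* reject the whole migration */
	return 0;
}
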
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e42de9105f8..84adb2d66cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3130} 3130}
3131 3131
3132#define LBF_ALL_PINNED 0x01 3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02 3133#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
3134#define LBF_ABORT 0x04 3134#define LBF_HAD_BREAK 0x04
3135#define LBF_HAD_BREAKS 0x0C /* count of HAD_BREAKs; overflow sets ABORT */
3136#define LBF_ABORT 0x10
3135 3137
3136/* 3138/*
3137 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3139 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
@@ -4508,7 +4510,9 @@ redo:
4508 goto out_balanced; 4510 goto out_balanced;
4509 4511
4510 if (lb_flags & LBF_NEED_BREAK) { 4512 if (lb_flags & LBF_NEED_BREAK) {
4511 lb_flags &= ~LBF_NEED_BREAK; 4513 lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
4514 if (lb_flags & LBF_ABORT)
4515 goto out_balanced;
4512 goto redo; 4516 goto redo;
4513 } 4517 }
4514 4518
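
The new flag layout turns the break bookkeeping into plain arithmetic: lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK clears NEED_BREAK (0x02) and bumps a two-bit counter held in bits 0x04 and 0x08; after the third break the counter sits at 0x0C, and the fourth increment carries into 0x10, which is exactly LBF_ABORT, so a load balance that keeps breaking eventually gives up. A standalone demo of the carry (not kernel code):

/* Standalone demo of the break-counter carry (not kernel code). */
#include <stdio.h>

#define LBF_NEED_BREAK	0x02
#define LBF_HAD_BREAK	0x04
#define LBF_HAD_BREAKS	0x0C
#define LBF_ABORT	0x10

int main(void)
{
	unsigned int lb_flags = 0;
	int i;

	for (i = 1; i <= 4; i++) {
		lb_flags |= LBF_NEED_BREAK;	/* migration loop set it */
		lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
		printf("break %d: flags=%#x abort=%d\n",
		       i, lb_flags, !!(lb_flags & LBF_ABORT));
	}
	return 0;	/* abort becomes 1 on the 4th break */
}
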
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13b631d..e8d76c5895ea 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,6 +6,7 @@
6 * This defines a simple but solid secure-computing mode. 6 * This defines a simple but solid secure-computing mode.
7 */ 7 */
8 8
9#include <linux/audit.h>
9#include <linux/seccomp.h> 10#include <linux/seccomp.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/compat.h> 12#include <linux/compat.h>
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall)
54#ifdef SECCOMP_DEBUG 55#ifdef SECCOMP_DEBUG
55 dump_stack(); 56 dump_stack();
56#endif 57#endif
58 audit_seccomp(this_syscall);
57 do_exit(SIGKILL); 59 do_exit(SIGKILL);
58} 60}
59 61
diff --git a/kernel/signal.c b/kernel/signal.c
index 56ce3a618b28..c73c4284160e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,6 +28,7 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <linux/user_namespace.h>
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h> 33#include <trace/events/signal.h>
33 34
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1019 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1020 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1020} 1021}
1021 1022
1023/*
1024 * map the uid in struct cred into user namespace *ns
1025 */
1026static inline uid_t map_cred_ns(const struct cred *cred,
1027 struct user_namespace *ns)
1028{
1029 return user_ns_map_uid(ns, cred, cred->uid);
1030}
1031
1032#ifdef CONFIG_USER_NS
1033static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1034{
1035 if (current_user_ns() == task_cred_xxx(t, user_ns))
1036 return;
1037
1038 if (SI_FROMKERNEL(info))
1039 return;
1040
1041 info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
1042 current_cred(), info->si_uid);
1043}
1044#else
1045static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1046{
1047 return;
1048}
1049#endif
1050
1022static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, 1051static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1023 int group, int from_ancestor_ns) 1052 int group, int from_ancestor_ns)
1024{ 1053{
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1088 q->info.si_pid = 0; 1117 q->info.si_pid = 0;
1089 break; 1118 break;
1090 } 1119 }
1120
1121 userns_fixup_signal_uid(&q->info, t);
1122
1091 } else if (!is_si_special(info)) { 1123 } else if (!is_si_special(info)) {
1092 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1124 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
1093 /* 1125 /*
@@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1626 */ 1658 */
1627 rcu_read_lock(); 1659 rcu_read_lock();
1628 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1660 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1629 info.si_uid = __task_cred(tsk)->uid; 1661 info.si_uid = map_cred_ns(__task_cred(tsk),
1662 task_cred_xxx(tsk->parent, user_ns));
1630 rcu_read_unlock(); 1663 rcu_read_unlock();
1631 1664
1632 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1665 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
@@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1709 */ 1742 */
1710 rcu_read_lock(); 1743 rcu_read_lock();
1711 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1744 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1712 info.si_uid = __task_cred(tsk)->uid; 1745 info.si_uid = map_cred_ns(__task_cred(tsk),
1746 task_cred_xxx(parent, user_ns));
1713 rcu_read_unlock(); 1747 rcu_read_unlock();
1714 1748
1715 info.si_utime = cputime_to_clock_t(tsk->utime); 1749 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
2125 info->si_signo = signr; 2159 info->si_signo = signr;
2126 info->si_errno = 0; 2160 info->si_errno = 0;
2127 info->si_code = SI_USER; 2161 info->si_code = SI_USER;
2162 rcu_read_lock();
2128 info->si_pid = task_pid_vnr(current->parent); 2163 info->si_pid = task_pid_vnr(current->parent);
2129 info->si_uid = task_uid(current->parent); 2164 info->si_uid = map_cred_ns(__task_cred(current->parent),
2165 current_user_ns());
2166 rcu_read_unlock();
2130 } 2167 }
2131 2168
2132 /* If the (new) signal is now blocked, requeue it. */ 2169 /* If the (new) signal is now blocked, requeue it. */
@@ -2318,6 +2355,27 @@ relock:
2318 return signr; 2355 return signr;
2319} 2356}
2320 2357
2358/**
2359 * block_sigmask - add @ka's signal mask to current->blocked
2360 * @ka: action for @signr
2361 * @signr: signal that has been successfully delivered
2362 *
2363 * This function should be called when a signal has successfully been
2364 * delivered. It adds the mask of signals for @ka to current->blocked
2365 * so that they are blocked during the execution of the signal
2366 * handler. In addition, @signr will be blocked unless %SA_NODEFER is
2367 * set in @ka->sa.sa_flags.
2368 */
2369void block_sigmask(struct k_sigaction *ka, int signr)
2370{
2371 sigset_t blocked;
2372
2373 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
2374 if (!(ka->sa.sa_flags & SA_NODEFER))
2375 sigaddset(&blocked, signr);
2376 set_current_blocked(&blocked);
2377}
2378
2321/* 2379/*
2322 * It could be that complete_signal() picked us to notify about the 2380 * It could be that complete_signal() picked us to notify about the
2323 * group-wide signal. Other threads should be notified now to take 2381 * group-wide signal. Other threads should be notified now to take
@@ -2355,8 +2413,15 @@ void exit_signals(struct task_struct *tsk)
2355 int group_stop = 0; 2413 int group_stop = 0;
2356 sigset_t unblocked; 2414 sigset_t unblocked;
2357 2415
2416 /*
2417 * @tsk is about to have PF_EXITING set - lock out users which
2418 * expect stable threadgroup.
2419 */
2420 threadgroup_change_begin(tsk);
2421
2358 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2422 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2359 tsk->flags |= PF_EXITING; 2423 tsk->flags |= PF_EXITING;
2424 threadgroup_change_end(tsk);
2360 return; 2425 return;
2361 } 2426 }
2362 2427
@@ -2366,6 +2431,9 @@ void exit_signals(struct task_struct *tsk)
2366 * see wants_signal(), do_signal_stop(). 2431 * see wants_signal(), do_signal_stop().
2367 */ 2432 */
2368 tsk->flags |= PF_EXITING; 2433 tsk->flags |= PF_EXITING;
2434
2435 threadgroup_change_end(tsk);
2436
2369 if (!signal_pending(tsk)) 2437 if (!signal_pending(tsk))
2370 goto out; 2438 goto out;
2371 2439
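
block_sigmask() centralizes bookkeeping that every architecture previously duplicated in its handle_signal(): after the signal frame is set up, sa_mask (plus the delivered signal unless SA_NODEFER is set) gets folded into current->blocked through set_current_blocked(). A hedged sketch of the intended arch-side call site (setup_rt_frame() stands in for per-arch frame code):

/* Sketch of the intended arch-side call site; setup_rt_frame() is a
 * hypothetical stand-in for the per-arch frame setup. */
static void handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
			  struct pt_regs *regs)
{
	if (setup_rt_frame(sig, ka, info, regs) < 0)
		return;		/* frame setup failed; arch forces SIGSEGV */

	block_sigmask(ka, sig);	/* block sa_mask (+ sig unless SA_NODEFER) */
}
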
diff --git a/kernel/sys.c b/kernel/sys.c
index ddf8155bf3f8..40701538fbd1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
1692 return mask; 1692 return mask;
1693} 1693}
1694 1694
1695#ifdef CONFIG_CHECKPOINT_RESTORE
1696static int prctl_set_mm(int opt, unsigned long addr,
1697 unsigned long arg4, unsigned long arg5)
1698{
1699 unsigned long rlim = rlimit(RLIMIT_DATA);
1700 unsigned long vm_req_flags;
1701 unsigned long vm_bad_flags;
1702 struct vm_area_struct *vma;
1703 int error = 0;
1704 struct mm_struct *mm = current->mm;
1705
1706 if (arg4 | arg5)
1707 return -EINVAL;
1708
1709 if (!capable(CAP_SYS_ADMIN))
1710 return -EPERM;
1711
1712 if (addr >= TASK_SIZE)
1713 return -EINVAL;
1714
1715 down_read(&mm->mmap_sem);
1716 vma = find_vma(mm, addr);
1717
1718 if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
1719 /* It must be existing VMA */
1720 if (!vma || vma->vm_start > addr)
1721 goto out;
1722 }
1723
1724 error = -EINVAL;
1725 switch (opt) {
1726 case PR_SET_MM_START_CODE:
1727 case PR_SET_MM_END_CODE:
1728 vm_req_flags = VM_READ | VM_EXEC;
1729 vm_bad_flags = VM_WRITE | VM_MAYSHARE;
1730
1731 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1732 (vma->vm_flags & vm_bad_flags))
1733 goto out;
1734
1735 if (opt == PR_SET_MM_START_CODE)
1736 mm->start_code = addr;
1737 else
1738 mm->end_code = addr;
1739 break;
1740
1741 case PR_SET_MM_START_DATA:
1742 case PR_SET_MM_END_DATA:
1743 vm_req_flags = VM_READ | VM_WRITE;
1744 vm_bad_flags = VM_EXEC | VM_MAYSHARE;
1745
1746 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1747 (vma->vm_flags & vm_bad_flags))
1748 goto out;
1749
1750 if (opt == PR_SET_MM_START_DATA)
1751 mm->start_data = addr;
1752 else
1753 mm->end_data = addr;
1754 break;
1755
1756 case PR_SET_MM_START_STACK:
1757
1758#ifdef CONFIG_STACK_GROWSUP
1759 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
1760#else
1761 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
1762#endif
1763 if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
1764 goto out;
1765
1766 mm->start_stack = addr;
1767 break;
1768
1769 case PR_SET_MM_START_BRK:
1770 if (addr <= mm->end_data)
1771 goto out;
1772
1773 if (rlim < RLIM_INFINITY &&
1774 (mm->brk - addr) +
1775 (mm->end_data - mm->start_data) > rlim)
1776 goto out;
1777
1778 mm->start_brk = addr;
1779 break;
1780
1781 case PR_SET_MM_BRK:
1782 if (addr <= mm->end_data)
1783 goto out;
1784
1785 if (rlim < RLIM_INFINITY &&
1786 (addr - mm->start_brk) +
1787 (mm->end_data - mm->start_data) > rlim)
1788 goto out;
1789
1790 mm->brk = addr;
1791 break;
1792
1793 default:
1794 error = -EINVAL;
1795 goto out;
1796 }
1797
1798 error = 0;
1799
1800out:
1801 up_read(&mm->mmap_sem);
1802
1803 return error;
1804}
1805#else /* CONFIG_CHECKPOINT_RESTORE */
1806static int prctl_set_mm(int opt, unsigned long addr,
1807 unsigned long arg4, unsigned long arg5)
1808{
1809 return -EINVAL;
1810}
1811#endif
1812
1695SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1813SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1696 unsigned long, arg4, unsigned long, arg5) 1814 unsigned long, arg4, unsigned long, arg5)
1697{ 1815{
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1841 else 1959 else
1842 error = PR_MCE_KILL_DEFAULT; 1960 error = PR_MCE_KILL_DEFAULT;
1843 break; 1961 break;
1962 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break;
1844 default: 1965 default:
1845 error = -EINVAL; 1966 error = -EINVAL;
1846 break; 1967 break;
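
PR_SET_MM is driven from userspace via prctl(2); it requires CAP_SYS_ADMIN, a kernel built with CONFIG_CHECKPOINT_RESTORE, and an address that passes the VMA sanity checks above. A hedged sketch of a checkpoint/restore-style caller (hypothetical address, minimal error handling):

/* Sketch of a PR_SET_MM caller; the PR_SET_MM* constants come from
 * the linux/prctl.h shipped with a kernel carrying this patch. */
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	unsigned long new_start_brk = 0x800000;	/* hypothetical address */

	if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, new_start_brk, 0, 0))
		perror("PR_SET_MM_START_BRK");
	return 0;
}
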
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae2719643854..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
803 .mode = 0644, 803 .mode = 0644,
804 .proc_handler = proc_dointvec, 804 .proc_handler = proc_dointvec,
805 }, 805 },
806#ifdef CONFIG_DEBUG_STACKOVERFLOW
807 {
808 .procname = "panic_on_stackoverflow",
809 .data = &sysctl_panic_on_stackoverflow,
810 .maxlen = sizeof(int),
811 .mode = 0644,
812 .proc_handler = proc_dointvec,
813 },
814#endif
806 { 815 {
807 .procname = "bootloader_type", 816 .procname = "bootloader_type",
808 .data = &bootloader_type, 817 .data = &bootloader_type,
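
The new entry surfaces as /proc/sys/kernel/panic_on_stackoverflow, backed by sysctl_panic_on_stackoverflow, which arch stack-usage checks consult when CONFIG_DEBUG_STACKOVERFLOW is set. Enabling it programmatically is an ordinary procfs write, for instance:

/* Enable panic-on-stack-overflow; assumes the sysctl file exists
 * (CONFIG_DEBUG_STACKOVERFLOW=y) and the caller may write it. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/panic_on_stackoverflow", "w");

	if (!f)
		return 1;
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}
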
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b26c2228fe92..2cf9cc7aa103 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,7 +25,7 @@ config HIGH_RES_TIMERS
25config GENERIC_CLOCKEVENTS_BUILD 25config GENERIC_CLOCKEVENTS_BUILD
26 bool 26 bool
27 default y 27 default y
28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR 28 depends on GENERIC_CLOCKEVENTS
29 29
30config GENERIC_CLOCKEVENTS_MIN_ADJUST 30config GENERIC_CLOCKEVENTS_MIN_ADJUST
31 bool 31 bool
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ecd6ba36d6c..9cd928f7a7c6 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h>
21 20
22#include "tick-internal.h" 21#include "tick-internal.h"
23 22
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d3ad022136e5..a45ca167ab24 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 */ 24 */
25 25
26#include <linux/device.h>
26#include <linux/clocksource.h> 27#include <linux/clocksource.h>
27#include <linux/sysdev.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -796,8 +796,8 @@ EXPORT_SYMBOL(clocksource_unregister);
796 * Provides sysfs interface for listing current clocksource. 796 * Provides sysfs interface for listing current clocksource.
797 */ 797 */
798static ssize_t 798static ssize_t
799sysfs_show_current_clocksources(struct sys_device *dev, 799sysfs_show_current_clocksources(struct device *dev,
800 struct sysdev_attribute *attr, char *buf) 800 struct device_attribute *attr, char *buf)
801{ 801{
802 ssize_t count = 0; 802 ssize_t count = 0;
803 803
@@ -818,8 +818,8 @@ sysfs_show_current_clocksources(struct sys_device *dev,
818 * Takes input from sysfs interface for manually overriding the default 818 * Takes input from sysfs interface for manually overriding the default
819 * clocksource selection. 819 * clocksource selection.
820 */ 820 */
821static ssize_t sysfs_override_clocksource(struct sys_device *dev, 821static ssize_t sysfs_override_clocksource(struct device *dev,
822 struct sysdev_attribute *attr, 822 struct device_attribute *attr,
823 const char *buf, size_t count) 823 const char *buf, size_t count)
824{ 824{
825 size_t ret = count; 825 size_t ret = count;
@@ -853,8 +853,8 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
853 * Provides sysfs interface for listing registered clocksources 853 * Provides sysfs interface for listing registered clocksources
854 */ 854 */
855static ssize_t 855static ssize_t
856sysfs_show_available_clocksources(struct sys_device *dev, 856sysfs_show_available_clocksources(struct device *dev,
857 struct sysdev_attribute *attr, 857 struct device_attribute *attr,
858 char *buf) 858 char *buf)
859{ 859{
860 struct clocksource *src; 860 struct clocksource *src;
@@ -883,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
883/* 883/*
884 * Sysfs setup bits: 884 * Sysfs setup bits:
885 */ 885 */
886static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
887 sysfs_override_clocksource); 887 sysfs_override_clocksource);
888 888
889static SYSDEV_ATTR(available_clocksource, 0444, 889static DEVICE_ATTR(available_clocksource, 0444,
890 sysfs_show_available_clocksources, NULL); 890 sysfs_show_available_clocksources, NULL);
891 891
892static struct sysdev_class clocksource_sysclass = { 892static struct bus_type clocksource_subsys = {
893 .name = "clocksource", 893 .name = "clocksource",
894 .dev_name = "clocksource",
894}; 895};
895 896
896static struct sys_device device_clocksource = { 897static struct device device_clocksource = {
897 .id = 0, 898 .id = 0,
898 .cls = &clocksource_sysclass, 899 .bus = &clocksource_subsys,
899}; 900};
900 901
901static int __init init_clocksource_sysfs(void) 902static int __init init_clocksource_sysfs(void)
902{ 903{
903 int error = sysdev_class_register(&clocksource_sysclass); 904 int error = subsys_system_register(&clocksource_subsys, NULL);
904 905
905 if (!error) 906 if (!error)
906 error = sysdev_register(&device_clocksource); 907 error = device_register(&device_clocksource);
907 if (!error) 908 if (!error)
908 error = sysdev_create_file( 909 error = device_create_file(
909 &device_clocksource, 910 &device_clocksource,
910 &attr_current_clocksource); 911 &dev_attr_current_clocksource);
911 if (!error) 912 if (!error)
912 error = sysdev_create_file( 913 error = device_create_file(
913 &device_clocksource, 914 &device_clocksource,
914 &attr_available_clocksource); 915 &dev_attr_available_clocksource);
915 return error; 916 return error;
916} 917}
917 918
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 16fc34a0806f..cdea7b56b0c9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
402 402
403static struct dentry *blk_create_buf_file_callback(const char *filename, 403static struct dentry *blk_create_buf_file_callback(const char *filename,
404 struct dentry *parent, 404 struct dentry *parent,
405 int mode, 405 umode_t mode,
406 struct rchan_buf *buf, 406 struct rchan_buf *buf,
407 int *is_global) 407 int *is_global)
408{ 408{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b1e8943fed1d..683d559a0eef 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,11 +22,13 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/bsearch.h>
25#include <linux/module.h> 26#include <linux/module.h>
26#include <linux/ftrace.h> 27#include <linux/ftrace.h>
27#include <linux/sysctl.h> 28#include <linux/sysctl.h>
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/sort.h>
30#include <linux/list.h> 32#include <linux/list.h>
31#include <linux/hash.h> 33#include <linux/hash.h>
32#include <linux/rcupdate.h> 34#include <linux/rcupdate.h>
@@ -947,13 +949,6 @@ struct ftrace_func_probe {
947 struct rcu_head rcu; 949 struct rcu_head rcu;
948}; 950};
949 951
950enum {
951 FTRACE_ENABLE_CALLS = (1 << 0),
952 FTRACE_DISABLE_CALLS = (1 << 1),
953 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
954 FTRACE_START_FUNC_RET = (1 << 3),
955 FTRACE_STOP_FUNC_RET = (1 << 4),
956};
957struct ftrace_func_entry { 952struct ftrace_func_entry {
958 struct hlist_node hlist; 953 struct hlist_node hlist;
959 unsigned long ip; 954 unsigned long ip;
@@ -984,18 +979,19 @@ static struct ftrace_ops global_ops = {
984 .filter_hash = EMPTY_HASH, 979 .filter_hash = EMPTY_HASH,
985}; 980};
986 981
987static struct dyn_ftrace *ftrace_new_addrs;
988
989static DEFINE_MUTEX(ftrace_regex_lock); 982static DEFINE_MUTEX(ftrace_regex_lock);
990 983
991struct ftrace_page { 984struct ftrace_page {
992 struct ftrace_page *next; 985 struct ftrace_page *next;
986 struct dyn_ftrace *records;
993 int index; 987 int index;
994 struct dyn_ftrace records[]; 988 int size;
995}; 989};
996 990
997#define ENTRIES_PER_PAGE \ 991static struct ftrace_page *ftrace_new_pgs;
998 ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) 992
993#define ENTRY_SIZE sizeof(struct dyn_ftrace)
994#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
999 995
1000/* estimate from running different kernels */ 996/* estimate from running different kernels */
1001#define NR_TO_INIT 10000 997#define NR_TO_INIT 10000
@@ -1003,7 +999,10 @@ struct ftrace_page {
1003static struct ftrace_page *ftrace_pages_start; 999static struct ftrace_page *ftrace_pages_start;
1004static struct ftrace_page *ftrace_pages; 1000static struct ftrace_page *ftrace_pages;
1005 1001
1006static struct dyn_ftrace *ftrace_free_records; 1002static bool ftrace_hash_empty(struct ftrace_hash *hash)
1003{
1004 return !hash || !hash->count;
1005}
1007 1006
1008static struct ftrace_func_entry * 1007static struct ftrace_func_entry *
1009ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) 1008ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1013,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1013 struct hlist_head *hhd; 1012 struct hlist_head *hhd;
1014 struct hlist_node *n; 1013 struct hlist_node *n;
1015 1014
1016 if (!hash->count) 1015 if (ftrace_hash_empty(hash))
1017 return NULL; 1016 return NULL;
1018 1017
1019 if (hash->size_bits > 0) 1018 if (hash->size_bits > 0)
@@ -1157,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1157 return NULL; 1156 return NULL;
1158 1157
1159 /* Empty hash? */ 1158 /* Empty hash? */
1160 if (!hash || !hash->count) 1159 if (ftrace_hash_empty(hash))
1161 return new_hash; 1160 return new_hash;
1162 1161
1163 size = 1 << hash->size_bits; 1162 size = 1 << hash->size_bits;
@@ -1282,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1282 filter_hash = rcu_dereference_raw(ops->filter_hash); 1281 filter_hash = rcu_dereference_raw(ops->filter_hash);
1283 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1282 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1284 1283
1285 if ((!filter_hash || !filter_hash->count || 1284 if ((ftrace_hash_empty(filter_hash) ||
1286 ftrace_lookup_ip(filter_hash, ip)) && 1285 ftrace_lookup_ip(filter_hash, ip)) &&
1287 (!notrace_hash || !notrace_hash->count || 1286 (ftrace_hash_empty(notrace_hash) ||
1288 !ftrace_lookup_ip(notrace_hash, ip))) 1287 !ftrace_lookup_ip(notrace_hash, ip)))
1289 ret = 1; 1288 ret = 1;
1290 else 1289 else
@@ -1307,6 +1306,47 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1307 } \ 1306 } \
1308 } 1307 }
1309 1308
1309
1310static int ftrace_cmp_recs(const void *a, const void *b)
1311{
1312 const struct dyn_ftrace *reca = a;
1313 const struct dyn_ftrace *recb = b;
1314
1315 if (reca->ip > recb->ip)
1316 return 1;
1317 if (reca->ip < recb->ip)
1318 return -1;
1319 return 0;
1320}
1321
1322/**
1323 * ftrace_location - return true if the ip given is a traced location
1324 * @ip: the instruction pointer to check
1325 *
1326 * Returns 1 if the @ip given is a pointer to an ftrace location.
1327 * That is, the instruction that is either a NOP or call to
1328 * the function tracer. It checks the ftrace internal tables to
1329 * determine if the address belongs or not.
1330 */
1331int ftrace_location(unsigned long ip)
1332{
1333 struct ftrace_page *pg;
1334 struct dyn_ftrace *rec;
1335 struct dyn_ftrace key;
1336
1337 key.ip = ip;
1338
1339 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1340 rec = bsearch(&key, pg->records, pg->index,
1341 sizeof(struct dyn_ftrace),
1342 ftrace_cmp_recs);
1343 if (rec)
1344 return 1;
1345 }
1346
1347 return 0;
1348}
1349
1310static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1350static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1311 int filter_hash, 1351 int filter_hash,
1312 bool inc) 1352 bool inc)
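
ftrace_location() assumes each page's records are sorted by ip (note the new linux/sort.h include above), which makes the per-page bsearch() cheap enough for other subsystems to ask whether an address is an mcount call site. A hedged sketch of a prospective caller:

/* Sketch: refuse to patch text that ftrace already manages. */
static int can_patch(unsigned long addr)
{
	if (ftrace_location(addr))
		return -EBUSY;	/* a NOP/call site owned by ftrace */
	return 0;
}
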
@@ -1336,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1336 if (filter_hash) { 1376 if (filter_hash) {
1337 hash = ops->filter_hash; 1377 hash = ops->filter_hash;
1338 other_hash = ops->notrace_hash; 1378 other_hash = ops->notrace_hash;
1339 if (!hash || !hash->count) 1379 if (ftrace_hash_empty(hash))
1340 all = 1; 1380 all = 1;
1341 } else { 1381 } else {
1342 inc = !inc; 1382 inc = !inc;
@@ -1346,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1346 * If the notrace hash has no items, 1386 * If the notrace hash has no items,
1347 * then there's nothing to do. 1387 * then there's nothing to do.
1348 */ 1388 */
1349 if (hash && !hash->count) 1389 if (ftrace_hash_empty(hash))
1350 return; 1390 return;
1351 } 1391 }
1352 1392
@@ -1363,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1363 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) 1403 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1364 match = 1; 1404 match = 1;
1365 } else { 1405 } else {
1366 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); 1406 in_hash = !!ftrace_lookup_ip(hash, rec->ip);
1367 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); 1407 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1368 1408
1369 /* 1409 /*
1370 * 1410 *
@@ -1372,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1372 if (filter_hash && in_hash && !in_other_hash) 1412 if (filter_hash && in_hash && !in_other_hash)
1373 match = 1; 1413 match = 1;
1374 else if (!filter_hash && in_hash && 1414 else if (!filter_hash && in_hash &&
1375 (in_other_hash || !other_hash->count)) 1415 (in_other_hash || ftrace_hash_empty(other_hash)))
1376 match = 1; 1416 match = 1;
1377 } 1417 }
1378 if (!match) 1418 if (!match)
@@ -1406,40 +1446,12 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1406 __ftrace_hash_rec_update(ops, filter_hash, 1); 1446 __ftrace_hash_rec_update(ops, filter_hash, 1);
1407} 1447}
1408 1448
1409static void ftrace_free_rec(struct dyn_ftrace *rec)
1410{
1411 rec->freelist = ftrace_free_records;
1412 ftrace_free_records = rec;
1413 rec->flags |= FTRACE_FL_FREE;
1414}
1415
1416static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 1449static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1417{ 1450{
1418 struct dyn_ftrace *rec; 1451 if (ftrace_pages->index == ftrace_pages->size) {
1419 1452 /* We should have allocated enough */
1420 /* First check for freed records */ 1453 if (WARN_ON(!ftrace_pages->next))
1421 if (ftrace_free_records) {
1422 rec = ftrace_free_records;
1423
1424 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
1425 FTRACE_WARN_ON_ONCE(1);
1426 ftrace_free_records = NULL;
1427 return NULL; 1454 return NULL;
1428 }
1429
1430 ftrace_free_records = rec->freelist;
1431 memset(rec, 0, sizeof(*rec));
1432 return rec;
1433 }
1434
1435 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
1436 if (!ftrace_pages->next) {
1437 /* allocate another page */
1438 ftrace_pages->next =
1439 (void *)get_zeroed_page(GFP_KERNEL);
1440 if (!ftrace_pages->next)
1441 return NULL;
1442 }
1443 ftrace_pages = ftrace_pages->next; 1455 ftrace_pages = ftrace_pages->next;
1444 } 1456 }
1445 1457
@@ -1459,8 +1471,6 @@ ftrace_record_ip(unsigned long ip)
1459 return NULL; 1471 return NULL;
1460 1472
1461 rec->ip = ip; 1473 rec->ip = ip;
1462 rec->newlist = ftrace_new_addrs;
1463 ftrace_new_addrs = rec;
1464 1474
1465 return rec; 1475 return rec;
1466} 1476}
@@ -1475,7 +1485,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1475 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1485 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1476} 1486}
1477 1487
1478static void ftrace_bug(int failed, unsigned long ip) 1488/**
1489 * ftrace_bug - report and shutdown function tracer
1490 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1491 * @ip: The address that failed
1492 *
1493 * The arch code that enables or disables the function tracing
1494 * can call ftrace_bug() when it has detected a problem in
1495 * modifying the code. @failed should be one of either:
1496 * EFAULT - if the problem happens on reading the @ip address
1497 * EINVAL - if what is read at @ip is not what was expected
1498 * EPERM - if the problem happens on writing to the @ip address
1499 */
1500void ftrace_bug(int failed, unsigned long ip)
1479{ 1501{
1480 switch (failed) { 1502 switch (failed) {
1481 case -EFAULT: 1503 case -EFAULT:
@@ -1517,24 +1539,19 @@ int ftrace_text_reserved(void *start, void *end)
1517 return 0; 1539 return 0;
1518} 1540}
1519 1541
1520 1542static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1521static int
1522__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1523{ 1543{
1524 unsigned long ftrace_addr;
1525 unsigned long flag = 0UL; 1544 unsigned long flag = 0UL;
1526 1545
1527 ftrace_addr = (unsigned long)FTRACE_ADDR;
1528
1529 /* 1546 /*
1530 * If we are enabling tracing: 1547 * If we are updating calls:
1531 * 1548 *
1532 * If the record has a ref count, then we need to enable it 1549 * If the record has a ref count, then we need to enable it
1533 * because someone is using it. 1550 * because someone is using it.
1534 * 1551 *
1535 * Otherwise we make sure its disabled. 1552 * Otherwise we make sure its disabled.
1536 * 1553 *
1537 * If we are disabling tracing, then disable all records that 1554 * If we are disabling calls, then disable all records that
1538 * are enabled. 1555 * are enabled.
1539 */ 1556 */
1540 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1557 if (enable && (rec->flags & ~FTRACE_FL_MASK))
@@ -1542,18 +1559,72 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1542 1559
1543 /* If the state of this record hasn't changed, then do nothing */ 1560 /* If the state of this record hasn't changed, then do nothing */
1544 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1561 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1545 return 0; 1562 return FTRACE_UPDATE_IGNORE;
1546 1563
1547 if (flag) { 1564 if (flag) {
1548 rec->flags |= FTRACE_FL_ENABLED; 1565 if (update)
1566 rec->flags |= FTRACE_FL_ENABLED;
1567 return FTRACE_UPDATE_MAKE_CALL;
1568 }
1569
1570 if (update)
1571 rec->flags &= ~FTRACE_FL_ENABLED;
1572
1573 return FTRACE_UPDATE_MAKE_NOP;
1574}
1575
1576/**
1577 * ftrace_update_record - set whether a record is tracing or not
1578 * @rec: the record to update
1579 * @enable: set to 1 if the record is tracing, zero to force disable
1580 *
1581 * The records that represent all functions that can be traced need
1582 * to be updated when tracing has been enabled.
1583 */
1584int ftrace_update_record(struct dyn_ftrace *rec, int enable)
1585{
1586 return ftrace_check_record(rec, enable, 1);
1587}
1588
1589/**
1590 * ftrace_test_record - check if the record has been enabled or not
1591 * @rec: the record to test
1592 * @enable: set to 1 to check if enabled, 0 if it is disabled
1593 *
1594 * The arch code may need to test if a record is already set to
1595 * tracing to determine how to modify the function code that it
1596 * represents.
1597 */
1598int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1599{
1600 return ftrace_check_record(rec, enable, 0);
1601}
1602
1603static int
1604__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1605{
1606 unsigned long ftrace_addr;
1607 int ret;
1608
1609 ftrace_addr = (unsigned long)FTRACE_ADDR;
1610
1611 ret = ftrace_update_record(rec, enable);
1612
1613 switch (ret) {
1614 case FTRACE_UPDATE_IGNORE:
1615 return 0;
1616
1617 case FTRACE_UPDATE_MAKE_CALL:
1549 return ftrace_make_call(rec, ftrace_addr); 1618 return ftrace_make_call(rec, ftrace_addr);
1619
1620 case FTRACE_UPDATE_MAKE_NOP:
1621 return ftrace_make_nop(NULL, rec, ftrace_addr);
1550 } 1622 }
1551 1623
1552 rec->flags &= ~FTRACE_FL_ENABLED; 1624 return -1; /* unknown ftrace bug */
1553 return ftrace_make_nop(NULL, rec, ftrace_addr);
1554} 1625}
1555 1626
1556static void ftrace_replace_code(int enable) 1627static void ftrace_replace_code(int update)
1557{ 1628{
1558 struct dyn_ftrace *rec; 1629 struct dyn_ftrace *rec;
1559 struct ftrace_page *pg; 1630 struct ftrace_page *pg;
@@ -1563,11 +1634,7 @@ static void ftrace_replace_code(int enable)
1563 return; 1634 return;
1564 1635
1565 do_for_each_ftrace_rec(pg, rec) { 1636 do_for_each_ftrace_rec(pg, rec) {
1566 /* Skip over free records */ 1637 failed = __ftrace_replace_code(rec, update);
1567 if (rec->flags & FTRACE_FL_FREE)
1568 continue;
1569
1570 failed = __ftrace_replace_code(rec, enable);
1571 if (failed) { 1638 if (failed) {
1572 ftrace_bug(failed, rec->ip); 1639 ftrace_bug(failed, rec->ip);
1573 /* Stop processing */ 1640 /* Stop processing */
@@ -1576,6 +1643,78 @@ static void ftrace_replace_code(int enable)
1576 } while_for_each_ftrace_rec(); 1643 } while_for_each_ftrace_rec();
1577} 1644}
1578 1645
1646struct ftrace_rec_iter {
1647 struct ftrace_page *pg;
1648 int index;
1649};
1650
1651/**
1652 * ftrace_rec_iter_start - start up iterating over traced functions
1653 *
1654 * Returns an iterator handle that is used to iterate over all
1655 * the records that represent address locations where functions
1656 * are traced.
1657 *
1658 * May return NULL if no records are available.
1659 */
1660struct ftrace_rec_iter *ftrace_rec_iter_start(void)
1661{
1662 /*
1663 * We only use a single iterator.
1664 * Protected by the ftrace_lock mutex.
1665 */
1666 static struct ftrace_rec_iter ftrace_rec_iter;
1667 struct ftrace_rec_iter *iter = &ftrace_rec_iter;
1668
1669 iter->pg = ftrace_pages_start;
1670 iter->index = 0;
1671
1672 /* Could have empty pages */
1673 while (iter->pg && !iter->pg->index)
1674 iter->pg = iter->pg->next;
1675
1676 if (!iter->pg)
1677 return NULL;
1678
1679 return iter;
1680}
1681
1682/**
1683 * ftrace_rec_iter_next - get the next record to process.
1684 * @iter: The handle to the iterator.
1685 *
1686 * Returns the next iterator after the given iterator @iter.
1687 */
1688struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
1689{
1690 iter->index++;
1691
1692 if (iter->index >= iter->pg->index) {
1693 iter->pg = iter->pg->next;
1694 iter->index = 0;
1695
1696 /* Could have empty pages */
1697 while (iter->pg && !iter->pg->index)
1698 iter->pg = iter->pg->next;
1699 }
1700
1701 if (!iter->pg)
1702 return NULL;
1703
1704 return iter;
1705}
1706
1707/**
1708 * ftrace_rec_iter_record - get the record at the iterator location
1709 * @iter: The current iterator location
1710 *
1711 * Returns the record that the current @iter is at.
1712 */
1713struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
1714{
1715 return &iter->pg->records[iter->index];
1716}
1717
1579static int 1718static int
1580ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 1719ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1581{ 1720{
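
Together the three helpers give arch code a flat walk over every record without exposing struct ftrace_page; since the iterator is a static singleton, callers must hold ftrace_lock. The intended loop shape, sketched:

/* Sketch of the iteration pattern (caller holds ftrace_lock). */
struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;

for (iter = ftrace_rec_iter_start(); iter;
     iter = ftrace_rec_iter_next(iter)) {
	rec = ftrace_rec_iter_record(iter);
	/* e.g. arch code queues a breakpoint at rec->ip here */
}
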
@@ -1617,13 +1756,7 @@ static int __ftrace_modify_code(void *data)
1617{ 1756{
1618 int *command = data; 1757 int *command = data;
1619 1758
1620 /* 1759 if (*command & FTRACE_UPDATE_CALLS)
1621 * Do not call function tracer while we update the code.
1622 * We are in stop machine, no worrying about races.
1623 */
1624 function_trace_stop++;
1625
1626 if (*command & FTRACE_ENABLE_CALLS)
1627 ftrace_replace_code(1); 1760 ftrace_replace_code(1);
1628 else if (*command & FTRACE_DISABLE_CALLS) 1761 else if (*command & FTRACE_DISABLE_CALLS)
1629 ftrace_replace_code(0); 1762 ftrace_replace_code(0);
@@ -1636,21 +1769,33 @@ static int __ftrace_modify_code(void *data)
1636 else if (*command & FTRACE_STOP_FUNC_RET) 1769 else if (*command & FTRACE_STOP_FUNC_RET)
1637 ftrace_disable_ftrace_graph_caller(); 1770 ftrace_disable_ftrace_graph_caller();
1638 1771
1639#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1640 /*
1641 * For archs that call ftrace_test_stop_func(), we must
1642 * wait till after we update all the function callers
1643 * before we update the callback. This keeps different
1644 * ops that record different functions from corrupting
1645 * each other.
1646 */
1647 __ftrace_trace_function = __ftrace_trace_function_delay;
1648#endif
1649 function_trace_stop--;
1650
1651 return 0; 1772 return 0;
1652} 1773}
1653 1774
1775/**
 1776 * ftrace_run_stop_machine - go back to the stop machine method
1777 * @command: The command to tell ftrace what to do
1778 *
 1779 * If an arch needs to fall back to the stop machine method, then
1780 * it can call this function.
1781 */
1782void ftrace_run_stop_machine(int command)
1783{
1784 stop_machine(__ftrace_modify_code, &command, NULL);
1785}
1786
1787/**
 1788 * arch_ftrace_update_code - modify the code to trace or not trace
 1789 * @command: The command that needs to be done
 1790 *
 1791 * Archs can override this function if they do not need to
 1792 * run stop_machine() to modify code.
1793 */
1794void __weak arch_ftrace_update_code(int command)
1795{
1796 ftrace_run_stop_machine(command);
1797}
1798
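
Since arch_ftrace_update_code() is weak, an architecture that can patch text
safely without stopping the machine can override it and keep
ftrace_run_stop_machine() as a fallback. A hypothetical override
(my_arch_patch_text_live() is an invented placeholder, not part of this patch):

	void arch_ftrace_update_code(int command)
	{
		/* try a live-patching scheme first, e.g. breakpoint-based */
		if (my_arch_patch_text_live(command) < 0)
			ftrace_run_stop_machine(command);
	}
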
1654static void ftrace_run_update_code(int command) 1799static void ftrace_run_update_code(int command)
1655{ 1800{
1656 int ret; 1801 int ret;
@@ -1659,8 +1804,31 @@ static void ftrace_run_update_code(int command)
1659 FTRACE_WARN_ON(ret); 1804 FTRACE_WARN_ON(ret);
1660 if (ret) 1805 if (ret)
1661 return; 1806 return;
1807 /*
1808 * Do not call function tracer while we update the code.
1809 * We are in stop machine.
1810 */
1811 function_trace_stop++;
1662 1812
1663 stop_machine(__ftrace_modify_code, &command, NULL); 1813 /*
1814 * By default we use stop_machine() to modify the code.
 1815 * But archs can do whatever they want as long as it
 1816 * is safe. stop_machine() is the safest, but also
 1817 * produces the most overhead.
1818 */
1819 arch_ftrace_update_code(command);
1820
1821#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1822 /*
1823 * For archs that call ftrace_test_stop_func(), we must
1824 * wait till after we update all the function callers
1825 * before we update the callback. This keeps different
1826 * ops that record different functions from corrupting
1827 * each other.
1828 */
1829 __ftrace_trace_function = __ftrace_trace_function_delay;
1830#endif
1831 function_trace_stop--;
1664 1832
1665 ret = ftrace_arch_code_modify_post_process(); 1833 ret = ftrace_arch_code_modify_post_process();
1666 FTRACE_WARN_ON(ret); 1834 FTRACE_WARN_ON(ret);
@@ -1691,7 +1859,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
1691 return -ENODEV; 1859 return -ENODEV;
1692 1860
1693 ftrace_start_up++; 1861 ftrace_start_up++;
1694 command |= FTRACE_ENABLE_CALLS; 1862 command |= FTRACE_UPDATE_CALLS;
1695 1863
1696 /* ops marked global share the filter hashes */ 1864 /* ops marked global share the filter hashes */
1697 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 1865 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
@@ -1743,8 +1911,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1743 if (ops != &global_ops || !global_start_up) 1911 if (ops != &global_ops || !global_start_up)
1744 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 1912 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1745 1913
1746 if (!ftrace_start_up) 1914 command |= FTRACE_UPDATE_CALLS;
1747 command |= FTRACE_DISABLE_CALLS;
1748 1915
1749 if (saved_ftrace_func != ftrace_trace_function) { 1916 if (saved_ftrace_func != ftrace_trace_function) {
1750 saved_ftrace_func = ftrace_trace_function; 1917 saved_ftrace_func = ftrace_trace_function;
@@ -1766,7 +1933,7 @@ static void ftrace_startup_sysctl(void)
1766 saved_ftrace_func = NULL; 1933 saved_ftrace_func = NULL;
1767 /* ftrace_start_up is true if we want ftrace running */ 1934 /* ftrace_start_up is true if we want ftrace running */
1768 if (ftrace_start_up) 1935 if (ftrace_start_up)
1769 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1936 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
1770} 1937}
1771 1938
1772static void ftrace_shutdown_sysctl(void) 1939static void ftrace_shutdown_sysctl(void)
@@ -1788,14 +1955,16 @@ static int ops_traces_mod(struct ftrace_ops *ops)
1788 struct ftrace_hash *hash; 1955 struct ftrace_hash *hash;
1789 1956
1790 hash = ops->filter_hash; 1957 hash = ops->filter_hash;
1791 return !!(!hash || !hash->count); 1958 return ftrace_hash_empty(hash);
1792} 1959}
1793 1960
1794static int ftrace_update_code(struct module *mod) 1961static int ftrace_update_code(struct module *mod)
1795{ 1962{
1963 struct ftrace_page *pg;
1796 struct dyn_ftrace *p; 1964 struct dyn_ftrace *p;
1797 cycle_t start, stop; 1965 cycle_t start, stop;
1798 unsigned long ref = 0; 1966 unsigned long ref = 0;
1967 int i;
1799 1968
1800 /* 1969 /*
1801 * When adding a module, we need to check if tracers are 1970 * When adding a module, we need to check if tracers are
@@ -1817,46 +1986,44 @@ static int ftrace_update_code(struct module *mod)
1817 start = ftrace_now(raw_smp_processor_id()); 1986 start = ftrace_now(raw_smp_processor_id());
1818 ftrace_update_cnt = 0; 1987 ftrace_update_cnt = 0;
1819 1988
1820 while (ftrace_new_addrs) { 1989 for (pg = ftrace_new_pgs; pg; pg = pg->next) {
1821 1990
1822 /* If something went wrong, bail without enabling anything */ 1991 for (i = 0; i < pg->index; i++) {
1823 if (unlikely(ftrace_disabled)) 1992 /* If something went wrong, bail without enabling anything */
1824 return -1; 1993 if (unlikely(ftrace_disabled))
1994 return -1;
1825 1995
1826 p = ftrace_new_addrs; 1996 p = &pg->records[i];
1827 ftrace_new_addrs = p->newlist; 1997 p->flags = ref;
1828 p->flags = ref;
1829 1998
1830 /* 1999 /*
1831 * Do the initial record conversion from mcount jump 2000 * Do the initial record conversion from mcount jump
1832 * to the NOP instructions. 2001 * to the NOP instructions.
1833 */ 2002 */
1834 if (!ftrace_code_disable(mod, p)) { 2003 if (!ftrace_code_disable(mod, p))
1835 ftrace_free_rec(p); 2004 break;
1836 /* Game over */
1837 break;
1838 }
1839 2005
1840 ftrace_update_cnt++; 2006 ftrace_update_cnt++;
1841 2007
1842 /* 2008 /*
1843 * If the tracing is enabled, go ahead and enable the record. 2009 * If the tracing is enabled, go ahead and enable the record.
1844 * 2010 *
 1845 * The reason not to enable the record immediately is the 2011 * The reason not to enable the record immediately is the
1846 * inherent check of ftrace_make_nop/ftrace_make_call for 2012 * inherent check of ftrace_make_nop/ftrace_make_call for
 1847 * correct previous instructions. Doing the NOP conversion 2013 * correct previous instructions. Doing the NOP conversion
 1848 * first puts the module into the correct state, thus 2014 * first puts the module into the correct state, thus
1849 * passing the ftrace_make_call check. 2015 * passing the ftrace_make_call check.
1850 */ 2016 */
1851 if (ftrace_start_up && ref) { 2017 if (ftrace_start_up && ref) {
1852 int failed = __ftrace_replace_code(p, 1); 2018 int failed = __ftrace_replace_code(p, 1);
1853 if (failed) { 2019 if (failed)
1854 ftrace_bug(failed, p->ip); 2020 ftrace_bug(failed, p->ip);
1855 ftrace_free_rec(p);
1856 } 2021 }
1857 } 2022 }
1858 } 2023 }
1859 2024
2025 ftrace_new_pgs = NULL;
2026
1860 stop = ftrace_now(raw_smp_processor_id()); 2027 stop = ftrace_now(raw_smp_processor_id());
1861 ftrace_update_time = stop - start; 2028 ftrace_update_time = stop - start;
1862 ftrace_update_tot_cnt += ftrace_update_cnt; 2029 ftrace_update_tot_cnt += ftrace_update_cnt;
@@ -1864,57 +2031,108 @@ static int ftrace_update_code(struct module *mod)
1864 return 0; 2031 return 0;
1865} 2032}
1866 2033
1867static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) 2034static int ftrace_allocate_records(struct ftrace_page *pg, int count)
1868{ 2035{
1869 struct ftrace_page *pg; 2036 int order;
1870 int cnt; 2037 int cnt;
1871 int i;
1872 2038
1873 /* allocate a few pages */ 2039 if (WARN_ON(!count))
1874 ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); 2040 return -EINVAL;
1875 if (!ftrace_pages_start) 2041
1876 return -1; 2042 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
1877 2043
1878 /* 2044 /*
1879 * Allocate a few more pages. 2045 * We want to fill as much as possible. No more than a page
1880 * 2046 * may be empty.
1881 * TODO: have some parser search vmlinux before
1882 * final linking to find all calls to ftrace.
1883 * Then we can:
1884 * a) know how many pages to allocate.
1885 * and/or
1886 * b) set up the table then.
1887 *
1888 * The dynamic code is still necessary for
1889 * modules.
1890 */ 2047 */
2048 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
2049 order--;
1891 2050
1892 pg = ftrace_pages = ftrace_pages_start; 2051 again:
2052 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
1893 2053
1894 cnt = num_to_init / ENTRIES_PER_PAGE; 2054 if (!pg->records) {
1895 pr_info("ftrace: allocating %ld entries in %d pages\n", 2055 /* if we can't allocate this size, try something smaller */
1896 num_to_init, cnt + 1); 2056 if (!order)
2057 return -ENOMEM;
2058 order >>= 1;
2059 goto again;
2060 }
1897 2061
1898 for (i = 0; i < cnt; i++) { 2062 cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
1899 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 2063 pg->size = cnt;
1900 2064
1901 /* If we fail, we'll try later anyway */ 2065 if (cnt > count)
1902 if (!pg->next) 2066 cnt = count;
2067
2068 return cnt;
2069}
2070
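
To make the order arithmetic above concrete, a worked example (the 16-byte
record size is purely illustrative; ENTRY_SIZE is sizeof(struct dyn_ftrace)):

	/*
	 * Assume PAGE_SIZE = 4096 and ENTRY_SIZE = 16, so ENTRIES_PER_PAGE = 256.
	 * For count = 5000:
	 *   DIV_ROUND_UP(5000, 256) = 20, get_count_order(20) = 5 (32 pages)
	 *   order 5: (4096 << 5) / 16 = 8192 >= 5000 + 256, shrink to 4
	 *   order 4: (4096 << 4) / 16 = 4096 <  5000 + 256, stop
	 * The block holds 4096 records, and the caller loops again for
	 * the remaining 904.
	 */
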
2071static struct ftrace_page *
2072ftrace_allocate_pages(unsigned long num_to_init)
2073{
2074 struct ftrace_page *start_pg;
2075 struct ftrace_page *pg;
2076 int order;
2077 int cnt;
2078
2079 if (!num_to_init)
 2080 return NULL;
2081
2082 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
2083 if (!pg)
2084 return NULL;
2085
2086 /*
 2087 * Try to allocate as much as possible in one contiguous
2088 * location that fills in all of the space. We want to
2089 * waste as little space as possible.
2090 */
2091 for (;;) {
2092 cnt = ftrace_allocate_records(pg, num_to_init);
2093 if (cnt < 0)
2094 goto free_pages;
2095
2096 num_to_init -= cnt;
2097 if (!num_to_init)
1903 break; 2098 break;
1904 2099
2100 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
2101 if (!pg->next)
2102 goto free_pages;
2103
1905 pg = pg->next; 2104 pg = pg->next;
1906 } 2105 }
1907 2106
1908 return 0; 2107 return start_pg;
2108
2109 free_pages:
2110 while (start_pg) {
2111 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2112 free_pages((unsigned long)pg->records, order);
2113 start_pg = pg->next;
2114 kfree(pg);
2115 pg = start_pg;
2116 }
2117 pr_info("ftrace: FAILED to allocate memory for functions\n");
2118 return NULL;
1909} 2119}
1910 2120
1911enum { 2121static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1912 FTRACE_ITER_FILTER = (1 << 0), 2122{
1913 FTRACE_ITER_NOTRACE = (1 << 1), 2123 int cnt;
1914 FTRACE_ITER_PRINTALL = (1 << 2), 2124
1915 FTRACE_ITER_HASH = (1 << 3), 2125 if (!num_to_init) {
1916 FTRACE_ITER_ENABLED = (1 << 4), 2126 pr_info("ftrace: No functions to be traced?\n");
1917}; 2127 return -1;
2128 }
2129
2130 cnt = num_to_init / ENTRIES_PER_PAGE;
2131 pr_info("ftrace: allocating %ld entries in %d pages\n",
2132 num_to_init, cnt + 1);
2133
2134 return 0;
2135}
1918 2136
1919#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 2137#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1920 2138
@@ -1980,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1980 void *p = NULL; 2198 void *p = NULL;
1981 loff_t l; 2199 loff_t l;
1982 2200
2201 if (!(iter->flags & FTRACE_ITER_DO_HASH))
2202 return NULL;
2203
1983 if (iter->func_pos > *pos) 2204 if (iter->func_pos > *pos)
1984 return NULL; 2205 return NULL;
1985 2206
@@ -2023,7 +2244,7 @@ static void *
2023t_next(struct seq_file *m, void *v, loff_t *pos) 2244t_next(struct seq_file *m, void *v, loff_t *pos)
2024{ 2245{
2025 struct ftrace_iterator *iter = m->private; 2246 struct ftrace_iterator *iter = m->private;
2026 struct ftrace_ops *ops = &global_ops; 2247 struct ftrace_ops *ops = iter->ops;
2027 struct dyn_ftrace *rec = NULL; 2248 struct dyn_ftrace *rec = NULL;
2028 2249
2029 if (unlikely(ftrace_disabled)) 2250 if (unlikely(ftrace_disabled))
@@ -2047,9 +2268,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
2047 } 2268 }
2048 } else { 2269 } else {
2049 rec = &iter->pg->records[iter->idx++]; 2270 rec = &iter->pg->records[iter->idx++];
2050 if ((rec->flags & FTRACE_FL_FREE) || 2271 if (((iter->flags & FTRACE_ITER_FILTER) &&
2051
2052 ((iter->flags & FTRACE_ITER_FILTER) &&
2053 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2272 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
2054 2273
2055 ((iter->flags & FTRACE_ITER_NOTRACE) && 2274 ((iter->flags & FTRACE_ITER_NOTRACE) &&
@@ -2081,7 +2300,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2081static void *t_start(struct seq_file *m, loff_t *pos) 2300static void *t_start(struct seq_file *m, loff_t *pos)
2082{ 2301{
2083 struct ftrace_iterator *iter = m->private; 2302 struct ftrace_iterator *iter = m->private;
2084 struct ftrace_ops *ops = &global_ops; 2303 struct ftrace_ops *ops = iter->ops;
2085 void *p = NULL; 2304 void *p = NULL;
2086 loff_t l; 2305 loff_t l;
2087 2306
@@ -2101,7 +2320,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2101 * off, we can short cut and just print out that all 2320 * off, we can short cut and just print out that all
2102 * functions are enabled. 2321 * functions are enabled.
2103 */ 2322 */
2104 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { 2323 if (iter->flags & FTRACE_ITER_FILTER &&
2324 ftrace_hash_empty(ops->filter_hash)) {
2105 if (*pos > 0) 2325 if (*pos > 0)
2106 return t_hash_start(m, pos); 2326 return t_hash_start(m, pos);
2107 iter->flags |= FTRACE_ITER_PRINTALL; 2327 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2126,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2126 break; 2346 break;
2127 } 2347 }
2128 2348
2129 if (!p) { 2349 if (!p)
2130 if (iter->flags & FTRACE_ITER_FILTER) 2350 return t_hash_start(m, pos);
2131 return t_hash_start(m, pos);
2132
2133 return NULL;
2134 }
2135 2351
2136 return iter; 2352 return iter;
2137} 2353}
@@ -2189,6 +2405,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
2189 return -ENOMEM; 2405 return -ENOMEM;
2190 2406
2191 iter->pg = ftrace_pages_start; 2407 iter->pg = ftrace_pages_start;
2408 iter->ops = &global_ops;
2192 2409
2193 ret = seq_open(file, &show_ftrace_seq_ops); 2410 ret = seq_open(file, &show_ftrace_seq_ops);
2194 if (!ret) { 2411 if (!ret) {
@@ -2217,6 +2434,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
2217 2434
2218 iter->pg = ftrace_pages_start; 2435 iter->pg = ftrace_pages_start;
2219 iter->flags = FTRACE_ITER_ENABLED; 2436 iter->flags = FTRACE_ITER_ENABLED;
2437 iter->ops = &global_ops;
2220 2438
2221 ret = seq_open(file, &show_ftrace_seq_ops); 2439 ret = seq_open(file, &show_ftrace_seq_ops);
2222 if (!ret) { 2440 if (!ret) {
@@ -2237,7 +2455,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2237 mutex_unlock(&ftrace_lock); 2455 mutex_unlock(&ftrace_lock);
2238} 2456}
2239 2457
2240static int 2458/**
2459 * ftrace_regex_open - initialize function tracer filter files
2460 * @ops: The ftrace_ops that hold the hash filters
2461 * @flag: The type of filter to process
2462 * @inode: The inode, usually passed in to your open routine
2463 * @file: The file, usually passed in to your open routine
2464 *
2465 * ftrace_regex_open() initializes the filter files for the
2466 * @ops. Depending on @flag it may process the filter hash or
2467 * the notrace hash of @ops. With this called from the open
2468 * routine, you can use ftrace_filter_write() for the write
2469 * routine if @flag has FTRACE_ITER_FILTER set, or
2470 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2471 * ftrace_regex_lseek() should be used as the lseek routine, and
2472 * release must call ftrace_regex_release().
2473 */
2474int
2241ftrace_regex_open(struct ftrace_ops *ops, int flag, 2475ftrace_regex_open(struct ftrace_ops *ops, int flag,
2242 struct inode *inode, struct file *file) 2476 struct inode *inode, struct file *file)
2243{ 2477{
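
With ftrace_regex_open(), ftrace_filter_write() and ftrace_regex_lseek() no
longer static, another tracer can wire up its own filter file. A minimal
sketch, mirroring the stack-tracer hookup later in this series (my_ops is a
placeholder ftrace_ops):

	static int my_filter_open(struct inode *inode, struct file *file)
	{
		return ftrace_regex_open(&my_ops, FTRACE_ITER_FILTER,
					 inode, file);
	}

	static const struct file_operations my_filter_fops = {
		.open    = my_filter_open,
		.read    = seq_read,
		.write   = ftrace_filter_write,
		.llseek  = ftrace_regex_lseek,
		.release = ftrace_regex_release,
	};
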
@@ -2306,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2306static int 2540static int
2307ftrace_filter_open(struct inode *inode, struct file *file) 2541ftrace_filter_open(struct inode *inode, struct file *file)
2308{ 2542{
2309 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, 2543 return ftrace_regex_open(&global_ops,
2310 inode, file); 2544 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
2545 inode, file);
2311} 2546}
2312 2547
2313static int 2548static int
@@ -2317,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2317 inode, file); 2552 inode, file);
2318} 2553}
2319 2554
2320static loff_t 2555loff_t
2321ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2556ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
2322{ 2557{
2323 loff_t ret; 2558 loff_t ret;
@@ -2426,7 +2661,6 @@ match_records(struct ftrace_hash *hash, char *buff,
2426 goto out_unlock; 2661 goto out_unlock;
2427 2662
2428 do_for_each_ftrace_rec(pg, rec) { 2663 do_for_each_ftrace_rec(pg, rec) {
2429
2430 if (ftrace_match_record(rec, mod, search, search_len, type)) { 2664 if (ftrace_match_record(rec, mod, search, search_len, type)) {
2431 ret = enter_record(hash, rec, not); 2665 ret = enter_record(hash, rec, not);
2432 if (ret < 0) { 2666 if (ret < 0) {
@@ -2871,14 +3105,14 @@ out_unlock:
2871 return ret; 3105 return ret;
2872} 3106}
2873 3107
2874static ssize_t 3108ssize_t
2875ftrace_filter_write(struct file *file, const char __user *ubuf, 3109ftrace_filter_write(struct file *file, const char __user *ubuf,
2876 size_t cnt, loff_t *ppos) 3110 size_t cnt, loff_t *ppos)
2877{ 3111{
2878 return ftrace_regex_write(file, ubuf, cnt, ppos, 1); 3112 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
2879} 3113}
2880 3114
2881static ssize_t 3115ssize_t
2882ftrace_notrace_write(struct file *file, const char __user *ubuf, 3116ftrace_notrace_write(struct file *file, const char __user *ubuf,
2883 size_t cnt, loff_t *ppos) 3117 size_t cnt, loff_t *ppos)
2884{ 3118{
@@ -2919,7 +3153,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2919 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3153 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2920 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED 3154 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2921 && ftrace_enabled) 3155 && ftrace_enabled)
2922 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 3156 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
2923 3157
2924 mutex_unlock(&ftrace_lock); 3158 mutex_unlock(&ftrace_lock);
2925 3159
@@ -3045,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf)
3045} 3279}
3046#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3280#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3047 3281
3048static void __init 3282void __init
3049set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) 3283ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3050{ 3284{
3051 char *func; 3285 char *func;
3052 3286
@@ -3059,17 +3293,16 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3059static void __init set_ftrace_early_filters(void) 3293static void __init set_ftrace_early_filters(void)
3060{ 3294{
3061 if (ftrace_filter_buf[0]) 3295 if (ftrace_filter_buf[0])
3062 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); 3296 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
3063 if (ftrace_notrace_buf[0]) 3297 if (ftrace_notrace_buf[0])
3064 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); 3298 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
3065#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3299#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3066 if (ftrace_graph_buf[0]) 3300 if (ftrace_graph_buf[0])
3067 set_ftrace_early_graph(ftrace_graph_buf); 3301 set_ftrace_early_graph(ftrace_graph_buf);
3068#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3302#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3069} 3303}
3070 3304
3071static int 3305int ftrace_regex_release(struct inode *inode, struct file *file)
3072ftrace_regex_release(struct inode *inode, struct file *file)
3073{ 3306{
3074 struct seq_file *m = (struct seq_file *)file->private_data; 3307 struct seq_file *m = (struct seq_file *)file->private_data;
3075 struct ftrace_iterator *iter; 3308 struct ftrace_iterator *iter;
@@ -3107,7 +3340,7 @@ ftrace_regex_release(struct inode *inode, struct file *file)
3107 orig_hash, iter->hash); 3340 orig_hash, iter->hash);
3108 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) 3341 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3109 && ftrace_enabled) 3342 && ftrace_enabled)
3110 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 3343 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3111 3344
3112 mutex_unlock(&ftrace_lock); 3345 mutex_unlock(&ftrace_lock);
3113 } 3346 }
@@ -3270,9 +3503,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3270 3503
3271 do_for_each_ftrace_rec(pg, rec) { 3504 do_for_each_ftrace_rec(pg, rec) {
3272 3505
3273 if (rec->flags & FTRACE_FL_FREE)
3274 continue;
3275
3276 if (ftrace_match_record(rec, NULL, search, search_len, type)) { 3506 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
3277 /* if it is in the array */ 3507 /* if it is in the array */
3278 exists = false; 3508 exists = false;
@@ -3381,15 +3611,62 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3381 return 0; 3611 return 0;
3382} 3612}
3383 3613
3614static void ftrace_swap_recs(void *a, void *b, int size)
3615{
3616 struct dyn_ftrace *reca = a;
3617 struct dyn_ftrace *recb = b;
3618 struct dyn_ftrace t;
3619
3620 t = *reca;
3621 *reca = *recb;
3622 *recb = t;
3623}
3624
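
ftrace_swap_recs() is the swap callback handed to lib/sort's sort() below; the
matching ftrace_cmp_recs() comparator is not part of this hunk, but presumably
orders records by ip, along these lines (a sketch, not the verbatim helper):

	static int ftrace_cmp_recs(const void *a, const void *b)
	{
		const struct dyn_ftrace *reca = a;
		const struct dyn_ftrace *recb = b;

		if (reca->ip < recb->ip)
			return -1;
		if (reca->ip > recb->ip)
			return 1;
		return 0;
	}
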
3384static int ftrace_process_locs(struct module *mod, 3625static int ftrace_process_locs(struct module *mod,
3385 unsigned long *start, 3626 unsigned long *start,
3386 unsigned long *end) 3627 unsigned long *end)
3387{ 3628{
3629 struct ftrace_page *pg;
3630 unsigned long count;
3388 unsigned long *p; 3631 unsigned long *p;
3389 unsigned long addr; 3632 unsigned long addr;
3390 unsigned long flags = 0; /* Shut up gcc */ 3633 unsigned long flags = 0; /* Shut up gcc */
3634 int ret = -ENOMEM;
3635
3636 count = end - start;
3637
3638 if (!count)
3639 return 0;
3640
3641 pg = ftrace_allocate_pages(count);
3642 if (!pg)
3643 return -ENOMEM;
3391 3644
3392 mutex_lock(&ftrace_lock); 3645 mutex_lock(&ftrace_lock);
3646
3647 /*
 3648 * The core kernel and each module need their own pages, as
3649 * modules will free them when they are removed.
3650 * Force a new page to be allocated for modules.
3651 */
3652 if (!mod) {
3653 WARN_ON(ftrace_pages || ftrace_pages_start);
3654 /* First initialization */
3655 ftrace_pages = ftrace_pages_start = pg;
3656 } else {
3657 if (!ftrace_pages)
3658 goto out;
3659
3660 if (WARN_ON(ftrace_pages->next)) {
3661 /* Hmm, we have free pages? */
3662 while (ftrace_pages->next)
3663 ftrace_pages = ftrace_pages->next;
3664 }
3665
3666 ftrace_pages->next = pg;
3667 ftrace_pages = pg;
3668 }
3669
3393 p = start; 3670 p = start;
3394 while (p < end) { 3671 while (p < end) {
3395 addr = ftrace_call_adjust(*p++); 3672 addr = ftrace_call_adjust(*p++);
@@ -3401,9 +3678,18 @@ static int ftrace_process_locs(struct module *mod,
3401 */ 3678 */
3402 if (!addr) 3679 if (!addr)
3403 continue; 3680 continue;
3404 ftrace_record_ip(addr); 3681 if (!ftrace_record_ip(addr))
3682 break;
3405 } 3683 }
3406 3684
3685 /* These new locations need to be initialized */
3686 ftrace_new_pgs = pg;
3687
3688 /* Make each individual set of pages sorted by ips */
3689 for (; pg; pg = pg->next)
3690 sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
3691 ftrace_cmp_recs, ftrace_swap_recs);
3692
3407 /* 3693 /*
3408 * We only need to disable interrupts on start up 3694 * We only need to disable interrupts on start up
3409 * because we are modifying code that an interrupt 3695 * because we are modifying code that an interrupt
@@ -3417,32 +3703,55 @@ static int ftrace_process_locs(struct module *mod,
3417 ftrace_update_code(mod); 3703 ftrace_update_code(mod);
3418 if (!mod) 3704 if (!mod)
3419 local_irq_restore(flags); 3705 local_irq_restore(flags);
3706 ret = 0;
3707 out:
3420 mutex_unlock(&ftrace_lock); 3708 mutex_unlock(&ftrace_lock);
3421 3709
3422 return 0; 3710 return ret;
3423} 3711}
3424 3712
3425#ifdef CONFIG_MODULES 3713#ifdef CONFIG_MODULES
3714
3715#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
3716
3426void ftrace_release_mod(struct module *mod) 3717void ftrace_release_mod(struct module *mod)
3427{ 3718{
3428 struct dyn_ftrace *rec; 3719 struct dyn_ftrace *rec;
3720 struct ftrace_page **last_pg;
3429 struct ftrace_page *pg; 3721 struct ftrace_page *pg;
3722 int order;
3430 3723
3431 mutex_lock(&ftrace_lock); 3724 mutex_lock(&ftrace_lock);
3432 3725
3433 if (ftrace_disabled) 3726 if (ftrace_disabled)
3434 goto out_unlock; 3727 goto out_unlock;
3435 3728
3436 do_for_each_ftrace_rec(pg, rec) { 3729 /*
 3730 * Each module has its own ftrace_pages; remove
 3731 * them from the list.
3732 */
3733 last_pg = &ftrace_pages_start;
3734 for (pg = ftrace_pages_start; pg; pg = *last_pg) {
3735 rec = &pg->records[0];
3437 if (within_module_core(rec->ip, mod)) { 3736 if (within_module_core(rec->ip, mod)) {
3438 /* 3737 /*
3439 * rec->ip is changed in ftrace_free_rec() 3738 * As core pages are first, the first
3440 * It should not between s and e if record was freed. 3739 * page should never be a module page.
3441 */ 3740 */
3442 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); 3741 if (WARN_ON(pg == ftrace_pages_start))
3443 ftrace_free_rec(rec); 3742 goto out_unlock;
3444 } 3743
3445 } while_for_each_ftrace_rec(); 3744 /* Check if we are deleting the last page */
3745 if (pg == ftrace_pages)
3746 ftrace_pages = next_to_ftrace_page(last_pg);
3747
3748 *last_pg = pg->next;
3749 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
3750 free_pages((unsigned long)pg->records, order);
3751 kfree(pg);
3752 } else
3753 last_pg = &pg->next;
3754 }
3446 out_unlock: 3755 out_unlock:
3447 mutex_unlock(&ftrace_lock); 3756 mutex_unlock(&ftrace_lock);
3448} 3757}
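
The last_pg double pointer above is the usual unlink-without-a-prev-pointer
idiom; in generic form (should_remove() is a stand-in for the
within_module_core() test):

	struct ftrace_page **last_pg = &ftrace_pages_start;
	struct ftrace_page *pg;

	for (pg = *last_pg; pg; pg = *last_pg) {
		if (should_remove(pg))
			*last_pg = pg->next;	/* unlink pg, keep last_pg */
		else
			last_pg = &pg->next;	/* advance past pg */
	}
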
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 91dc4bc8bf72..a3f1bc5d2a00 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4438,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = {
4438}; 4438};
4439 4439
4440struct dentry *trace_create_file(const char *name, 4440struct dentry *trace_create_file(const char *name,
4441 mode_t mode, 4441 umode_t mode,
4442 struct dentry *parent, 4442 struct dentry *parent,
4443 void *data, 4443 void *data,
4444 const struct file_operations *fops) 4444 const struct file_operations *fops)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2c2657462ac3..b93ecbadad6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
312void tracing_reset_current_online_cpus(void); 312void tracing_reset_current_online_cpus(void);
313int tracing_open_generic(struct inode *inode, struct file *filp); 313int tracing_open_generic(struct inode *inode, struct file *filp);
314struct dentry *trace_create_file(const char *name, 314struct dentry *trace_create_file(const char *name,
315 mode_t mode, 315 umode_t mode,
316 struct dentry *parent, 316 struct dentry *parent,
317 void *data, 317 void *data,
318 const struct file_operations *fops); 318 const struct file_operations *fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f04cc3136bd3..24aee7127451 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1738,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system,
1738 return -ENOMEM; 1738 return -ENOMEM;
1739} 1739}
1740 1740
1741static int create_filter_start(char *filter_str, bool set_str,
1742 struct filter_parse_state **psp,
1743 struct event_filter **filterp)
1744{
1745 struct event_filter *filter;
1746 struct filter_parse_state *ps = NULL;
1747 int err = 0;
1748
1749 WARN_ON_ONCE(*psp || *filterp);
1750
1751 /* allocate everything, and if any fails, free all and fail */
1752 filter = __alloc_filter();
1753 if (filter && set_str)
1754 err = replace_filter_string(filter, filter_str);
1755
1756 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1757
1758 if (!filter || !ps || err) {
1759 kfree(ps);
1760 __free_filter(filter);
1761 return -ENOMEM;
1762 }
1763
1764 /* we're committed to creating a new filter */
1765 *filterp = filter;
1766 *psp = ps;
1767
1768 parse_init(ps, filter_ops, filter_str);
1769 err = filter_parse(ps);
1770 if (err && set_str)
1771 append_filter_err(ps, filter);
1772 return err;
1773}
1774
1775static void create_filter_finish(struct filter_parse_state *ps)
1776{
1777 if (ps) {
1778 filter_opstack_clear(ps);
1779 postfix_clear(ps);
1780 kfree(ps);
1781 }
1782}
1783
1784/**
 1785 * create_filter - create a filter for an ftrace_event_call
1786 * @call: ftrace_event_call to create a filter for
1787 * @filter_str: filter string
1788 * @set_str: remember @filter_str and enable detailed error in filter
1789 * @filterp: out param for created filter (always updated on return)
1790 *
1791 * Creates a filter for @call with @filter_str. If @set_str is %true,
1792 * @filter_str is copied and recorded in the new filter.
1793 *
1794 * On success, returns 0 and *@filterp points to the new filter. On
1795 * failure, returns -errno and *@filterp may point to %NULL or to a new
1796 * filter. In the latter case, the returned filter contains error
1797 * information if @set_str is %true and the caller is responsible for
1798 * freeing it.
1799 */
1800static int create_filter(struct ftrace_event_call *call,
1801 char *filter_str, bool set_str,
1802 struct event_filter **filterp)
1803{
1804 struct event_filter *filter = NULL;
1805 struct filter_parse_state *ps = NULL;
1806 int err;
1807
1808 err = create_filter_start(filter_str, set_str, &ps, &filter);
1809 if (!err) {
1810 err = replace_preds(call, filter, ps, filter_str, false);
1811 if (err && set_str)
1812 append_filter_err(ps, filter);
1813 }
1814 create_filter_finish(ps);
1815
1816 *filterp = filter;
1817 return err;
1818}
1819
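
A usage sketch of the new helper (the filter string is illustrative). Note the
contract: *filterp is always updated, so even on error the caller ends up
owning whatever was built:

	struct event_filter *filter = NULL;
	int err;

	err = create_filter(call, "common_pid != 0", false, &filter);
	if (err)
		__free_filter(filter);	/* may be non-NULL even on failure */
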
1820/**
1821 * create_system_filter - create a filter for an event_subsystem
1822 * @system: event_subsystem to create a filter for
1823 * @filter_str: filter string
1824 * @filterp: out param for created filter (always updated on return)
1825 *
1826 * Identical to create_filter() except that it creates a subsystem filter
1827 * and always remembers @filter_str.
1828 */
1829static int create_system_filter(struct event_subsystem *system,
1830 char *filter_str, struct event_filter **filterp)
1831{
1832 struct event_filter *filter = NULL;
1833 struct filter_parse_state *ps = NULL;
1834 int err;
1835
1836 err = create_filter_start(filter_str, true, &ps, &filter);
1837 if (!err) {
1838 err = replace_system_preds(system, ps, filter_str);
1839 if (!err) {
1840 /* System filters just show a default message */
1841 kfree(filter->filter_string);
1842 filter->filter_string = NULL;
1843 } else {
1844 append_filter_err(ps, filter);
1845 }
1846 }
1847 create_filter_finish(ps);
1848
1849 *filterp = filter;
1850 return err;
1851}
1852
1741int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1853int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1742{ 1854{
1743 struct filter_parse_state *ps;
1744 struct event_filter *filter; 1855 struct event_filter *filter;
1745 struct event_filter *tmp;
1746 int err = 0; 1856 int err = 0;
1747 1857
1748 mutex_lock(&event_mutex); 1858 mutex_lock(&event_mutex);
@@ -1759,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1759 goto out_unlock; 1869 goto out_unlock;
1760 } 1870 }
1761 1871
1762 err = -ENOMEM; 1872 err = create_filter(call, filter_string, true, &filter);
1763 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1764 if (!ps)
1765 goto out_unlock;
1766
1767 filter = __alloc_filter();
1768 if (!filter) {
1769 kfree(ps);
1770 goto out_unlock;
1771 }
1772
1773 replace_filter_string(filter, filter_string);
1774
1775 parse_init(ps, filter_ops, filter_string);
1776 err = filter_parse(ps);
1777 if (err) {
1778 append_filter_err(ps, filter);
1779 goto out;
1780 }
1781 1873
1782 err = replace_preds(call, filter, ps, filter_string, false);
1783 if (err) {
1784 filter_disable(call);
1785 append_filter_err(ps, filter);
1786 } else
1787 call->flags |= TRACE_EVENT_FL_FILTERED;
1788out:
1789 /* 1874 /*
1790 * Always swap the call filter with the new filter 1875 * Always swap the call filter with the new filter
1791 * even if there was an error. If there was an error 1876 * even if there was an error. If there was an error
1792 * in the filter, we disable the filter and show the error 1877 * in the filter, we disable the filter and show the error
1793 * string 1878 * string
1794 */ 1879 */
1795 tmp = call->filter; 1880 if (filter) {
1796 rcu_assign_pointer(call->filter, filter); 1881 struct event_filter *tmp = call->filter;
1797 if (tmp) { 1882
1798 /* Make sure the call is done with the filter */ 1883 if (!err)
1799 synchronize_sched(); 1884 call->flags |= TRACE_EVENT_FL_FILTERED;
1800 __free_filter(tmp); 1885 else
1886 filter_disable(call);
1887
1888 rcu_assign_pointer(call->filter, filter);
1889
1890 if (tmp) {
1891 /* Make sure the call is done with the filter */
1892 synchronize_sched();
1893 __free_filter(tmp);
1894 }
1801 } 1895 }
1802 filter_opstack_clear(ps);
1803 postfix_clear(ps);
1804 kfree(ps);
1805out_unlock: 1896out_unlock:
1806 mutex_unlock(&event_mutex); 1897 mutex_unlock(&event_mutex);
1807 1898
@@ -1811,7 +1902,6 @@ out_unlock:
1811int apply_subsystem_event_filter(struct event_subsystem *system, 1902int apply_subsystem_event_filter(struct event_subsystem *system,
1812 char *filter_string) 1903 char *filter_string)
1813{ 1904{
1814 struct filter_parse_state *ps;
1815 struct event_filter *filter; 1905 struct event_filter *filter;
1816 int err = 0; 1906 int err = 0;
1817 1907
@@ -1835,48 +1925,19 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1835 goto out_unlock; 1925 goto out_unlock;
1836 } 1926 }
1837 1927
1838 err = -ENOMEM; 1928 err = create_system_filter(system, filter_string, &filter);
1839 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1929 if (filter) {
1840 if (!ps) 1930 /*
1841 goto out_unlock; 1931 * No event actually uses the system filter
1842 1932 * we can free it without synchronize_sched().
1843 filter = __alloc_filter(); 1933 */
1844 if (!filter) 1934 __free_filter(system->filter);
1845 goto out; 1935 system->filter = filter;
1846 1936 }
1847 /* System filters just show a default message */
1848 kfree(filter->filter_string);
1849 filter->filter_string = NULL;
1850
1851 /*
1852 * No event actually uses the system filter
1853 * we can free it without synchronize_sched().
1854 */
1855 __free_filter(system->filter);
1856 system->filter = filter;
1857
1858 parse_init(ps, filter_ops, filter_string);
1859 err = filter_parse(ps);
1860 if (err)
1861 goto err_filter;
1862
1863 err = replace_system_preds(system, ps, filter_string);
1864 if (err)
1865 goto err_filter;
1866
1867out:
1868 filter_opstack_clear(ps);
1869 postfix_clear(ps);
1870 kfree(ps);
1871out_unlock: 1937out_unlock:
1872 mutex_unlock(&event_mutex); 1938 mutex_unlock(&event_mutex);
1873 1939
1874 return err; 1940 return err;
1875
1876err_filter:
1877 replace_filter_string(filter, filter_string);
1878 append_filter_err(ps, system->filter);
1879 goto out;
1880} 1941}
1881 1942
1882#ifdef CONFIG_PERF_EVENTS 1943#ifdef CONFIG_PERF_EVENTS
@@ -1894,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1894{ 1955{
1895 int err; 1956 int err;
1896 struct event_filter *filter; 1957 struct event_filter *filter;
1897 struct filter_parse_state *ps;
1898 struct ftrace_event_call *call; 1958 struct ftrace_event_call *call;
1899 1959
1900 mutex_lock(&event_mutex); 1960 mutex_lock(&event_mutex);
@@ -1909,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1909 if (event->filter) 1969 if (event->filter)
1910 goto out_unlock; 1970 goto out_unlock;
1911 1971
1912 filter = __alloc_filter(); 1972 err = create_filter(call, filter_str, false, &filter);
1913 if (!filter) {
1914 err = PTR_ERR(filter);
1915 goto out_unlock;
1916 }
1917
1918 err = -ENOMEM;
1919 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1920 if (!ps)
1921 goto free_filter;
1922
1923 parse_init(ps, filter_ops, filter_str);
1924 err = filter_parse(ps);
1925 if (err)
1926 goto free_ps;
1927
1928 err = replace_preds(call, filter, ps, filter_str, false);
1929 if (!err) 1973 if (!err)
1930 event->filter = filter; 1974 event->filter = filter;
1931 1975 else
1932free_ps:
1933 filter_opstack_clear(ps);
1934 postfix_clear(ps);
1935 kfree(ps);
1936
1937free_filter:
1938 if (err)
1939 __free_filter(filter); 1976 __free_filter(filter);
1940 1977
1941out_unlock: 1978out_unlock:
@@ -1954,43 +1991,6 @@ out_unlock:
1954#define CREATE_TRACE_POINTS 1991#define CREATE_TRACE_POINTS
1955#include "trace_events_filter_test.h" 1992#include "trace_events_filter_test.h"
1956 1993
1957static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
1958 struct event_filter **pfilter)
1959{
1960 struct event_filter *filter;
1961 struct filter_parse_state *ps;
1962 int err = -ENOMEM;
1963
1964 filter = __alloc_filter();
1965 if (!filter)
1966 goto out;
1967
1968 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1969 if (!ps)
1970 goto free_filter;
1971
1972 parse_init(ps, filter_ops, filter_str);
1973 err = filter_parse(ps);
1974 if (err)
1975 goto free_ps;
1976
1977 err = replace_preds(call, filter, ps, filter_str, false);
1978 if (!err)
1979 *pfilter = filter;
1980
1981 free_ps:
1982 filter_opstack_clear(ps);
1983 postfix_clear(ps);
1984 kfree(ps);
1985
1986 free_filter:
1987 if (err)
1988 __free_filter(filter);
1989
1990 out:
1991 return err;
1992}
1993
1994#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ 1994#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
1995{ \ 1995{ \
1996 .filter = FILTER, \ 1996 .filter = FILTER, \
@@ -2109,12 +2109,13 @@ static __init int ftrace_test_event_filter(void)
2109 struct test_filter_data_t *d = &test_filter_data[i]; 2109 struct test_filter_data_t *d = &test_filter_data[i];
2110 int err; 2110 int err;
2111 2111
2112 err = test_get_filter(d->filter, &event_ftrace_test_filter, 2112 err = create_filter(&event_ftrace_test_filter, d->filter,
2113 &filter); 2113 false, &filter);
2114 if (err) { 2114 if (err) {
2115 printk(KERN_INFO 2115 printk(KERN_INFO
2116 "Failed to get filter for '%s', err %d\n", 2116 "Failed to get filter for '%s', err %d\n",
2117 d->filter, err); 2117 d->filter, err);
2118 __free_filter(filter);
2118 break; 2119 break;
2119 } 2120 }
2120 2121
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 77575b386d97..d4545f49242e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,9 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16
17#include <asm/setup.h>
18
16#include "trace.h" 19#include "trace.h"
17 20
18#define STACK_TRACE_ENTRIES 500 21#define STACK_TRACE_ENTRIES 500
@@ -133,7 +136,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 136static struct ftrace_ops trace_ops __read_mostly =
134{ 137{
135 .func = stack_trace_call, 138 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
137}; 139};
138 140
139static ssize_t 141static ssize_t
@@ -311,6 +313,21 @@ static const struct file_operations stack_trace_fops = {
311 .release = seq_release, 313 .release = seq_release,
312}; 314};
313 315
316static int
317stack_trace_filter_open(struct inode *inode, struct file *file)
318{
319 return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
320 inode, file);
321}
322
323static const struct file_operations stack_trace_filter_fops = {
324 .open = stack_trace_filter_open,
325 .read = seq_read,
326 .write = ftrace_filter_write,
327 .llseek = ftrace_regex_lseek,
328 .release = ftrace_regex_release,
329};
330
314int 331int
315stack_trace_sysctl(struct ctl_table *table, int write, 332stack_trace_sysctl(struct ctl_table *table, int write,
316 void __user *buffer, size_t *lenp, 333 void __user *buffer, size_t *lenp,
@@ -338,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write,
338 return ret; 355 return ret;
339} 356}
340 357
358static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
359
341static __init int enable_stacktrace(char *str) 360static __init int enable_stacktrace(char *str)
342{ 361{
362 if (strncmp(str, "_filter=", 8) == 0)
363 strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
364
343 stack_tracer_enabled = 1; 365 stack_tracer_enabled = 1;
344 last_stack_tracer_enabled = 1; 366 last_stack_tracer_enabled = 1;
345 return 1; 367 return 1;
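
With the "_filter=" prefix check, the same __setup hook now accepts a filter
from the kernel command line, e.g. (assuming the setup string is "stacktrace",
so "stacktrace_filter=" is matched by prefix):

	stacktrace_filter=kmem_cache_*
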
@@ -358,6 +380,12 @@ static __init int stack_trace_init(void)
358 trace_create_file("stack_trace", 0444, d_tracer, 380 trace_create_file("stack_trace", 0444, d_tracer,
359 NULL, &stack_trace_fops); 381 NULL, &stack_trace_fops);
360 382
383 trace_create_file("stack_trace_filter", 0444, d_tracer,
384 NULL, &stack_trace_filter_fops);
385
386 if (stack_trace_filter_buf[0])
387 ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
388
361 if (stack_tracer_enabled) 389 if (stack_tracer_enabled)
362 register_ftrace_function(&trace_ops); 390 register_ftrace_function(&trace_ops);
363 391
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42fa9ad0a810..bec7b5b53e03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -242,10 +242,10 @@ struct workqueue_struct {
242 242
243 int nr_drainers; /* W: drain in progress */ 243 int nr_drainers; /* W: drain in progress */
244 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
245 const char *name; /* I: workqueue name */
246#ifdef CONFIG_LOCKDEP 245#ifdef CONFIG_LOCKDEP
247 struct lockdep_map lockdep_map; 246 struct lockdep_map lockdep_map;
248#endif 247#endif
248 char name[]; /* I: workqueue name */
249}; 249};
250 250
251struct workqueue_struct *system_wq __read_mostly; 251struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
2954 return clamp_val(max_active, 1, lim); 2954 return clamp_val(max_active, 1, lim);
2955} 2955}
2956 2956
2957struct workqueue_struct *__alloc_workqueue_key(const char *name, 2957struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2958 unsigned int flags, 2958 unsigned int flags,
2959 int max_active, 2959 int max_active,
2960 struct lock_class_key *key, 2960 struct lock_class_key *key,
2961 const char *lock_name) 2961 const char *lock_name, ...)
2962{ 2962{
2963 va_list args, args1;
2963 struct workqueue_struct *wq; 2964 struct workqueue_struct *wq;
2964 unsigned int cpu; 2965 unsigned int cpu;
2966 size_t namelen;
2967
2968 /* determine namelen, allocate wq and format name */
2969 va_start(args, lock_name);
2970 va_copy(args1, args);
2971 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
2972
2973 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
2974 if (!wq)
2975 goto err;
2976
2977 vsnprintf(wq->name, namelen, fmt, args1);
2978 va_end(args);
2979 va_end(args1);
2965 2980
2966 /* 2981 /*
2967 * Workqueues which may be used during memory reclaim should 2982 * Workqueues which may be used during memory reclaim should
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2978 flags |= WQ_HIGHPRI; 2993 flags |= WQ_HIGHPRI;
2979 2994
2980 max_active = max_active ?: WQ_DFL_ACTIVE; 2995 max_active = max_active ?: WQ_DFL_ACTIVE;
2981 max_active = wq_clamp_max_active(max_active, flags, name); 2996 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2982
2983 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2984 if (!wq)
2985 goto err;
2986 2997
2998 /* init wq */
2987 wq->flags = flags; 2999 wq->flags = flags;
2988 wq->saved_max_active = max_active; 3000 wq->saved_max_active = max_active;
2989 mutex_init(&wq->flush_mutex); 3001 mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2991 INIT_LIST_HEAD(&wq->flusher_queue); 3003 INIT_LIST_HEAD(&wq->flusher_queue);
2992 INIT_LIST_HEAD(&wq->flusher_overflow); 3004 INIT_LIST_HEAD(&wq->flusher_overflow);
2993 3005
2994 wq->name = name;
2995 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3006 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
2996 INIT_LIST_HEAD(&wq->list); 3007 INIT_LIST_HEAD(&wq->list);
2997 3008
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
3020 if (!rescuer) 3031 if (!rescuer)
3021 goto err; 3032 goto err;
3022 3033
3023 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); 3034 rescuer->task = kthread_create(rescuer_thread, wq, "%s",
3035 wq->name);
3024 if (IS_ERR(rescuer->task)) 3036 if (IS_ERR(rescuer->task))
3025 goto err; 3037 goto err;
3026 3038
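
With the name formatted into the trailing flexible array, callers of the usual
alloc_workqueue() wrapper can now pass printf-style arguments straight through
to __alloc_workqueue_key(). An illustrative call (the name and id are made up):

	struct workqueue_struct *wq;

	wq = alloc_workqueue("myhost-%d", WQ_MEM_RECLAIM, 1, host_id);
	if (!wq)
		return -ENOMEM;
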