Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c        | 35
-rw-r--r--  kernel/cpuset.c        |  8
-rw-r--r--  kernel/extable.c       |  4
-rw-r--r--  kernel/padata.c        |  9
-rw-r--r--  kernel/trace/ftrace.c  | 64
-rw-r--r--  kernel/workqueue.c     | 50

6 files changed, 115 insertions, 55 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4c62513fe19f..8b729c278b64 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);
 static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions.  Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
  * populated with the built in subsystems, and modular subsystems are
  * registered after that. The mutable section of this array is protected by
@@ -191,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static int cgroup_file_release(struct inode *inode, struct file *file);
 
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -871,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
 	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
-	schedule_work(&cgrp->destroy_work);
+	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -2421,7 +2430,7 @@ static const struct file_operations cgroup_seqfile_operations = {
 	.read = seq_read,
 	.write = cgroup_file_write,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = cgroup_file_release,
 };
 
 static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2482,6 +2491,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
 		ret = cft->release(inode, file);
 	if (css->ss)
 		css_put(css);
+	if (file->f_op == &cgroup_seqfile_operations)
+		single_release(inode, file);
 	return ret;
 }
 
@@ -4249,7 +4260,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
 	 * css_put().  dput() requires process context which we don't have.
 	 */
 	INIT_WORK(&css->destroy_work, css_free_work_fn);
-	schedule_work(&css->destroy_work);
+	queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 
 static void css_release(struct percpu_ref *ref)
@@ -4539,7 +4550,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
 	INIT_WORK(&css->destroy_work, css_killed_work_fn);
-	schedule_work(&css->destroy_work);
+	queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 
 /**
@@ -5063,6 +5074,22 @@ out:
 	return err;
 }
 
+static int __init cgroup_wq_init(void)
+{
+	/*
+	 * There isn't much point in executing destruction path in
+	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
+	 * Use 1 for @max_active.
+	 *
+	 * We would prefer to do this in cgroup_init() above, but that
+	 * is called before init_workqueues(): so leave this until after.
+	 */
+	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+	BUG_ON(!cgroup_destroy_wq);
+	return 0;
+}
+core_initcall(cgroup_wq_init);
+
 /*
  * proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
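Editor's aside: the cgroup hunks above replace schedule_work() with queue_work() on a dedicated workqueue created with max_active = 1, so a burst of destruction work items can no longer exhaust system_wq. A minimal sketch of that pattern, assuming the example_* names (they are hypothetical; alloc_workqueue(), INIT_WORK(), queue_work() and core_initcall() are the real kernel APIs):

#include <linux/init.h>
#include <linux/workqueue.h>

/* Illustrative only -- not part of the patch. */
static struct workqueue_struct *example_destroy_wq;
static struct work_struct example_work;

static void example_work_fn(struct work_struct *work)
{
	/* heavy, possibly blocking teardown work would go here */
}

static int __init example_wq_init(void)
{
	/*
	 * max_active = 1: the work is mostly serialized anyway, and a
	 * private queue keeps a burst of items out of system_wq.
	 */
	example_destroy_wq = alloc_workqueue("example_destroy", 0, 1);
	if (!example_destroy_wq)
		return -ENOMEM;

	INIT_WORK(&example_work, example_work_fn);
	queue_work(example_destroy_wq, &example_work);
	return 0;
}
core_initcall(example_wq_init);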
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6bf981e13c43..4772034b4b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 	need_loop = task_has_mempolicy(tsk) ||
 			!nodes_intersects(*newmems, tsk->mems_allowed);
 
-	if (need_loop)
+	if (need_loop) {
+		local_irq_disable();
 		write_seqcount_begin(&tsk->mems_allowed_seq);
+	}
 
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
 	tsk->mems_allowed = *newmems;
 
-	if (need_loop)
+	if (need_loop) {
 		write_seqcount_end(&tsk->mems_allowed_seq);
+		local_irq_enable();
+	}
 
 	task_unlock(tsk);
 }
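Editor's aside: the cpuset hunk brackets the seqcount write section with local_irq_disable()/local_irq_enable(), so a reader running in interrupt context on the same CPU cannot spin forever on an odd sequence count while the update is half done. A sketch of both sides of that pattern, with hypothetical example_* names (write_seqcount_begin/end, read_seqcount_begin/retry, seqcount_init and local_irq_disable/enable are the real APIs):

#include <linux/irqflags.h>
#include <linux/nodemask.h>
#include <linux/seqlock.h>

/* Illustrative only. */
static seqcount_t example_seq;
static nodemask_t example_mask;

static void example_init(void)
{
	seqcount_init(&example_seq);
}

/* Writer: irqs off so an in-irq reader on this CPU cannot livelock. */
static void example_write(const nodemask_t *newmems)
{
	local_irq_disable();
	write_seqcount_begin(&example_seq);
	example_mask = *newmems;
	write_seqcount_end(&example_seq);
	local_irq_enable();
}

/* Reader: retry until a consistent snapshot is observed. */
static nodemask_t example_read(void)
{
	nodemask_t snapshot;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&example_seq);
		snapshot = example_mask;
	} while (read_seqcount_retry(&example_seq, seq));

	return snapshot;
}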
diff --git a/kernel/extable.c b/kernel/extable.c
index 832cb28105bb..763faf037ec1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 static inline int init_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_sinittext &&
-	    addr <= (unsigned long)_einittext)
+	    addr < (unsigned long)_einittext)
 		return 1;
 	return 0;
 }
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
 int core_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext &&
-	    addr <= (unsigned long)_etext)
+	    addr < (unsigned long)_etext)
 		return 1;
 
 	if (system_state == SYSTEM_BOOTING &&
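Editor's aside: both extable fixes are the same off-by-one — the _etext/_einittext linker symbols point one byte past their section, so a text address is only valid in the half-open range [start, end). A tiny illustration (the helper name is hypothetical, not from the patch):

/* Illustrative only: half-open range test for a section whose "end"
 * symbol points one past the last byte of the section. */
static inline int example_in_section(unsigned long addr,
				     unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;	/* '<', not '<=' */
}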
diff --git a/kernel/padata.c b/kernel/padata.c
index 07af2c95dcfe..2abd25d79cc8 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 
 static int padata_cpu_hash(struct parallel_data *pd)
 {
+	unsigned int seq_nr;
 	int cpu_index;
 
 	/*
@@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd)
 	 * seq_nr mod. number of cpus in use.
 	 */
 
-	spin_lock(&pd->seq_lock);
-	cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
-	pd->seq_nr++;
-	spin_unlock(&pd->seq_lock);
+	seq_nr = atomic_inc_return(&pd->seq_nr);
+	cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
 
 	return padata_index_to_cpu(pd, cpu_index);
 }
@@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 	padata_init_pqueues(pd);
 	padata_init_squeues(pd);
 	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
-	pd->seq_nr = 0;
+	atomic_set(&pd->seq_nr, -1);
 	atomic_set(&pd->reorder_objects, 0);
 	atomic_set(&pd->refcnt, 0);
 	pd->pinst = pinst;
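Editor's aside: the padata hunks trade a spinlock-protected counter for atomic_inc_return(), seeding the counter at -1 so the first increment hashes to index 0. A sketch of the lock-free round-robin pattern, with hypothetical example_* names (atomic_inc_return, ATOMIC_INIT and cpumask_weight are the real APIs):

#include <linux/atomic.h>
#include <linux/cpumask.h>

/* Illustrative only. Seeded at -1 so the first atomic_inc_return()
 * yields 0 and the first object lands on the first CPU in the mask. */
static atomic_t example_seq = ATOMIC_INIT(-1);

static int example_pick_index(const struct cpumask *mask)
{
	unsigned int seq = atomic_inc_return(&example_seq);

	return seq % cpumask_weight(mask);
}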
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 22fa55696760..0e9f9eaade2f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
 
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
-	if (unlikely(ftrace_disabled))
-		return -ENODEV;
-
 	if (FTRACE_WARN_ON(ops == &global_ops))
 		return -EINVAL;
 
@@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
-	if (ftrace_disabled)
-		return -ENODEV;
-
 	if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
 		return -EBUSY;
 
@@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command)
 static int ftrace_startup(struct ftrace_ops *ops, int command)
 {
 	bool hash_enable = true;
+	int ret;
 
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
+	ret = __register_ftrace_function(ops);
+	if (ret)
+		return ret;
+
 	ftrace_start_up++;
 	command |= FTRACE_UPDATE_CALLS;
 
@@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
 	return 0;
 }
 
-static void ftrace_shutdown(struct ftrace_ops *ops, int command)
+static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 {
 	bool hash_disable = true;
+	int ret;
 
 	if (unlikely(ftrace_disabled))
-		return;
+		return -ENODEV;
+
+	ret = __unregister_ftrace_function(ops);
+	if (ret)
+		return ret;
 
 	ftrace_start_up--;
 	/*
@@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
 	}
 
 	if (!command || !ftrace_enabled)
-		return;
+		return 0;
 
 	ftrace_run_update_code(command);
+	return 0;
 }
 
 static void ftrace_startup_sysctl(void)
@@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void)
 	if (i == FTRACE_FUNC_HASHSIZE)
 		return;
 
-	ret = __register_ftrace_function(&trace_probe_ops);
-	if (!ret)
-		ret = ftrace_startup(&trace_probe_ops, 0);
+	ret = ftrace_startup(&trace_probe_ops, 0);
 
 	ftrace_probe_registered = 1;
 }
 
 static void __disable_ftrace_function_probe(void)
 {
-	int ret;
 	int i;
 
 	if (!ftrace_probe_registered)
@@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void)
 	}
 
 	/* no more funcs left */
-	ret = __unregister_ftrace_function(&trace_probe_ops);
-	if (!ret)
-		ftrace_shutdown(&trace_probe_ops, 0);
+	ftrace_shutdown(&trace_probe_ops, 0);
 
 	ftrace_probe_registered = 0;
 }
@@ -4366,12 +4366,15 @@ core_initcall(ftrace_nodyn_init);
 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
 static inline void ftrace_startup_enable(int command) { }
 /* Keep as macros so we do not need to define the commands */
 # define ftrace_startup(ops, command)			\
 	({						\
-		(ops)->flags |= FTRACE_OPS_FL_ENABLED;	\
-		0;					\
+		int ___ret = __register_ftrace_function(ops);	\
+		if (!___ret)					\
+			(ops)->flags |= FTRACE_OPS_FL_ENABLED;	\
+		___ret;						\
 	})
-# define ftrace_shutdown(ops, command)	do { } while (0)
+# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
+
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
 
@@ -4780,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 
 	mutex_lock(&ftrace_lock);
 
-	ret = __register_ftrace_function(ops);
-	if (!ret)
-		ret = ftrace_startup(ops, 0);
+	ret = ftrace_startup(ops, 0);
 
 	mutex_unlock(&ftrace_lock);
 
@@ -4801,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 	int ret;
 
 	mutex_lock(&ftrace_lock);
-	ret = __unregister_ftrace_function(ops);
-	if (!ret)
-		ftrace_shutdown(ops, 0);
+	ret = ftrace_shutdown(ops, 0);
 	mutex_unlock(&ftrace_lock);
 
 	return ret;
@@ -4997,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
 	return NOTIFY_DONE;
 }
 
+/* Just a place holder for function graph */
+static struct ftrace_ops fgraph_ops __read_mostly = {
+	.func = ftrace_stub,
+	.flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
+				FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
 int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 			trace_func_graph_ent_t entryfunc)
 {
@@ -5023,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_graph_return = retfunc;
 	ftrace_graph_entry = entryfunc;
 
-	ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
+	ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
 
 out:
 	mutex_unlock(&ftrace_lock);
@@ -5040,7 +5046,7 @@ void unregister_ftrace_graph(void)
 	ftrace_graph_active--;
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
-	ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
+	ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
 	unregister_pm_notifier(&ftrace_suspend_notifier);
 	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
 
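Editor's aside: the ftrace hunks fold __register_ftrace_function()/__unregister_ftrace_function() into ftrace_startup()/ftrace_shutdown(), so registration errors now surface through the public register/unregister entry points. From a caller's perspective nothing changes in how an ftrace_ops is used; a minimal usage sketch, assuming the example_* names and the 4-argument callback signature used by kernels of this era (pt_regs, not the later ftrace_regs):

#include <linux/ftrace.h>

/* Illustrative only -- not part of the patch. */
static void example_trace_cb(unsigned long ip, unsigned long parent_ip,
			     struct ftrace_ops *op, struct pt_regs *regs)
{
	/* called for every traced function while registered */
}

static struct ftrace_ops example_ops = {
	.func	= example_trace_cb,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};

static int example_register(void)
{
	/* after this patch, a failure inside __register_ftrace_function()
	 * propagates out of register_ftrace_function() via ftrace_startup() */
	return register_ftrace_function(&example_ops);
}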
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 987293d03ebc..c66912be990f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
 /* I: attributes used when instantiating standard unbound pools on demand */
 static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
 
+/* I: attributes used when instantiating ordered pools on demand */
+static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+
 struct workqueue_struct *system_wq __read_mostly;
 EXPORT_SYMBOL(system_wq);
 struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { }
 static inline void debug_work_deactivate(struct work_struct *work) { }
 #endif
 
-/* allocate ID and assign it to @pool */
+/**
+ * worker_pool_assign_id - allocate ID and assing it to @pool
+ * @pool: the pool pointer of interest
+ *
+ * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
+ * successfully, -errno on failure.
+ */
 static int worker_pool_assign_id(struct worker_pool *pool)
 {
 	int ret;
 
 	lockdep_assert_held(&wq_pool_mutex);
 
-	ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
+	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
+			GFP_KERNEL);
 	if (ret >= 0) {
 		pool->id = ret;
 		return 0;
@@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 
 	debug_work_activate(work);
 
-	/* if dying, only works from the same workqueue are allowed */
+	/* if draining, only works from the same workqueue are allowed */
 	if (unlikely(wq->flags & __WQ_DRAINING) &&
 	    WARN_ON_ONCE(!is_chained_work(wq)))
 		return;
@@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool)
 	if (IS_ERR(worker->task))
 		goto fail;
 
+	set_user_nice(worker->task, pool->attrs->nice);
+
+	/* prevent userland from meddling with cpumask of workqueue workers */
+	worker->task->flags |= PF_NO_SETAFFINITY;
+
 	/*
 	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
 	 * online CPUs.  It'll be re-applied when any of the CPUs come up.
 	 */
-	set_user_nice(worker->task, pool->attrs->nice);
 	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
 
-	/* prevent userland from meddling with cpumask of workqueue workers */
-	worker->task->flags |= PF_NO_SETAFFINITY;
-
 	/*
 	 * The caller is responsible for ensuring %POOL_DISASSOCIATED
 	 * remains stable across this function.  See the comments above the
@@ -4106,7 +4117,7 @@ out_unlock:
 static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 {
 	bool highpri = wq->flags & WQ_HIGHPRI;
-	int cpu;
+	int cpu, ret;
 
 	if (!(wq->flags & WQ_UNBOUND)) {
 		wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4126,6 +4137,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 			mutex_unlock(&wq->mutex);
 		}
 		return 0;
+	} else if (wq->flags & __WQ_ORDERED) {
+		ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+		/* there should only be single pwq for ordering guarantee */
+		WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+			      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+		     "ordering guarantee broken for workqueue %s\n", wq->name);
+		return ret;
 	} else {
 		return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
 	}
@@ -5009,10 +5027,6 @@ static int __init init_workqueues(void)
 	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
 	int i, cpu;
 
-	/* make sure we have enough bits for OFFQ pool ID */
-	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
-		     WORK_CPU_END * NR_STD_WORKER_POOLS);
-
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5051,13 +5065,23 @@ static int __init init_workqueues(void)
 		}
 	}
 
-	/* create default unbound wq attrs */
+	/* create default unbound and ordered wq attrs */
 	for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
 		struct workqueue_attrs *attrs;
 
 		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
 		attrs->nice = std_nice[i];
 		unbound_std_wq_attrs[i] = attrs;
+
+		/*
+		 * An ordered wq should have only one pwq as ordering is
+		 * guaranteed by max_active which is enforced by pwqs.
+		 * Turn off NUMA so that dfl_pwq is used for all nodes.
+		 */
+		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+		attrs->nice = std_nice[i];
+		attrs->no_numa = true;
+		ordered_wq_attrs[i] = attrs;
 	}
 
 	system_wq = alloc_workqueue("events", 0, 0);
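Editor's aside: the workqueue hunks give __WQ_ORDERED workqueues their own attrs with no_numa set, so an ordered unbound workqueue keeps a single pool_workqueue and its one-at-a-time, in-queueing-order guarantee also holds on NUMA machines. From the caller's side that guarantee is simply what alloc_ordered_workqueue() promises; a minimal usage sketch (example_* names are hypothetical, the APIs are real):

#include <linux/workqueue.h>

/* Illustrative only -- not part of the patch. */
static struct workqueue_struct *example_ordered_wq;
static struct work_struct example_a, example_b;

static void example_fn(struct work_struct *work)
{
	/* work items on an ordered wq never run concurrently with each other */
}

static int example_setup(void)
{
	example_ordered_wq = alloc_ordered_workqueue("example_ordered", 0);
	if (!example_ordered_wq)
		return -ENOMEM;

	INIT_WORK(&example_a, example_fn);
	INIT_WORK(&example_b, example_fn);

	/* executes example_a to completion, then example_b: max_active is 1 */
	queue_work(example_ordered_wq, &example_a);
	queue_work(example_ordered_wq, &example_b);
	return 0;
}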
