author		Tejun Heo <tj@kernel.org>	2012-07-14 01:16:45 -0400
committer	Tejun Heo <tj@kernel.org>	2012-07-14 01:24:45 -0400
commit		3270476a6c0ce322354df8679652f060d66526dc
tree		db58846beb7c5e1c1b50b7e2f1c2538320408c26 /kernel/workqueue.c
parent		4ce62e9e30cacc26885cab133ad1de358dd79f21
workqueue: reimplement WQ_HIGHPRI using a separate worker_pool
WQ_HIGHPRI was implemented by queueing highpri work items at the head
of the global worklist. Other than queueing at the head, they weren't
handled differently; unfortunately, this could lead to execution
latency of a few seconds on heavily loaded systems.
Now that workqueue code has been updated to deal with multiple
worker_pools per global_cwq, this patch reimplements WQ_HIGHPRI using
a separate worker_pool. NR_WORKER_POOLS is bumped to two and
gcwq->pools[0] is used for normal pri work items and ->pools[1] for
highpri. Highpri workers get -20 nice level and have an 'H' suffix in
their names. Note that this change increases the number of kworkers
per cpu.
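For illustration only (a minimal sketch, not part of the patch): a driver
still requests the highpri pool exactly as before, through WQ_HIGHPRI; what
changes is the backing pool and worker behaviour. The queue name and work
item below are hypothetical.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *foo_wq;		/* hypothetical example queue */

static void foo_fn(struct work_struct *work) { }
static DECLARE_WORK(foo_work, foo_fn);

static int __init foo_init(void)
{
	/* With this patch, work queued here is served by gcwq->pools[1],
	 * i.e. by nice -20 workers named e.g. "kworker/0:1H". */
	foo_wq = alloc_workqueue("foo_wq", WQ_HIGHPRI, 0);
	if (!foo_wq)
		return -ENOMEM;

	queue_work(foo_wq, &foo_work);
	return 0;
}
module_init(foo_init);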
POOL_HIGHPRI_PENDING, pool_determine_ins_pos() and highpri chain
wakeup code in process_one_work() are no longer used and removed.
This allows proper prioritization of highpri work items and removes
high execution latency of highpri work items.
v2: nr_running indexing bug in get_pool_nr_running() fixed.
v3: Refreshed for the get_pool_nr_running() update in the previous
patch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Josh Hunt <joshhunt00@gmail.com>
LKML-Reference: <CAKA=qzaHqwZ8eqpLNFjxnO2fX-tgAOjmpvxgBFjv6dJeQaOW1w@mail.gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--	kernel/workqueue.c	100
1 file changed, 27 insertions(+), 73 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b0daaea44eaa..4fa9e3552f1e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -52,7 +52,6 @@ enum {
 	/* pool flags */
 	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
 	POOL_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
-	POOL_HIGHPRI_PENDING	= 1 << 2,	/* highpri works on queue */
 
 	/* worker flags */
 	WORKER_STARTED		= 1 << 0,	/* started */
@@ -74,7 +73,7 @@ enum {
 	TRUSTEE_RELEASE		= 3,		/* release workers */
 	TRUSTEE_DONE		= 4,		/* trustee is done */
 
-	NR_WORKER_POOLS		= 1,		/* # worker pools per gcwq */
+	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */
 
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
 	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
@@ -95,6 +94,7 @@ enum {
 	 * all cpus.  Give -20.
 	 */
 	RESCUER_NICE_LEVEL	= -20,
+	HIGHPRI_NICE_LEVEL	= -20,
 };
 
 /*
@@ -174,7 +174,7 @@ struct global_cwq {
 	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
 						/* L: hash of busy workers */
 
-	struct worker_pool	pool;		/* the worker pools */
+	struct worker_pool	pools[2];	/* normal and highpri pools */
 
 	struct task_struct	*trustee;	/* L: for gcwq shutdown */
 	unsigned int		trustee_state;	/* L: trustee state */
@@ -277,7 +277,8 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 #include <trace/events/workqueue.h>
 
 #define for_each_worker_pool(pool, gcwq)				\
-	for ((pool) = &(gcwq)->pool; (pool); (pool) = NULL)
+	for ((pool) = &(gcwq)->pools[0];				\
+	     (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
 
 #define for_each_busy_worker(worker, i, pos, gcwq)			\
 	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
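As a point of reference, the rewritten iterator now walks both elements of
pools[]; a hedged sketch of a typical call site (the loop body here is
illustrative only, not taken from this patch):

	struct worker_pool *pool;

	/* visits &gcwq->pools[0] (normal) and then &gcwq->pools[1] (highpri) */
	for_each_worker_pool(pool, gcwq)
		INIT_LIST_HEAD(&pool->worklist);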
@@ -473,6 +474,11 @@ static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
 
 static int worker_thread(void *__worker);
 
+static int worker_pool_pri(struct worker_pool *pool)
+{
+	return pool - pool->gcwq->pools;
+}
+
 static struct global_cwq *get_gcwq(unsigned int cpu)
 {
 	if (cpu != WORK_CPU_UNBOUND)
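worker_pool_pri() recovers the pool index from pointer arithmetic against the
pools[] array embedded in the gcwq. A standalone userspace sketch of the same
arithmetic, using mock types (the real helper reaches the array through
pool->gcwq):

#include <stdio.h>

struct worker_pool { int unused; };
struct global_cwq { struct worker_pool pools[2]; };	/* mirrors the new layout */

/* index of @pool within @gcwq->pools[], exactly like worker_pool_pri() */
static int pool_index(struct global_cwq *gcwq, struct worker_pool *pool)
{
	return pool - gcwq->pools;
}

int main(void)
{
	struct global_cwq gcwq;

	printf("%d %d\n", pool_index(&gcwq, &gcwq.pools[0]),
	       pool_index(&gcwq, &gcwq.pools[1]));	/* prints "0 1" */
	return 0;
}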
@@ -484,7 +490,7 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
 static atomic_t *get_pool_nr_running(struct worker_pool *pool)
 {
 	int cpu = pool->gcwq->cpu;
-	int idx = 0;
+	int idx = worker_pool_pri(pool);
 
 	if (cpu != WORK_CPU_UNBOUND)
 		return &per_cpu(pool_nr_running, cpu)[idx];
@@ -586,15 +592,14 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 }
 
 /*
- * Policy functions.  These define the policies on how the global
- * worker pool is managed.  Unless noted otherwise, these functions
- * assume that they're being called with gcwq->lock held.
+ * Policy functions.  These define the policies on how the global worker
+ * pools are managed.  Unless noted otherwise, these functions assume that
+ * they're being called with gcwq->lock held.
  */
 
 static bool __need_more_worker(struct worker_pool *pool)
 {
-	return !atomic_read(get_pool_nr_running(pool)) ||
-		(pool->flags & POOL_HIGHPRI_PENDING);
+	return !atomic_read(get_pool_nr_running(pool));
 }
 
 /*
@@ -621,9 +626,7 @@ static bool keep_working(struct worker_pool *pool)
 {
 	atomic_t *nr_running = get_pool_nr_running(pool);
 
-	return !list_empty(&pool->worklist) &&
-		(atomic_read(nr_running) <= 1 ||
-		 (pool->flags & POOL_HIGHPRI_PENDING));
+	return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
 }
 
 /* Do we need a new worker?  Called from manager. */
@@ -892,43 +895,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 }
 
 /**
- * pool_determine_ins_pos - find insertion position
- * @pool: pool of interest
- * @cwq: cwq a work is being queued for
- *
- * A work for @cwq is about to be queued on @pool, determine insertion
- * position for the work.  If @cwq is for HIGHPRI wq, the work is
- * queued at the head of the queue but in FIFO order with respect to
- * other HIGHPRI works; otherwise, at the end of the queue.  This
- * function also sets POOL_HIGHPRI_PENDING flag to hint @pool that
- * there are HIGHPRI works pending.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to inserstion position.
- */
-static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool,
-					       struct cpu_workqueue_struct *cwq)
-{
-	struct work_struct *twork;
-
-	if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
-		return &pool->worklist;
-
-	list_for_each_entry(twork, &pool->worklist, entry) {
-		struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
-
-		if (!(tcwq->wq->flags & WQ_HIGHPRI))
-			break;
-	}
-
-	pool->flags |= POOL_HIGHPRI_PENDING;
-	return &twork->entry;
-}
-
-/**
  * insert_work - insert a work into gcwq
  * @cwq: cwq @work belongs to
  * @work: work to insert
@@ -1068,7 +1034,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	if (likely(cwq->nr_active < cwq->max_active)) {
 		trace_workqueue_activate_work(work);
 		cwq->nr_active++;
-		worklist = pool_determine_ins_pos(cwq->pool, cwq);
+		worklist = &cwq->pool->worklist;
 	} else {
 		work_flags |= WORK_STRUCT_DELAYED;
 		worklist = &cwq->delayed_works;
@@ -1385,6 +1351,7 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind)
 {
 	struct global_cwq *gcwq = pool->gcwq;
 	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
+	const char *pri = worker_pool_pri(pool) ? "H" : "";
 	struct worker *worker = NULL;
 	int id = -1;
 
@@ -1406,15 +1373,17 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind)
 
 	if (!on_unbound_cpu)
 		worker->task = kthread_create_on_node(worker_thread,
-						      worker,
-						      cpu_to_node(gcwq->cpu),
-						      "kworker/%u:%d", gcwq->cpu, id);
+					worker, cpu_to_node(gcwq->cpu),
+					"kworker/%u:%d%s", gcwq->cpu, id, pri);
 	else
 		worker->task = kthread_create(worker_thread, worker,
-					      "kworker/u:%d", id);
+					      "kworker/u:%d%s", id, pri);
 	if (IS_ERR(worker->task))
 		goto fail;
 
+	if (worker_pool_pri(pool))
+		set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
+
 	/*
 	 * A rogue worker will become a regular one if CPU comes
 	 * online later on.  Make sure every worker has
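The format-string change above only affects the thread name; a small
userspace sketch of the names the two branches now generate (cpu and id
values are arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned int cpu = 0;
	int id = 2;
	const char *pri = "H";	/* "" for pools[0] workers, "H" for pools[1] */

	printf("kworker/%u:%d%s\n", cpu, id, pri);	/* bound:   kworker/0:2H */
	printf("kworker/u:%d%s\n", id, pri);		/* unbound: kworker/u:2H */
	return 0;
}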
@@ -1761,10 +1730,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
 {
 	struct work_struct *work = list_first_entry(&cwq->delayed_works,
 						    struct work_struct, entry);
-	struct list_head *pos = pool_determine_ins_pos(cwq->pool, cwq);
 
 	trace_workqueue_activate_work(work);
-	move_linked_works(work, pos, NULL);
+	move_linked_works(work, &cwq->pool->worklist, NULL);
 	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
 	cwq->nr_active++;
 }
@@ -1880,21 +1848,6 @@ __acquires(&gcwq->lock)
 	list_del_init(&work->entry);
 
 	/*
-	 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
-	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
-	 */
-	if (unlikely(pool->flags & POOL_HIGHPRI_PENDING)) {
-		struct work_struct *nwork = list_first_entry(&pool->worklist,
-					 struct work_struct, entry);
-
-		if (!list_empty(&pool->worklist) &&
-		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
-			wake_up_worker(pool);
-		else
-			pool->flags &= ~POOL_HIGHPRI_PENDING;
-	}
-
-	/*
 	 * CPU intensive works don't participate in concurrency
 	 * management.  They're the scheduler's responsibility.
 	 */
@@ -3047,9 +3000,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 	for_each_cwq_cpu(cpu, wq) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		struct global_cwq *gcwq = get_gcwq(cpu);
+		int pool_idx = (bool)(flags & WQ_HIGHPRI);
 
 		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
-		cwq->pool = &gcwq->pool;
+		cwq->pool = &gcwq->pools[pool_idx];
 		cwq->wq = wq;
 		cwq->flush_color = -1;
 		cwq->max_active = max_active;
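The pool selection in __alloc_workqueue_key() reduces to the WQ_HIGHPRI bit;
a standalone sketch of the index expression (the WQ_HIGHPRI value is copied
from the workqueue flags of this series and is shown here only for
illustration):

#include <stdio.h>
#include <stdbool.h>

#define WQ_HIGHPRI	(1 << 4)	/* illustrative; see include/linux/workqueue.h */

int main(void)
{
	unsigned int flags[] = { 0, WQ_HIGHPRI };

	/* (bool) collapses the masked bit to 0 or 1, so normal workqueues
	 * are wired to gcwq->pools[0] and highpri ones to gcwq->pools[1]. */
	for (int i = 0; i < 2; i++)
		printf("flags=%#x -> pools[%d]\n", flags[i],
		       (int)(bool)(flags[i] & WQ_HIGHPRI));
	return 0;
}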