-rw-r--r--	Documentation/workqueue.txt	103
-rw-r--r--	kernel/workqueue.c		100
2 files changed, 65 insertions(+), 138 deletions(-)
diff --git a/Documentation/workqueue.txt b/Documentation/workqueue.txt
index a0b577de918f..a6ab4b62d926 100644
--- a/Documentation/workqueue.txt
+++ b/Documentation/workqueue.txt
@@ -89,25 +89,28 @@ called thread-pools.
 
 The cmwq design differentiates between the user-facing workqueues that
 subsystems and drivers queue work items on and the backend mechanism
-which manages thread-pool and processes the queued work items.
+which manages thread-pools and processes the queued work items.
 
 The backend is called gcwq. There is one gcwq for each possible CPU
-and one gcwq to serve work items queued on unbound workqueues.
+and one gcwq to serve work items queued on unbound workqueues. Each
+gcwq has two thread-pools - one for normal work items and the other
+for high priority ones.
 
 Subsystems and drivers can create and queue work items through special
 workqueue API functions as they see fit. They can influence some
 aspects of the way the work items are executed by setting flags on the
 workqueue they are putting the work item on. These flags include
-things like CPU locality, reentrancy, concurrency limits and more. To
-get a detailed overview refer to the API description of
+things like CPU locality, reentrancy, concurrency limits, priority and
+more. To get a detailed overview refer to the API description of
 alloc_workqueue() below.
 
-When a work item is queued to a workqueue, the target gcwq is
-determined according to the queue parameters and workqueue attributes
-and appended on the shared worklist of the gcwq. For example, unless
-specifically overridden, a work item of a bound workqueue will be
-queued on the worklist of exactly that gcwq that is associated to the
-CPU the issuer is running on.
+When a work item is queued to a workqueue, the target gcwq and
+thread-pool is determined according to the queue parameters and
+workqueue attributes and appended on the shared worklist of the
+thread-pool. For example, unless specifically overridden, a work item
+of a bound workqueue will be queued on the worklist of either normal
+or highpri thread-pool of the gcwq that is associated to the CPU the
+issuer is running on.
 
 For any worker pool implementation, managing the concurrency level
 (how many execution contexts are active) is an important issue. cmwq
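
For orientation, a minimal caller-side sketch of the user-facing API the paragraph above describes (hypothetical names, not part of this patch); the choice of gcwq and thread-pool happens entirely inside the workqueue core:

	#include <linux/workqueue.h>

	static void my_work_fn(struct work_struct *work)
	{
		/* runs in a kworker picked from the selected thread-pool */
	}
	static DECLARE_WORK(my_work, my_work_fn);

	static struct workqueue_struct *my_wq;

	static int my_setup(void)
	{
		/* a plain bound workqueue: no special flags, default max_active */
		my_wq = alloc_workqueue("my_wq", 0, 0);
		if (!my_wq)
			return -ENOMEM;

		/* appended to the worklist of the gcwq of the issuing CPU */
		queue_work(my_wq, &my_work);
		return 0;
	}
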
@@ -115,26 +118,26 @@ tries to keep the concurrency at a minimal but sufficient level.
 Minimal to save resources and sufficient in that the system is used at
 its full capacity.
 
-Each gcwq bound to an actual CPU implements concurrency management by
-hooking into the scheduler. The gcwq is notified whenever an active
-worker wakes up or sleeps and keeps track of the number of the
-currently runnable workers. Generally, work items are not expected to
-hog a CPU and consume many cycles. That means maintaining just enough
-concurrency to prevent work processing from stalling should be
-optimal. As long as there are one or more runnable workers on the
-CPU, the gcwq doesn't start execution of a new work, but, when the
-last running worker goes to sleep, it immediately schedules a new
-worker so that the CPU doesn't sit idle while there are pending work
-items. This allows using a minimal number of workers without losing
-execution bandwidth.
+Each thread-pool bound to an actual CPU implements concurrency
+management by hooking into the scheduler. The thread-pool is notified
+whenever an active worker wakes up or sleeps and keeps track of the
+number of the currently runnable workers. Generally, work items are
+not expected to hog a CPU and consume many cycles. That means
+maintaining just enough concurrency to prevent work processing from
+stalling should be optimal. As long as there are one or more runnable
+workers on the CPU, the thread-pool doesn't start execution of a new
+work, but, when the last running worker goes to sleep, it immediately
+schedules a new worker so that the CPU doesn't sit idle while there
+are pending work items. This allows using a minimal number of workers
+without losing execution bandwidth.
 
 Keeping idle workers around doesn't cost other than the memory space
 for kthreads, so cmwq holds onto idle ones for a while before killing
 them.
 
 For an unbound wq, the above concurrency management doesn't apply and
-the gcwq for the pseudo unbound CPU tries to start executing all work
-items as soon as possible. The responsibility of regulating
+the thread-pools for the pseudo unbound CPU try to start executing all
+work items as soon as possible. The responsibility of regulating
 concurrency level is on the users. There is also a flag to mark a
 bound wq to ignore the concurrency management. Please refer to the
 API section for details.
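
A condensed sketch of the scheduler hook described above (helper names follow kernel/workqueue.c, but the function name and body here are simplified illustrations, not the patch's code): when the last runnable worker of a pool goes to sleep while work is still pending, an idle worker from the same pool is woken to take over.

	/* simplified sketch of the callback invoked when a worker sleeps */
	static struct task_struct *worker_going_to_sleep(struct worker *worker)
	{
		struct worker_pool *pool = worker->pool;
		atomic_t *nr_running = get_pool_nr_running(pool);

		/* last runnable worker of this pool, and work still pending? */
		if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
			return first_worker(pool)->task;	/* wake this one */

		return NULL;					/* nothing to do */
	}
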
@@ -205,31 +208,22 @@ resources, scheduled and executed.
 
   WQ_HIGHPRI
 
-	Work items of a highpri wq are queued at the head of the
-	worklist of the target gcwq and start execution regardless of
-	the current concurrency level. In other words, highpri work
-	items will always start execution as soon as execution
-	resource is available.
+	Work items of a highpri wq are queued to the highpri
+	thread-pool of the target gcwq. Highpri thread-pools are
+	served by worker threads with elevated nice level.
 
-	Ordering among highpri work items is preserved - a highpri
-	work item queued after another highpri work item will start
-	execution after the earlier highpri work item starts.
-
-	Although highpri work items are not held back by other
-	runnable work items, they still contribute to the concurrency
-	level. Highpri work items in runnable state will prevent
-	non-highpri work items from starting execution.
-
-	This flag is meaningless for unbound wq.
+	Note that normal and highpri thread-pools don't interact with
+	each other. Each maintain its separate pool of workers and
+	implements concurrency management among its workers.
 
   WQ_CPU_INTENSIVE
 
 	Work items of a CPU intensive wq do not contribute to the
 	concurrency level. In other words, runnable CPU intensive
-	work items will not prevent other work items from starting
-	execution. This is useful for bound work items which are
-	expected to hog CPU cycles so that their execution is
-	regulated by the system scheduler.
+	work items will not prevent other work items in the same
+	thread-pool from starting execution. This is useful for bound
+	work items which are expected to hog CPU cycles so that their
+	execution is regulated by the system scheduler.
 
 	Although CPU intensive work items don't contribute to the
 	concurrency level, start of their executions is still
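
As a caller-side sketch (hypothetical, not from this patch), nothing changes in how a highpri workqueue is requested; with this series its work items simply land on the second, elevated-nice pool of the local gcwq instead of being spliced to the head of a shared worklist:

	static struct workqueue_struct *hi_wq;

	static int hi_wq_setup(void)
	{
		/* work queued on hi_wq goes to the highpri thread-pool of the
		 * issuing CPU's gcwq and runs in nice -20 kworkers */
		hi_wq = alloc_workqueue("hi_wq", WQ_HIGHPRI, 0);
		return hi_wq ? 0 : -ENOMEM;
	}
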
@@ -239,14 +233,6 @@ resources, scheduled and executed.
 
 	This flag is meaningless for unbound wq.
 
-  WQ_HIGHPRI | WQ_CPU_INTENSIVE
-
-	This combination makes the wq avoid interaction with
-	concurrency management completely and behave as a simple
-	per-CPU execution context provider. Work items queued on a
-	highpri CPU-intensive wq start execution as soon as resources
-	are available and don't affect execution of other work items.
-
 @max_active:
 
 @max_active determines the maximum number of execution contexts per
@@ -328,20 +314,7 @@ If @max_active == 2,
  35		w2 wakes up and finishes
 
 Now, let's assume w1 and w2 are queued to a different wq q1 which has
-WQ_HIGHPRI set,
-
- TIME IN MSECS	EVENT
- 0		w1 and w2 start and burn CPU
- 5		w1 sleeps
- 10		w2 sleeps
- 10		w0 starts and burns CPU
- 15		w0 sleeps
- 15		w1 wakes up and finishes
- 20		w2 wakes up and finishes
- 25		w0 wakes up and burns CPU
- 30		w0 finishes
-
-If q1 has WQ_CPU_INTENSIVE set,
+WQ_CPU_INTENSIVE set,
 
  TIME IN MSECS	EVENT
  0		w0 starts and burns CPU
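
For reference, the w0/w1/w2 behaviour behind these timelines can be sketched as below (hypothetical work functions consistent with the scenario described earlier in the document: w0 burns CPU for 5ms, sleeps 10ms, then burns another 5ms; w1 and w2 burn 5ms and sleep 10ms):

	#include <linux/delay.h>
	#include <linux/workqueue.h>

	/* mdelay() busy-burns the CPU, msleep() sleeps */
	static void w0_fn(struct work_struct *work)
	{
		mdelay(5);	/* burn CPU for 5ms */
		msleep(10);	/* sleep for 10ms */
		mdelay(5);	/* burn another 5ms, then finish */
	}

	static void w1_fn(struct work_struct *work)
	{
		mdelay(5);
		msleep(10);
	}

	static void w2_fn(struct work_struct *work)
	{
		mdelay(5);
		msleep(10);
	}
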
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b0daaea44eaa..4fa9e3552f1e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -52,7 +52,6 @@ enum {
 	/* pool flags */
 	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
 	POOL_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
-	POOL_HIGHPRI_PENDING	= 1 << 2,	/* highpri works on queue */
 
 	/* worker flags */
 	WORKER_STARTED		= 1 << 0,	/* started */
@@ -74,7 +73,7 @@ enum {
 	TRUSTEE_RELEASE		= 3,		/* release workers */
 	TRUSTEE_DONE		= 4,		/* trustee is done */
 
-	NR_WORKER_POOLS		= 1,		/* # worker pools per gcwq */
+	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */
 
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
 	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
@@ -95,6 +94,7 @@ enum {
 	 * all cpus. Give -20.
 	 */
 	RESCUER_NICE_LEVEL	= -20,
+	HIGHPRI_NICE_LEVEL	= -20,
 };
 
 /*
@@ -174,7 +174,7 @@ struct global_cwq {
 	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
 						/* L: hash of busy workers */
 
-	struct worker_pool	pool;		/* the worker pools */
+	struct worker_pool	pools[2];	/* normal and highpri pools */
 
 	struct task_struct	*trustee;	/* L: for gcwq shutdown */
 	unsigned int		trustee_state;	/* L: trustee state */
@@ -277,7 +277,8 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 #include <trace/events/workqueue.h>
 
 #define for_each_worker_pool(pool, gcwq)				\
-	for ((pool) = &(gcwq)->pool; (pool); (pool) = NULL)
+	for ((pool) = &(gcwq)->pools[0];				\
+	     (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
 
 #define for_each_busy_worker(worker, i, pos, gcwq)			\
 	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
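
With the iterator rewritten above, existing users walk both pools transparently; a hypothetical call site (not in this patch) would look like:

	/* visits pools[0] (normal) and then pools[1] (highpri) */
	static void wake_both_pools(struct global_cwq *gcwq)
	{
		struct worker_pool *pool;

		for_each_worker_pool(pool, gcwq)
			wake_up_worker(pool);
	}
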
@@ -473,6 +474,11 @@ static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
 
 static int worker_thread(void *__worker);
 
+static int worker_pool_pri(struct worker_pool *pool)
+{
+	return pool - pool->gcwq->pools;
+}
+
 static struct global_cwq *get_gcwq(unsigned int cpu)
 {
 	if (cpu != WORK_CPU_UNBOUND)
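
worker_pool_pri() relies purely on array layout: subtracting the gcwq's pools[] base from a pool pointer yields 0 for the normal pool and 1 for the highpri one. A small illustration (hypothetical helper, not part of the patch):

	static void show_pool_indices(struct global_cwq *gcwq)
	{
		/* indices recovered by pointer subtraction against gcwq->pools[] */
		int normal  = worker_pool_pri(&gcwq->pools[0]);	/* == 0 */
		int highpri = worker_pool_pri(&gcwq->pools[1]);	/* == 1 */

		pr_info("normal=%d highpri=%d\n", normal, highpri);
	}
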
@@ -484,7 +490,7 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
 static atomic_t *get_pool_nr_running(struct worker_pool *pool)
 {
 	int cpu = pool->gcwq->cpu;
-	int idx = 0;
+	int idx = worker_pool_pri(pool);
 
 	if (cpu != WORK_CPU_UNBOUND)
 		return &per_cpu(pool_nr_running, cpu)[idx];
@@ -586,15 +592,14 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 }
 
 /*
- * Policy functions. These define the policies on how the global
- * worker pool is managed. Unless noted otherwise, these functions
- * assume that they're being called with gcwq->lock held.
+ * Policy functions. These define the policies on how the global worker
+ * pools are managed. Unless noted otherwise, these functions assume that
+ * they're being called with gcwq->lock held.
  */
 
 static bool __need_more_worker(struct worker_pool *pool)
 {
-	return !atomic_read(get_pool_nr_running(pool)) ||
-		(pool->flags & POOL_HIGHPRI_PENDING);
+	return !atomic_read(get_pool_nr_running(pool));
 }
 
 /*
@@ -621,9 +626,7 @@ static bool keep_working(struct worker_pool *pool)
 {
 	atomic_t *nr_running = get_pool_nr_running(pool);
 
-	return !list_empty(&pool->worklist) &&
-		(atomic_read(nr_running) <= 1 ||
-		 (pool->flags & POOL_HIGHPRI_PENDING));
+	return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
 }
 
 /* Do we need a new worker? Called from manager. */
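
Both simplifications above fall out of the pool split: nr_running is now tracked per pool, so a pending highpri work item drives its own pool's counter and wakes its own workers, and the POOL_HIGHPRI_PENDING special case in __need_more_worker() and keep_working() is no longer needed.
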
@@ -892,43 +895,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 }
 
 /**
- * pool_determine_ins_pos - find insertion position
- * @pool: pool of interest
- * @cwq: cwq a work is being queued for
- *
- * A work for @cwq is about to be queued on @pool, determine insertion
- * position for the work. If @cwq is for HIGHPRI wq, the work is
- * queued at the head of the queue but in FIFO order with respect to
- * other HIGHPRI works; otherwise, at the end of the queue. This
- * function also sets POOL_HIGHPRI_PENDING flag to hint @pool that
- * there are HIGHPRI works pending.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to inserstion position.
- */
-static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool,
-					       struct cpu_workqueue_struct *cwq)
-{
-	struct work_struct *twork;
-
-	if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
-		return &pool->worklist;
-
-	list_for_each_entry(twork, &pool->worklist, entry) {
-		struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
-
-		if (!(tcwq->wq->flags & WQ_HIGHPRI))
-			break;
-	}
-
-	pool->flags |= POOL_HIGHPRI_PENDING;
-	return &twork->entry;
-}
-
-/**
  * insert_work - insert a work into gcwq
  * @cwq: cwq @work belongs to
  * @work: work to insert
@@ -1068,7 +1034,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	if (likely(cwq->nr_active < cwq->max_active)) {
 		trace_workqueue_activate_work(work);
 		cwq->nr_active++;
-		worklist = pool_determine_ins_pos(cwq->pool, cwq);
+		worklist = &cwq->pool->worklist;
 	} else {
 		work_flags |= WORK_STRUCT_DELAYED;
 		worklist = &cwq->delayed_works;
@@ -1385,6 +1351,7 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind)
 {
 	struct global_cwq *gcwq = pool->gcwq;
 	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
+	const char *pri = worker_pool_pri(pool) ? "H" : "";
 	struct worker *worker = NULL;
 	int id = -1;
 
@@ -1406,15 +1373,17 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind)
 
 	if (!on_unbound_cpu)
 		worker->task = kthread_create_on_node(worker_thread,
-						      worker,
-						      cpu_to_node(gcwq->cpu),
-						      "kworker/%u:%d", gcwq->cpu, id);
+					worker, cpu_to_node(gcwq->cpu),
+					"kworker/%u:%d%s", gcwq->cpu, id, pri);
 	else
 		worker->task = kthread_create(worker_thread, worker,
-					      "kworker/u:%d", id);
+					      "kworker/u:%d%s", id, pri);
 	if (IS_ERR(worker->task))
 		goto fail;
 
+	if (worker_pool_pri(pool))
+		set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
+
 	/*
 	 * A rogue worker will become a regular one if CPU comes
 	 * online later on. Make sure every worker has
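
With the name format change above, highpri workers become visible by name: normal-pool workers keep the familiar kworker/0:1 style while highpri-pool workers show up as, e.g., kworker/0:1H (kworker/u:2H on the unbound gcwq), which makes the elevated-nice threads easy to spot in ps output.
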
@@ -1761,10 +1730,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
 {
 	struct work_struct *work = list_first_entry(&cwq->delayed_works,
 						    struct work_struct, entry);
-	struct list_head *pos = pool_determine_ins_pos(cwq->pool, cwq);
 
 	trace_workqueue_activate_work(work);
-	move_linked_works(work, pos, NULL);
+	move_linked_works(work, &cwq->pool->worklist, NULL);
 	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
 	cwq->nr_active++;
 }
@@ -1880,21 +1848,6 @@ __acquires(&gcwq->lock)
 	list_del_init(&work->entry);
 
 	/*
-	 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
-	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
-	 */
-	if (unlikely(pool->flags & POOL_HIGHPRI_PENDING)) {
-		struct work_struct *nwork = list_first_entry(&pool->worklist,
-					struct work_struct, entry);
-
-		if (!list_empty(&pool->worklist) &&
-		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
-			wake_up_worker(pool);
-		else
-			pool->flags &= ~POOL_HIGHPRI_PENDING;
-	}
-
-	/*
 	 * CPU intensive works don't participate in concurrency
 	 * management. They're the scheduler's responsibility.
 	 */
@@ -3047,9 +3000,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 	for_each_cwq_cpu(cpu, wq) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		struct global_cwq *gcwq = get_gcwq(cpu);
+		int pool_idx = (bool)(flags & WQ_HIGHPRI);
 
 		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
-		cwq->pool = &gcwq->pool;
+		cwq->pool = &gcwq->pools[pool_idx];
 		cwq->wq = wq;
 		cwq->flush_color = -1;
 		cwq->max_active = max_active;