author     Linus Torvalds <torvalds@linux-foundation.org>    2012-07-24 20:46:16 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-07-24 20:46:16 -0400
commit     a08489c569dc174cff97d2cb165aa81e3f1501cc (patch)
tree       c583700a11bab82ea864425004dd5bb03bf8a987
parent     08d9329c29ec98477e8ac2f7a513f2bfa3e9f3c5 (diff)
parent     6fec10a1a5866dda3cd6a825a521fc7c2f226ba5 (diff)
Merge branch 'for-3.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq
Pull workqueue changes from Tejun Heo:
"There are three major changes.
- WQ_HIGHPRI has been reimplemented so that high priority work items
are served by worker threads with -20 nice value from dedicated
highpri worker pools.
- CPU hotplug support has been reimplemented such that idle workers
are kept across CPU hotplug events. This makes CPU hotplug cheaper
(for PM) and makes the code simpler.
- flush_kthread_work() has been reimplemented so that a work item can
be freed while executing. This removes an annoying behavior
difference between kthread_worker and workqueue."
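
For context, a minimal sketch of how the kthread_worker interface touched by this series is used. It is illustrative only and not part of the merge: the demo_* identifiers are hypothetical, while init_kthread_worker(), kthread_worker_fn(), queue_kthread_work(), flush_kthread_work() and flush_kthread_worker() are the 3.6-era API that the patches below modify. After this series, flush no longer touches a work item once its callback has started, so an item may be freed from within its own callback.

```c
/*
 * Illustrative only -- not part of this merge.  demo_* identifiers are
 * hypothetical; the kthread_worker calls are the real 3.6-era API.
 */
#include <linux/err.h>
#include <linux/kthread.h>

static struct kthread_worker demo_worker;
static struct task_struct *demo_task;

static void demo_fn(struct kthread_work *work)
{
	/*
	 * Work body.  After this series the item may even be freed here,
	 * because flush_kthread_work() no longer dereferences the item
	 * once its callback has started executing.
	 */
}

static DEFINE_KTHREAD_WORK(demo_work, demo_fn);

static int demo_start(void)
{
	init_kthread_worker(&demo_worker);
	demo_task = kthread_run(kthread_worker_fn, &demo_worker, "demo_kworker");
	if (IS_ERR(demo_task))
		return PTR_ERR(demo_task);

	queue_kthread_work(&demo_worker, &demo_work);
	flush_kthread_work(&demo_work);		/* wait for demo_fn() to finish */
	return 0;
}

static void demo_stop(void)
{
	flush_kthread_worker(&demo_worker);	/* drain anything still queued */
	kthread_stop(demo_task);
}
```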
* 'for-3.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
workqueue: fix spurious CPU locality WARN from process_one_work()
kthread_worker: reimplement flush_kthread_work() to allow freeing the work item being executed
kthread_worker: reorganize to prepare for flush_kthread_work() reimplementation
workqueue: simplify CPU hotplug code
workqueue: remove CPU offline trustee
workqueue: don't butcher idle workers on an offline CPU
workqueue: reimplement CPU online rebinding to handle idle workers
workqueue: drop @bind from create_worker()
workqueue: use mutex for global_cwq manager exclusion
workqueue: ROGUE workers are UNBOUND workers
workqueue: drop CPU_DYING notifier operation
workqueue: perform cpu down operations from low priority cpu_notifier()
workqueue: reimplement WQ_HIGHPRI using a separate worker_pool
workqueue: introduce NR_WORKER_POOLS and for_each_worker_pool()
workqueue: separate out worker_pool flags
workqueue: use @pool instead of @gcwq or @cpu where applicable
workqueue: factor out worker_pool from global_cwq
workqueue: don't use WQ_HIGHPRI for unbound workqueues
-rw-r--r--  Documentation/workqueue.txt       |  103
-rw-r--r--  include/linux/cpu.h               |    5
-rw-r--r--  include/linux/kthread.h           |    8
-rw-r--r--  include/trace/events/workqueue.h  |    2
-rw-r--r--  kernel/kthread.c                  |   88
-rw-r--r--  kernel/workqueue.c                | 1144
6 files changed, 628 insertions, 722 deletions
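
To make the WQ_HIGHPRI change concrete before the diff: a workqueue allocated with WQ_HIGHPRI now has its work items queued to the dedicated highpri worker pool (served by nice -20 workers) of the target gcwq, instead of being placed at the head of the shared worklist. The sketch below is illustrative and not from this commit; the demo_hi* names are made up, while alloc_workqueue(), queue_work() and destroy_workqueue() are the existing workqueue API.

```c
/* Illustrative only -- demo_hi* names are hypothetical. */
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_hi_wq;

static void demo_hi_fn(struct work_struct *work)
{
	/* Runs on a worker of the local gcwq's highpri (nice -20) pool. */
}

static DECLARE_WORK(demo_hi_work, demo_hi_fn);

static int demo_hi_init(void)
{
	/* Per-CPU workqueue whose items go to the highpri worker pools. */
	demo_hi_wq = alloc_workqueue("demo_hi", WQ_HIGHPRI, 0);
	if (!demo_hi_wq)
		return -ENOMEM;

	queue_work(demo_hi_wq, &demo_hi_work);
	return 0;
}

static void demo_hi_exit(void)
{
	destroy_workqueue(demo_hi_wq);	/* flushes pending work before freeing */
}
```

As the updated documentation below notes, the normal and highpri pools of a gcwq do not interact; each maintains its own workers and does its own concurrency management.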
diff --git a/Documentation/workqueue.txt b/Documentation/workqueue.txt
index a0b577de918f..a6ab4b62d926 100644
--- a/Documentation/workqueue.txt
+++ b/Documentation/workqueue.txt
@@ -89,25 +89,28 @@ called thread-pools.
89 | 89 | ||
90 | The cmwq design differentiates between the user-facing workqueues that | 90 | The cmwq design differentiates between the user-facing workqueues that |
91 | subsystems and drivers queue work items on and the backend mechanism | 91 | subsystems and drivers queue work items on and the backend mechanism |
92 | which manages thread-pool and processes the queued work items. | 92 | which manages thread-pools and processes the queued work items. |
93 | 93 | ||
94 | The backend is called gcwq. There is one gcwq for each possible CPU | 94 | The backend is called gcwq. There is one gcwq for each possible CPU |
95 | and one gcwq to serve work items queued on unbound workqueues. | 95 | and one gcwq to serve work items queued on unbound workqueues. Each |
96 | gcwq has two thread-pools - one for normal work items and the other | ||
97 | for high priority ones. | ||
96 | 98 | ||
97 | Subsystems and drivers can create and queue work items through special | 99 | Subsystems and drivers can create and queue work items through special |
98 | workqueue API functions as they see fit. They can influence some | 100 | workqueue API functions as they see fit. They can influence some |
99 | aspects of the way the work items are executed by setting flags on the | 101 | aspects of the way the work items are executed by setting flags on the |
100 | workqueue they are putting the work item on. These flags include | 102 | workqueue they are putting the work item on. These flags include |
101 | things like CPU locality, reentrancy, concurrency limits and more. To | 103 | things like CPU locality, reentrancy, concurrency limits, priority and |
102 | get a detailed overview refer to the API description of | 104 | more. To get a detailed overview refer to the API description of |
103 | alloc_workqueue() below. | 105 | alloc_workqueue() below. |
104 | 106 | ||
105 | When a work item is queued to a workqueue, the target gcwq is | 107 | When a work item is queued to a workqueue, the target gcwq and |
106 | determined according to the queue parameters and workqueue attributes | 108 | thread-pool is determined according to the queue parameters and |
107 | and appended on the shared worklist of the gcwq. For example, unless | 109 | workqueue attributes and appended on the shared worklist of the |
108 | specifically overridden, a work item of a bound workqueue will be | 110 | thread-pool. For example, unless specifically overridden, a work item |
109 | queued on the worklist of exactly that gcwq that is associated to the | 111 | of a bound workqueue will be queued on the worklist of either normal |
110 | CPU the issuer is running on. | 112 | or highpri thread-pool of the gcwq that is associated to the CPU the |
113 | issuer is running on. | ||
111 | 114 | ||
112 | For any worker pool implementation, managing the concurrency level | 115 | For any worker pool implementation, managing the concurrency level |
113 | (how many execution contexts are active) is an important issue. cmwq | 116 | (how many execution contexts are active) is an important issue. cmwq |
@@ -115,26 +118,26 @@ tries to keep the concurrency at a minimal but sufficient level.
115 | Minimal to save resources and sufficient in that the system is used at | 118 | Minimal to save resources and sufficient in that the system is used at |
116 | its full capacity. | 119 | its full capacity. |
117 | 120 | ||
118 | Each gcwq bound to an actual CPU implements concurrency management by | 121 | Each thread-pool bound to an actual CPU implements concurrency |
119 | hooking into the scheduler. The gcwq is notified whenever an active | 122 | management by hooking into the scheduler. The thread-pool is notified |
120 | worker wakes up or sleeps and keeps track of the number of the | 123 | whenever an active worker wakes up or sleeps and keeps track of the |
121 | currently runnable workers. Generally, work items are not expected to | 124 | number of the currently runnable workers. Generally, work items are |
122 | hog a CPU and consume many cycles. That means maintaining just enough | 125 | not expected to hog a CPU and consume many cycles. That means |
123 | concurrency to prevent work processing from stalling should be | 126 | maintaining just enough concurrency to prevent work processing from |
124 | optimal. As long as there are one or more runnable workers on the | 127 | stalling should be optimal. As long as there are one or more runnable |
125 | CPU, the gcwq doesn't start execution of a new work, but, when the | 128 | workers on the CPU, the thread-pool doesn't start execution of a new |
126 | last running worker goes to sleep, it immediately schedules a new | 129 | work, but, when the last running worker goes to sleep, it immediately |
127 | worker so that the CPU doesn't sit idle while there are pending work | 130 | schedules a new worker so that the CPU doesn't sit idle while there |
128 | items. This allows using a minimal number of workers without losing | 131 | are pending work items. This allows using a minimal number of workers |
129 | execution bandwidth. | 132 | without losing execution bandwidth. |
130 | 133 | ||
131 | Keeping idle workers around doesn't cost other than the memory space | 134 | Keeping idle workers around doesn't cost other than the memory space |
132 | for kthreads, so cmwq holds onto idle ones for a while before killing | 135 | for kthreads, so cmwq holds onto idle ones for a while before killing |
133 | them. | 136 | them. |
134 | 137 | ||
135 | For an unbound wq, the above concurrency management doesn't apply and | 138 | For an unbound wq, the above concurrency management doesn't apply and |
136 | the gcwq for the pseudo unbound CPU tries to start executing all work | 139 | the thread-pools for the pseudo unbound CPU try to start executing all |
137 | items as soon as possible. The responsibility of regulating | 140 | work items as soon as possible. The responsibility of regulating |
138 | concurrency level is on the users. There is also a flag to mark a | 141 | concurrency level is on the users. There is also a flag to mark a |
139 | bound wq to ignore the concurrency management. Please refer to the | 142 | bound wq to ignore the concurrency management. Please refer to the |
140 | API section for details. | 143 | API section for details. |
@@ -205,31 +208,22 @@ resources, scheduled and executed.
205 | 208 | ||
206 | WQ_HIGHPRI | 209 | WQ_HIGHPRI |
207 | 210 | ||
208 | Work items of a highpri wq are queued at the head of the | 211 | Work items of a highpri wq are queued to the highpri |
209 | worklist of the target gcwq and start execution regardless of | 212 | thread-pool of the target gcwq. Highpri thread-pools are |
210 | the current concurrency level. In other words, highpri work | 213 | served by worker threads with elevated nice level. |
211 | items will always start execution as soon as execution | ||
212 | resource is available. | ||
213 | 214 | ||
214 | Ordering among highpri work items is preserved - a highpri | 215 | Note that normal and highpri thread-pools don't interact with |
215 | work item queued after another highpri work item will start | 216 | each other. Each maintain its separate pool of workers and |
216 | execution after the earlier highpri work item starts. | 217 | implements concurrency management among its workers. |
217 | |||
218 | Although highpri work items are not held back by other | ||
219 | runnable work items, they still contribute to the concurrency | ||
220 | level. Highpri work items in runnable state will prevent | ||
221 | non-highpri work items from starting execution. | ||
222 | |||
223 | This flag is meaningless for unbound wq. | ||
224 | 218 | ||
225 | WQ_CPU_INTENSIVE | 219 | WQ_CPU_INTENSIVE |
226 | 220 | ||
227 | Work items of a CPU intensive wq do not contribute to the | 221 | Work items of a CPU intensive wq do not contribute to the |
228 | concurrency level. In other words, runnable CPU intensive | 222 | concurrency level. In other words, runnable CPU intensive |
229 | work items will not prevent other work items from starting | 223 | work items will not prevent other work items in the same |
230 | execution. This is useful for bound work items which are | 224 | thread-pool from starting execution. This is useful for bound |
231 | expected to hog CPU cycles so that their execution is | 225 | work items which are expected to hog CPU cycles so that their |
232 | regulated by the system scheduler. | 226 | execution is regulated by the system scheduler. |
233 | 227 | ||
234 | Although CPU intensive work items don't contribute to the | 228 | Although CPU intensive work items don't contribute to the |
235 | concurrency level, start of their executions is still | 229 | concurrency level, start of their executions is still |
@@ -239,14 +233,6 @@ resources, scheduled and executed.
239 | 233 | ||
240 | This flag is meaningless for unbound wq. | 234 | This flag is meaningless for unbound wq. |
241 | 235 | ||
242 | WQ_HIGHPRI | WQ_CPU_INTENSIVE | ||
243 | |||
244 | This combination makes the wq avoid interaction with | ||
245 | concurrency management completely and behave as a simple | ||
246 | per-CPU execution context provider. Work items queued on a | ||
247 | highpri CPU-intensive wq start execution as soon as resources | ||
248 | are available and don't affect execution of other work items. | ||
249 | |||
250 | @max_active: | 236 | @max_active: |
251 | 237 | ||
252 | @max_active determines the maximum number of execution contexts per | 238 | @max_active determines the maximum number of execution contexts per |
@@ -328,20 +314,7 @@ If @max_active == 2,
328 | 35 w2 wakes up and finishes | 314 | 35 w2 wakes up and finishes |
329 | 315 | ||
330 | Now, let's assume w1 and w2 are queued to a different wq q1 which has | 316 | Now, let's assume w1 and w2 are queued to a different wq q1 which has |
331 | WQ_HIGHPRI set, | 317 | WQ_CPU_INTENSIVE set, |
332 | |||
333 | TIME IN MSECS EVENT | ||
334 | 0 w1 and w2 start and burn CPU | ||
335 | 5 w1 sleeps | ||
336 | 10 w2 sleeps | ||
337 | 10 w0 starts and burns CPU | ||
338 | 15 w0 sleeps | ||
339 | 15 w1 wakes up and finishes | ||
340 | 20 w2 wakes up and finishes | ||
341 | 25 w0 wakes up and burns CPU | ||
342 | 30 w0 finishes | ||
343 | |||
344 | If q1 has WQ_CPU_INTENSIVE set, | ||
345 | 318 | ||
346 | TIME IN MSECS EVENT | 319 | TIME IN MSECS EVENT |
347 | 0 w0 starts and burns CPU | 320 | 0 w0 starts and burns CPU |
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2e9b9ebbeb78..ce7a074f2519 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -73,8 +73,9 @@ enum {
73 | /* migration should happen before other stuff but after perf */ | 73 | /* migration should happen before other stuff but after perf */ |
74 | CPU_PRI_PERF = 20, | 74 | CPU_PRI_PERF = 20, |
75 | CPU_PRI_MIGRATION = 10, | 75 | CPU_PRI_MIGRATION = 10, |
76 | /* prepare workqueues for other notifiers */ | 76 | /* bring up workqueues before normal notifiers and down after */ |
77 | CPU_PRI_WORKQUEUE = 5, | 77 | CPU_PRI_WORKQUEUE_UP = 5, |
78 | CPU_PRI_WORKQUEUE_DOWN = -5, | ||
78 | }; | 79 | }; |
79 | 80 | ||
80 | #define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ | 81 | #define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ |
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 0714b24c0e45..22ccf9dee177 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -49,8 +49,6 @@ extern int tsk_fork_get_node(struct task_struct *tsk);
49 | * can be queued and flushed using queue/flush_kthread_work() | 49 | * can be queued and flushed using queue/flush_kthread_work() |
50 | * respectively. Queued kthread_works are processed by a kthread | 50 | * respectively. Queued kthread_works are processed by a kthread |
51 | * running kthread_worker_fn(). | 51 | * running kthread_worker_fn(). |
52 | * | ||
53 | * A kthread_work can't be freed while it is executing. | ||
54 | */ | 52 | */ |
55 | struct kthread_work; | 53 | struct kthread_work; |
56 | typedef void (*kthread_work_func_t)(struct kthread_work *work); | 54 | typedef void (*kthread_work_func_t)(struct kthread_work *work); |
@@ -59,15 +57,14 @@ struct kthread_worker {
59 | spinlock_t lock; | 57 | spinlock_t lock; |
60 | struct list_head work_list; | 58 | struct list_head work_list; |
61 | struct task_struct *task; | 59 | struct task_struct *task; |
60 | struct kthread_work *current_work; | ||
62 | }; | 61 | }; |
63 | 62 | ||
64 | struct kthread_work { | 63 | struct kthread_work { |
65 | struct list_head node; | 64 | struct list_head node; |
66 | kthread_work_func_t func; | 65 | kthread_work_func_t func; |
67 | wait_queue_head_t done; | 66 | wait_queue_head_t done; |
68 | atomic_t flushing; | 67 | struct kthread_worker *worker; |
69 | int queue_seq; | ||
70 | int done_seq; | ||
71 | }; | 68 | }; |
72 | 69 | ||
73 | #define KTHREAD_WORKER_INIT(worker) { \ | 70 | #define KTHREAD_WORKER_INIT(worker) { \ |
@@ -79,7 +76,6 @@ struct kthread_work {
79 | .node = LIST_HEAD_INIT((work).node), \ | 76 | .node = LIST_HEAD_INIT((work).node), \ |
80 | .func = (fn), \ | 77 | .func = (fn), \ |
81 | .done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done), \ | 78 | .done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done), \ |
82 | .flushing = ATOMIC_INIT(0), \ | ||
83 | } | 79 | } |
84 | 80 | ||
85 | #define DEFINE_KTHREAD_WORKER(worker) \ | 81 | #define DEFINE_KTHREAD_WORKER(worker) \ |
diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h
index 4018f5058f27..f28d1b65f178 100644
--- a/include/trace/events/workqueue.h
+++ b/include/trace/events/workqueue.h
@@ -54,7 +54,7 @@ TRACE_EVENT(workqueue_queue_work,
54 | __entry->function = work->func; | 54 | __entry->function = work->func; |
55 | __entry->workqueue = cwq->wq; | 55 | __entry->workqueue = cwq->wq; |
56 | __entry->req_cpu = req_cpu; | 56 | __entry->req_cpu = req_cpu; |
57 | __entry->cpu = cwq->gcwq->cpu; | 57 | __entry->cpu = cwq->pool->gcwq->cpu; |
58 | ), | 58 | ), |
59 | 59 | ||
60 | TP_printk("work struct=%p function=%pf workqueue=%p req_cpu=%u cpu=%u", | 60 | TP_printk("work struct=%p function=%pf workqueue=%p req_cpu=%u cpu=%u", |
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..b579af57ea10 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -360,16 +360,12 @@ repeat:
360 | struct kthread_work, node); | 360 | struct kthread_work, node); |
361 | list_del_init(&work->node); | 361 | list_del_init(&work->node); |
362 | } | 362 | } |
363 | worker->current_work = work; | ||
363 | spin_unlock_irq(&worker->lock); | 364 | spin_unlock_irq(&worker->lock); |
364 | 365 | ||
365 | if (work) { | 366 | if (work) { |
366 | __set_current_state(TASK_RUNNING); | 367 | __set_current_state(TASK_RUNNING); |
367 | work->func(work); | 368 | work->func(work); |
368 | smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ | ||
369 | work->done_seq = work->queue_seq; | ||
370 | smp_mb(); /* mb worker-b1 paired with flush-b0 */ | ||
371 | if (atomic_read(&work->flushing)) | ||
372 | wake_up_all(&work->done); | ||
373 | } else if (!freezing(current)) | 369 | } else if (!freezing(current)) |
374 | schedule(); | 370 | schedule(); |
375 | 371 | ||
@@ -378,6 +374,19 @@ repeat:
378 | } | 374 | } |
379 | EXPORT_SYMBOL_GPL(kthread_worker_fn); | 375 | EXPORT_SYMBOL_GPL(kthread_worker_fn); |
380 | 376 | ||
377 | /* insert @work before @pos in @worker */ | ||
378 | static void insert_kthread_work(struct kthread_worker *worker, | ||
379 | struct kthread_work *work, | ||
380 | struct list_head *pos) | ||
381 | { | ||
382 | lockdep_assert_held(&worker->lock); | ||
383 | |||
384 | list_add_tail(&work->node, pos); | ||
385 | work->worker = worker; | ||
386 | if (likely(worker->task)) | ||
387 | wake_up_process(worker->task); | ||
388 | } | ||
389 | |||
381 | /** | 390 | /** |
382 | * queue_kthread_work - queue a kthread_work | 391 | * queue_kthread_work - queue a kthread_work |
383 | * @worker: target kthread_worker | 392 | * @worker: target kthread_worker |
@@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
395 | 404 | ||
396 | spin_lock_irqsave(&worker->lock, flags); | 405 | spin_lock_irqsave(&worker->lock, flags); |
397 | if (list_empty(&work->node)) { | 406 | if (list_empty(&work->node)) { |
398 | list_add_tail(&work->node, &worker->work_list); | 407 | insert_kthread_work(worker, work, &worker->work_list); |
399 | work->queue_seq++; | ||
400 | if (likely(worker->task)) | ||
401 | wake_up_process(worker->task); | ||
402 | ret = true; | 408 | ret = true; |
403 | } | 409 | } |
404 | spin_unlock_irqrestore(&worker->lock, flags); | 410 | spin_unlock_irqrestore(&worker->lock, flags); |
@@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
406 | } | 412 | } |
407 | EXPORT_SYMBOL_GPL(queue_kthread_work); | 413 | EXPORT_SYMBOL_GPL(queue_kthread_work); |
408 | 414 | ||
415 | struct kthread_flush_work { | ||
416 | struct kthread_work work; | ||
417 | struct completion done; | ||
418 | }; | ||
419 | |||
420 | static void kthread_flush_work_fn(struct kthread_work *work) | ||
421 | { | ||
422 | struct kthread_flush_work *fwork = | ||
423 | container_of(work, struct kthread_flush_work, work); | ||
424 | complete(&fwork->done); | ||
425 | } | ||
426 | |||
409 | /** | 427 | /** |
410 | * flush_kthread_work - flush a kthread_work | 428 | * flush_kthread_work - flush a kthread_work |
411 | * @work: work to flush | 429 | * @work: work to flush |
@@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
414 | */ | 432 | */ |
415 | void flush_kthread_work(struct kthread_work *work) | 433 | void flush_kthread_work(struct kthread_work *work) |
416 | { | 434 | { |
417 | int seq = work->queue_seq; | 435 | struct kthread_flush_work fwork = { |
418 | 436 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | |
419 | atomic_inc(&work->flushing); | 437 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), |
438 | }; | ||
439 | struct kthread_worker *worker; | ||
440 | bool noop = false; | ||
420 | 441 | ||
421 | /* | 442 | retry: |
422 | * mb flush-b0 paired with worker-b1, to make sure either | 443 | worker = work->worker; |
423 | * worker sees the above increment or we see done_seq update. | 444 | if (!worker) |
424 | */ | 445 | return; |
425 | smp_mb__after_atomic_inc(); | ||
426 | 446 | ||
427 | /* A - B <= 0 tests whether B is in front of A regardless of overflow */ | 447 | spin_lock_irq(&worker->lock); |
428 | wait_event(work->done, seq - work->done_seq <= 0); | 448 | if (work->worker != worker) { |
429 | atomic_dec(&work->flushing); | 449 | spin_unlock_irq(&worker->lock); |
450 | goto retry; | ||
451 | } | ||
430 | 452 | ||
431 | /* | 453 | if (!list_empty(&work->node)) |
432 | * rmb flush-b1 paired with worker-b0, to make sure our caller | 454 | insert_kthread_work(worker, &fwork.work, work->node.next); |
433 | * sees every change made by work->func(). | 455 | else if (worker->current_work == work) |
434 | */ | 456 | insert_kthread_work(worker, &fwork.work, worker->work_list.next); |
435 | smp_mb__after_atomic_dec(); | 457 | else |
436 | } | 458 | noop = true; |
437 | EXPORT_SYMBOL_GPL(flush_kthread_work); | ||
438 | 459 | ||
439 | struct kthread_flush_work { | 460 | spin_unlock_irq(&worker->lock); |
440 | struct kthread_work work; | ||
441 | struct completion done; | ||
442 | }; | ||
443 | 461 | ||
444 | static void kthread_flush_work_fn(struct kthread_work *work) | 462 | if (!noop) |
445 | { | 463 | wait_for_completion(&fwork.done); |
446 | struct kthread_flush_work *fwork = | ||
447 | container_of(work, struct kthread_flush_work, work); | ||
448 | complete(&fwork->done); | ||
449 | } | 464 | } |
465 | EXPORT_SYMBOL_GPL(flush_kthread_work); | ||
450 | 466 | ||
451 | /** | 467 | /** |
452 | * flush_kthread_worker - flush all current works on a kthread_worker | 468 | * flush_kthread_worker - flush all current works on a kthread_worker |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9a3128dc67df..692d97628a10 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,32 +45,41 @@
45 | #include "workqueue_sched.h" | 45 | #include "workqueue_sched.h" |
46 | 46 | ||
47 | enum { | 47 | enum { |
48 | /* global_cwq flags */ | 48 | /* |
49 | GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | 49 | * global_cwq flags |
50 | GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ | 50 | * |
51 | GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | 51 | * A bound gcwq is either associated or disassociated with its CPU. |
52 | GCWQ_FREEZING = 1 << 3, /* freeze in progress */ | 52 | * While associated (!DISASSOCIATED), all workers are bound to the |
53 | GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ | 53 | * CPU and none has %WORKER_UNBOUND set and concurrency management |
54 | * is in effect. | ||
55 | * | ||
56 | * While DISASSOCIATED, the cpu may be offline and all workers have | ||
57 | * %WORKER_UNBOUND set and concurrency management disabled, and may | ||
58 | * be executing on any CPU. The gcwq behaves as an unbound one. | ||
59 | * | ||
60 | * Note that DISASSOCIATED can be flipped only while holding | ||
61 | * managership of all pools on the gcwq to avoid changing binding | ||
62 | * state while create_worker() is in progress. | ||
63 | */ | ||
64 | GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ | ||
65 | GCWQ_FREEZING = 1 << 1, /* freeze in progress */ | ||
66 | |||
67 | /* pool flags */ | ||
68 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | ||
54 | 69 | ||
55 | /* worker flags */ | 70 | /* worker flags */ |
56 | WORKER_STARTED = 1 << 0, /* started */ | 71 | WORKER_STARTED = 1 << 0, /* started */ |
57 | WORKER_DIE = 1 << 1, /* die die die */ | 72 | WORKER_DIE = 1 << 1, /* die die die */ |
58 | WORKER_IDLE = 1 << 2, /* is idle */ | 73 | WORKER_IDLE = 1 << 2, /* is idle */ |
59 | WORKER_PREP = 1 << 3, /* preparing to run works */ | 74 | WORKER_PREP = 1 << 3, /* preparing to run works */ |
60 | WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ | ||
61 | WORKER_REBIND = 1 << 5, /* mom is home, come back */ | 75 | WORKER_REBIND = 1 << 5, /* mom is home, come back */ |
62 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ | 76 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ |
63 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ | 77 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ |
64 | 78 | ||
65 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | | 79 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | |
66 | WORKER_CPU_INTENSIVE | WORKER_UNBOUND, | 80 | WORKER_CPU_INTENSIVE, |
67 | 81 | ||
68 | /* gcwq->trustee_state */ | 82 | NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ |
69 | TRUSTEE_START = 0, /* start */ | ||
70 | TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ | ||
71 | TRUSTEE_BUTCHER = 2, /* butcher workers */ | ||
72 | TRUSTEE_RELEASE = 3, /* release workers */ | ||
73 | TRUSTEE_DONE = 4, /* trustee is done */ | ||
74 | 83 | ||
75 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ | 84 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ |
76 | BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, | 85 | BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, |
@@ -84,13 +93,13 @@ enum {
84 | (min two ticks) */ | 93 | (min two ticks) */ |
85 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ | 94 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ |
86 | CREATE_COOLDOWN = HZ, /* time to breath after fail */ | 95 | CREATE_COOLDOWN = HZ, /* time to breath after fail */ |
87 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ | ||
88 | 96 | ||
89 | /* | 97 | /* |
90 | * Rescue workers are used only on emergencies and shared by | 98 | * Rescue workers are used only on emergencies and shared by |
91 | * all cpus. Give -20. | 99 | * all cpus. Give -20. |
92 | */ | 100 | */ |
93 | RESCUER_NICE_LEVEL = -20, | 101 | RESCUER_NICE_LEVEL = -20, |
102 | HIGHPRI_NICE_LEVEL = -20, | ||
94 | }; | 103 | }; |
95 | 104 | ||
96 | /* | 105 | /* |
@@ -115,6 +124,8 @@ enum {
115 | */ | 124 | */ |
116 | 125 | ||
117 | struct global_cwq; | 126 | struct global_cwq; |
127 | struct worker_pool; | ||
128 | struct idle_rebind; | ||
118 | 129 | ||
119 | /* | 130 | /* |
120 | * The poor guys doing the actual heavy lifting. All on-duty workers | 131 | * The poor guys doing the actual heavy lifting. All on-duty workers |
@@ -131,12 +142,31 @@ struct worker {
131 | struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ | 142 | struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ |
132 | struct list_head scheduled; /* L: scheduled works */ | 143 | struct list_head scheduled; /* L: scheduled works */ |
133 | struct task_struct *task; /* I: worker task */ | 144 | struct task_struct *task; /* I: worker task */ |
134 | struct global_cwq *gcwq; /* I: the associated gcwq */ | 145 | struct worker_pool *pool; /* I: the associated pool */ |
135 | /* 64 bytes boundary on 64bit, 32 on 32bit */ | 146 | /* 64 bytes boundary on 64bit, 32 on 32bit */ |
136 | unsigned long last_active; /* L: last active timestamp */ | 147 | unsigned long last_active; /* L: last active timestamp */ |
137 | unsigned int flags; /* X: flags */ | 148 | unsigned int flags; /* X: flags */ |
138 | int id; /* I: worker id */ | 149 | int id; /* I: worker id */ |
139 | struct work_struct rebind_work; /* L: rebind worker to cpu */ | 150 | |
151 | /* for rebinding worker to CPU */ | ||
152 | struct idle_rebind *idle_rebind; /* L: for idle worker */ | ||
153 | struct work_struct rebind_work; /* L: for busy worker */ | ||
154 | }; | ||
155 | |||
156 | struct worker_pool { | ||
157 | struct global_cwq *gcwq; /* I: the owning gcwq */ | ||
158 | unsigned int flags; /* X: flags */ | ||
159 | |||
160 | struct list_head worklist; /* L: list of pending works */ | ||
161 | int nr_workers; /* L: total number of workers */ | ||
162 | int nr_idle; /* L: currently idle ones */ | ||
163 | |||
164 | struct list_head idle_list; /* X: list of idle workers */ | ||
165 | struct timer_list idle_timer; /* L: worker idle timeout */ | ||
166 | struct timer_list mayday_timer; /* L: SOS timer for workers */ | ||
167 | |||
168 | struct mutex manager_mutex; /* mutex manager should hold */ | ||
169 | struct ida worker_ida; /* L: for worker IDs */ | ||
140 | }; | 170 | }; |
141 | 171 | ||
142 | /* | 172 | /* |
@@ -146,27 +176,16 @@ struct worker {
146 | */ | 176 | */ |
147 | struct global_cwq { | 177 | struct global_cwq { |
148 | spinlock_t lock; /* the gcwq lock */ | 178 | spinlock_t lock; /* the gcwq lock */ |
149 | struct list_head worklist; /* L: list of pending works */ | ||
150 | unsigned int cpu; /* I: the associated cpu */ | 179 | unsigned int cpu; /* I: the associated cpu */ |
151 | unsigned int flags; /* L: GCWQ_* flags */ | 180 | unsigned int flags; /* L: GCWQ_* flags */ |
152 | 181 | ||
153 | int nr_workers; /* L: total number of workers */ | 182 | /* workers are chained either in busy_hash or pool idle_list */ |
154 | int nr_idle; /* L: currently idle ones */ | ||
155 | |||
156 | /* workers are chained either in the idle_list or busy_hash */ | ||
157 | struct list_head idle_list; /* X: list of idle workers */ | ||
158 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; | 183 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; |
159 | /* L: hash of busy workers */ | 184 | /* L: hash of busy workers */ |
160 | 185 | ||
161 | struct timer_list idle_timer; /* L: worker idle timeout */ | 186 | struct worker_pool pools[2]; /* normal and highpri pools */ |
162 | struct timer_list mayday_timer; /* L: SOS timer for dworkers */ | ||
163 | |||
164 | struct ida worker_ida; /* L: for worker IDs */ | ||
165 | 187 | ||
166 | struct task_struct *trustee; /* L: for gcwq shutdown */ | 188 | wait_queue_head_t rebind_hold; /* rebind hold wait */ |
167 | unsigned int trustee_state; /* L: trustee state */ | ||
168 | wait_queue_head_t trustee_wait; /* trustee wait */ | ||
169 | struct worker *first_idle; /* L: first idle worker */ | ||
170 | } ____cacheline_aligned_in_smp; | 189 | } ____cacheline_aligned_in_smp; |
171 | 190 | ||
172 | /* | 191 | /* |
@@ -175,7 +194,7 @@ struct global_cwq {
175 | * aligned at two's power of the number of flag bits. | 194 | * aligned at two's power of the number of flag bits. |
176 | */ | 195 | */ |
177 | struct cpu_workqueue_struct { | 196 | struct cpu_workqueue_struct { |
178 | struct global_cwq *gcwq; /* I: the associated gcwq */ | 197 | struct worker_pool *pool; /* I: the associated pool */ |
179 | struct workqueue_struct *wq; /* I: the owning workqueue */ | 198 | struct workqueue_struct *wq; /* I: the owning workqueue */ |
180 | int work_color; /* L: current color */ | 199 | int work_color; /* L: current color */ |
181 | int flush_color; /* L: flushing color */ | 200 | int flush_color; /* L: flushing color */ |
@@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
264 | #define CREATE_TRACE_POINTS | 283 | #define CREATE_TRACE_POINTS |
265 | #include <trace/events/workqueue.h> | 284 | #include <trace/events/workqueue.h> |
266 | 285 | ||
286 | #define for_each_worker_pool(pool, gcwq) \ | ||
287 | for ((pool) = &(gcwq)->pools[0]; \ | ||
288 | (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) | ||
289 | |||
267 | #define for_each_busy_worker(worker, i, pos, gcwq) \ | 290 | #define for_each_busy_worker(worker, i, pos, gcwq) \ |
268 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ | 291 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ |
269 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) | 292 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) |
@@ -444,7 +467,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */
444 | * try_to_wake_up(). Put it in a separate cacheline. | 467 | * try_to_wake_up(). Put it in a separate cacheline. |
445 | */ | 468 | */ |
446 | static DEFINE_PER_CPU(struct global_cwq, global_cwq); | 469 | static DEFINE_PER_CPU(struct global_cwq, global_cwq); |
447 | static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); | 470 | static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); |
448 | 471 | ||
449 | /* | 472 | /* |
450 | * Global cpu workqueue and nr_running counter for unbound gcwq. The | 473 | * Global cpu workqueue and nr_running counter for unbound gcwq. The |
@@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
452 | * workers have WORKER_UNBOUND set. | 475 | * workers have WORKER_UNBOUND set. |
453 | */ | 476 | */ |
454 | static struct global_cwq unbound_global_cwq; | 477 | static struct global_cwq unbound_global_cwq; |
455 | static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ | 478 | static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { |
479 | [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ | ||
480 | }; | ||
456 | 481 | ||
457 | static int worker_thread(void *__worker); | 482 | static int worker_thread(void *__worker); |
458 | 483 | ||
484 | static int worker_pool_pri(struct worker_pool *pool) | ||
485 | { | ||
486 | return pool - pool->gcwq->pools; | ||
487 | } | ||
488 | |||
459 | static struct global_cwq *get_gcwq(unsigned int cpu) | 489 | static struct global_cwq *get_gcwq(unsigned int cpu) |
460 | { | 490 | { |
461 | if (cpu != WORK_CPU_UNBOUND) | 491 | if (cpu != WORK_CPU_UNBOUND) |
@@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
464 | return &unbound_global_cwq; | 494 | return &unbound_global_cwq; |
465 | } | 495 | } |
466 | 496 | ||
467 | static atomic_t *get_gcwq_nr_running(unsigned int cpu) | 497 | static atomic_t *get_pool_nr_running(struct worker_pool *pool) |
468 | { | 498 | { |
499 | int cpu = pool->gcwq->cpu; | ||
500 | int idx = worker_pool_pri(pool); | ||
501 | |||
469 | if (cpu != WORK_CPU_UNBOUND) | 502 | if (cpu != WORK_CPU_UNBOUND) |
470 | return &per_cpu(gcwq_nr_running, cpu); | 503 | return &per_cpu(pool_nr_running, cpu)[idx]; |
471 | else | 504 | else |
472 | return &unbound_gcwq_nr_running; | 505 | return &unbound_pool_nr_running[idx]; |
473 | } | 506 | } |
474 | 507 | ||
475 | static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, | 508 | static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, |
@@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
555 | 588 | ||
556 | if (data & WORK_STRUCT_CWQ) | 589 | if (data & WORK_STRUCT_CWQ) |
557 | return ((struct cpu_workqueue_struct *) | 590 | return ((struct cpu_workqueue_struct *) |
558 | (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; | 591 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; |
559 | 592 | ||
560 | cpu = data >> WORK_STRUCT_FLAG_BITS; | 593 | cpu = data >> WORK_STRUCT_FLAG_BITS; |
561 | if (cpu == WORK_CPU_NONE) | 594 | if (cpu == WORK_CPU_NONE) |
@@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
566 | } | 599 | } |
567 | 600 | ||
568 | /* | 601 | /* |
569 | * Policy functions. These define the policies on how the global | 602 | * Policy functions. These define the policies on how the global worker |
570 | * worker pool is managed. Unless noted otherwise, these functions | 603 | * pools are managed. Unless noted otherwise, these functions assume that |
571 | * assume that they're being called with gcwq->lock held. | 604 | * they're being called with gcwq->lock held. |
572 | */ | 605 | */ |
573 | 606 | ||
574 | static bool __need_more_worker(struct global_cwq *gcwq) | 607 | static bool __need_more_worker(struct worker_pool *pool) |
575 | { | 608 | { |
576 | return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || | 609 | return !atomic_read(get_pool_nr_running(pool)); |
577 | gcwq->flags & GCWQ_HIGHPRI_PENDING; | ||
578 | } | 610 | } |
579 | 611 | ||
580 | /* | 612 | /* |
581 | * Need to wake up a worker? Called from anything but currently | 613 | * Need to wake up a worker? Called from anything but currently |
582 | * running workers. | 614 | * running workers. |
615 | * | ||
616 | * Note that, because unbound workers never contribute to nr_running, this | ||
617 | * function will always return %true for unbound gcwq as long as the | ||
618 | * worklist isn't empty. | ||
583 | */ | 619 | */ |
584 | static bool need_more_worker(struct global_cwq *gcwq) | 620 | static bool need_more_worker(struct worker_pool *pool) |
585 | { | 621 | { |
586 | return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); | 622 | return !list_empty(&pool->worklist) && __need_more_worker(pool); |
587 | } | 623 | } |
588 | 624 | ||
589 | /* Can I start working? Called from busy but !running workers. */ | 625 | /* Can I start working? Called from busy but !running workers. */ |
590 | static bool may_start_working(struct global_cwq *gcwq) | 626 | static bool may_start_working(struct worker_pool *pool) |
591 | { | 627 | { |
592 | return gcwq->nr_idle; | 628 | return pool->nr_idle; |
593 | } | 629 | } |
594 | 630 | ||
595 | /* Do I need to keep working? Called from currently running workers. */ | 631 | /* Do I need to keep working? Called from currently running workers. */ |
596 | static bool keep_working(struct global_cwq *gcwq) | 632 | static bool keep_working(struct worker_pool *pool) |
597 | { | 633 | { |
598 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | 634 | atomic_t *nr_running = get_pool_nr_running(pool); |
599 | 635 | ||
600 | return !list_empty(&gcwq->worklist) && | 636 | return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; |
601 | (atomic_read(nr_running) <= 1 || | ||
602 | gcwq->flags & GCWQ_HIGHPRI_PENDING); | ||
603 | } | 637 | } |
604 | 638 | ||
605 | /* Do we need a new worker? Called from manager. */ | 639 | /* Do we need a new worker? Called from manager. */ |
606 | static bool need_to_create_worker(struct global_cwq *gcwq) | 640 | static bool need_to_create_worker(struct worker_pool *pool) |
607 | { | 641 | { |
608 | return need_more_worker(gcwq) && !may_start_working(gcwq); | 642 | return need_more_worker(pool) && !may_start_working(pool); |
609 | } | 643 | } |
610 | 644 | ||
611 | /* Do I need to be the manager? */ | 645 | /* Do I need to be the manager? */ |
612 | static bool need_to_manage_workers(struct global_cwq *gcwq) | 646 | static bool need_to_manage_workers(struct worker_pool *pool) |
613 | { | 647 | { |
614 | return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; | 648 | return need_to_create_worker(pool) || |
649 | (pool->flags & POOL_MANAGE_WORKERS); | ||
615 | } | 650 | } |
616 | 651 | ||
617 | /* Do we have too many workers and should some go away? */ | 652 | /* Do we have too many workers and should some go away? */ |
618 | static bool too_many_workers(struct global_cwq *gcwq) | 653 | static bool too_many_workers(struct worker_pool *pool) |
619 | { | 654 | { |
620 | bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; | 655 | bool managing = mutex_is_locked(&pool->manager_mutex); |
621 | int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ | 656 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ |
622 | int nr_busy = gcwq->nr_workers - nr_idle; | 657 | int nr_busy = pool->nr_workers - nr_idle; |
623 | 658 | ||
624 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; | 659 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; |
625 | } | 660 | } |
@@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
629 | */ | 664 | */ |
630 | 665 | ||
631 | /* Return the first worker. Safe with preemption disabled */ | 666 | /* Return the first worker. Safe with preemption disabled */ |
632 | static struct worker *first_worker(struct global_cwq *gcwq) | 667 | static struct worker *first_worker(struct worker_pool *pool) |
633 | { | 668 | { |
634 | if (unlikely(list_empty(&gcwq->idle_list))) | 669 | if (unlikely(list_empty(&pool->idle_list))) |
635 | return NULL; | 670 | return NULL; |
636 | 671 | ||
637 | return list_first_entry(&gcwq->idle_list, struct worker, entry); | 672 | return list_first_entry(&pool->idle_list, struct worker, entry); |
638 | } | 673 | } |
639 | 674 | ||
640 | /** | 675 | /** |
641 | * wake_up_worker - wake up an idle worker | 676 | * wake_up_worker - wake up an idle worker |
642 | * @gcwq: gcwq to wake worker for | 677 | * @pool: worker pool to wake worker from |
643 | * | 678 | * |
644 | * Wake up the first idle worker of @gcwq. | 679 | * Wake up the first idle worker of @pool. |
645 | * | 680 | * |
646 | * CONTEXT: | 681 | * CONTEXT: |
647 | * spin_lock_irq(gcwq->lock). | 682 | * spin_lock_irq(gcwq->lock). |
648 | */ | 683 | */ |
649 | static void wake_up_worker(struct global_cwq *gcwq) | 684 | static void wake_up_worker(struct worker_pool *pool) |
650 | { | 685 | { |
651 | struct worker *worker = first_worker(gcwq); | 686 | struct worker *worker = first_worker(pool); |
652 | 687 | ||
653 | if (likely(worker)) | 688 | if (likely(worker)) |
654 | wake_up_process(worker->task); | 689 | wake_up_process(worker->task); |
@@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
670 | struct worker *worker = kthread_data(task); | 705 | struct worker *worker = kthread_data(task); |
671 | 706 | ||
672 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 707 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
673 | atomic_inc(get_gcwq_nr_running(cpu)); | 708 | atomic_inc(get_pool_nr_running(worker->pool)); |
674 | } | 709 | } |
675 | 710 | ||
676 | /** | 711 | /** |
@@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
692 | unsigned int cpu) | 727 | unsigned int cpu) |
693 | { | 728 | { |
694 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; | 729 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; |
695 | struct global_cwq *gcwq = get_gcwq(cpu); | 730 | struct worker_pool *pool = worker->pool; |
696 | atomic_t *nr_running = get_gcwq_nr_running(cpu); | 731 | atomic_t *nr_running = get_pool_nr_running(pool); |
697 | 732 | ||
698 | if (worker->flags & WORKER_NOT_RUNNING) | 733 | if (worker->flags & WORKER_NOT_RUNNING) |
699 | return NULL; | 734 | return NULL; |
@@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
706 | * worklist not empty test sequence is in insert_work(). | 741 | * worklist not empty test sequence is in insert_work(). |
707 | * Please read comment there. | 742 | * Please read comment there. |
708 | * | 743 | * |
709 | * NOT_RUNNING is clear. This means that trustee is not in | 744 | * NOT_RUNNING is clear. This means that we're bound to and |
710 | * charge and we're running on the local cpu w/ rq lock held | 745 | * running on the local cpu w/ rq lock held and preemption |
711 | * and preemption disabled, which in turn means that none else | 746 | * disabled, which in turn means that none else could be |
712 | * could be manipulating idle_list, so dereferencing idle_list | 747 | * manipulating idle_list, so dereferencing idle_list without gcwq |
713 | * without gcwq lock is safe. | 748 | * lock is safe. |
714 | */ | 749 | */ |
715 | if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) | 750 | if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) |
716 | to_wakeup = first_worker(gcwq); | 751 | to_wakeup = first_worker(pool); |
717 | return to_wakeup ? to_wakeup->task : NULL; | 752 | return to_wakeup ? to_wakeup->task : NULL; |
718 | } | 753 | } |
719 | 754 | ||
@@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
733 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, | 768 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, |
734 | bool wakeup) | 769 | bool wakeup) |
735 | { | 770 | { |
736 | struct global_cwq *gcwq = worker->gcwq; | 771 | struct worker_pool *pool = worker->pool; |
737 | 772 | ||
738 | WARN_ON_ONCE(worker->task != current); | 773 | WARN_ON_ONCE(worker->task != current); |
739 | 774 | ||
@@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
744 | */ | 779 | */ |
745 | if ((flags & WORKER_NOT_RUNNING) && | 780 | if ((flags & WORKER_NOT_RUNNING) && |
746 | !(worker->flags & WORKER_NOT_RUNNING)) { | 781 | !(worker->flags & WORKER_NOT_RUNNING)) { |
747 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | 782 | atomic_t *nr_running = get_pool_nr_running(pool); |
748 | 783 | ||
749 | if (wakeup) { | 784 | if (wakeup) { |
750 | if (atomic_dec_and_test(nr_running) && | 785 | if (atomic_dec_and_test(nr_running) && |
751 | !list_empty(&gcwq->worklist)) | 786 | !list_empty(&pool->worklist)) |
752 | wake_up_worker(gcwq); | 787 | wake_up_worker(pool); |
753 | } else | 788 | } else |
754 | atomic_dec(nr_running); | 789 | atomic_dec(nr_running); |
755 | } | 790 | } |
@@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
769 | */ | 804 | */ |
770 | static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | 805 | static inline void worker_clr_flags(struct worker *worker, unsigned int flags) |
771 | { | 806 | { |
772 | struct global_cwq *gcwq = worker->gcwq; | 807 | struct worker_pool *pool = worker->pool; |
773 | unsigned int oflags = worker->flags; | 808 | unsigned int oflags = worker->flags; |
774 | 809 | ||
775 | WARN_ON_ONCE(worker->task != current); | 810 | WARN_ON_ONCE(worker->task != current); |
@@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
783 | */ | 818 | */ |
784 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | 819 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) |
785 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 820 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
786 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); | 821 | atomic_inc(get_pool_nr_running(pool)); |
787 | } | 822 | } |
788 | 823 | ||
789 | /** | 824 | /** |
@@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
867 | } | 902 | } |
868 | 903 | ||
869 | /** | 904 | /** |
870 | * gcwq_determine_ins_pos - find insertion position | ||
871 | * @gcwq: gcwq of interest | ||
872 | * @cwq: cwq a work is being queued for | ||
873 | * | ||
874 | * A work for @cwq is about to be queued on @gcwq, determine insertion | ||
875 | * position for the work. If @cwq is for HIGHPRI wq, the work is | ||
876 | * queued at the head of the queue but in FIFO order with respect to | ||
877 | * other HIGHPRI works; otherwise, at the end of the queue. This | ||
878 | * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that | ||
879 | * there are HIGHPRI works pending. | ||
880 | * | ||
881 | * CONTEXT: | ||
882 | * spin_lock_irq(gcwq->lock). | ||
883 | * | ||
884 | * RETURNS: | ||
885 | * Pointer to inserstion position. | ||
886 | */ | ||
887 | static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, | ||
888 | struct cpu_workqueue_struct *cwq) | ||
889 | { | ||
890 | struct work_struct *twork; | ||
891 | |||
892 | if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) | ||
893 | return &gcwq->worklist; | ||
894 | |||
895 | list_for_each_entry(twork, &gcwq->worklist, entry) { | ||
896 | struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); | ||
897 | |||
898 | if (!(tcwq->wq->flags & WQ_HIGHPRI)) | ||
899 | break; | ||
900 | } | ||
901 | |||
902 | gcwq->flags |= GCWQ_HIGHPRI_PENDING; | ||
903 | return &twork->entry; | ||
904 | } | ||
905 | |||
906 | /** | ||
907 | * insert_work - insert a work into gcwq | 905 | * insert_work - insert a work into gcwq |
908 | * @cwq: cwq @work belongs to | 906 | * @cwq: cwq @work belongs to |
909 | * @work: work to insert | 907 | * @work: work to insert |
@@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
920 | struct work_struct *work, struct list_head *head, | 918 | struct work_struct *work, struct list_head *head, |
921 | unsigned int extra_flags) | 919 | unsigned int extra_flags) |
922 | { | 920 | { |
923 | struct global_cwq *gcwq = cwq->gcwq; | 921 | struct worker_pool *pool = cwq->pool; |
924 | 922 | ||
925 | /* we own @work, set data and link */ | 923 | /* we own @work, set data and link */ |
926 | set_work_cwq(work, cwq, extra_flags); | 924 | set_work_cwq(work, cwq, extra_flags); |
@@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
940 | */ | 938 | */ |
941 | smp_mb(); | 939 | smp_mb(); |
942 | 940 | ||
943 | if (__need_more_worker(gcwq)) | 941 | if (__need_more_worker(pool)) |
944 | wake_up_worker(gcwq); | 942 | wake_up_worker(pool); |
945 | } | 943 | } |
946 | 944 | ||
947 | /* | 945 | /* |
@@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1043 | if (likely(cwq->nr_active < cwq->max_active)) { | 1041 | if (likely(cwq->nr_active < cwq->max_active)) { |
1044 | trace_workqueue_activate_work(work); | 1042 | trace_workqueue_activate_work(work); |
1045 | cwq->nr_active++; | 1043 | cwq->nr_active++; |
1046 | worklist = gcwq_determine_ins_pos(gcwq, cwq); | 1044 | worklist = &cwq->pool->worklist; |
1047 | } else { | 1045 | } else { |
1048 | work_flags |= WORK_STRUCT_DELAYED; | 1046 | work_flags |= WORK_STRUCT_DELAYED; |
1049 | worklist = &cwq->delayed_works; | 1047 | worklist = &cwq->delayed_works; |
@@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1192 | */ | 1190 | */ |
1193 | static void worker_enter_idle(struct worker *worker) | 1191 | static void worker_enter_idle(struct worker *worker) |
1194 | { | 1192 | { |
1195 | struct global_cwq *gcwq = worker->gcwq; | 1193 | struct worker_pool *pool = worker->pool; |
1194 | struct global_cwq *gcwq = pool->gcwq; | ||
1196 | 1195 | ||
1197 | BUG_ON(worker->flags & WORKER_IDLE); | 1196 | BUG_ON(worker->flags & WORKER_IDLE); |
1198 | BUG_ON(!list_empty(&worker->entry) && | 1197 | BUG_ON(!list_empty(&worker->entry) && |
@@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker)
1200 | 1199 | ||
1201 | /* can't use worker_set_flags(), also called from start_worker() */ | 1200 | /* can't use worker_set_flags(), also called from start_worker() */ |
1202 | worker->flags |= WORKER_IDLE; | 1201 | worker->flags |= WORKER_IDLE; |
1203 | gcwq->nr_idle++; | 1202 | pool->nr_idle++; |
1204 | worker->last_active = jiffies; | 1203 | worker->last_active = jiffies; |
1205 | 1204 | ||
1206 | /* idle_list is LIFO */ | 1205 | /* idle_list is LIFO */ |
1207 | list_add(&worker->entry, &gcwq->idle_list); | 1206 | list_add(&worker->entry, &pool->idle_list); |
1208 | 1207 | ||
1209 | if (likely(!(worker->flags & WORKER_ROGUE))) { | 1208 | if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) |
1210 | if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) | 1209 | mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); |
1211 | mod_timer(&gcwq->idle_timer, | ||
1212 | jiffies + IDLE_WORKER_TIMEOUT); | ||
1213 | } else | ||
1214 | wake_up_all(&gcwq->trustee_wait); | ||
1215 | 1210 | ||
1216 | /* | 1211 | /* |
1217 | * Sanity check nr_running. Because trustee releases gcwq->lock | 1212 | * Sanity check nr_running. Because gcwq_unbind_fn() releases |
1218 | * between setting %WORKER_ROGUE and zapping nr_running, the | 1213 | * gcwq->lock between setting %WORKER_UNBOUND and zapping |
1219 | * warning may trigger spuriously. Check iff trustee is idle. | 1214 | * nr_running, the warning may trigger spuriously. Check iff |
1215 | * unbind is not in progress. | ||
1220 | */ | 1216 | */ |
1221 | WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && | 1217 | WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && |
1222 | gcwq->nr_workers == gcwq->nr_idle && | 1218 | pool->nr_workers == pool->nr_idle && |
1223 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | 1219 | atomic_read(get_pool_nr_running(pool))); |
1224 | } | 1220 | } |
1225 | 1221 | ||
1226 | /** | 1222 | /** |
@@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker)
1234 | */ | 1230 | */ |
1235 | static void worker_leave_idle(struct worker *worker) | 1231 | static void worker_leave_idle(struct worker *worker) |
1236 | { | 1232 | { |
1237 | struct global_cwq *gcwq = worker->gcwq; | 1233 | struct worker_pool *pool = worker->pool; |
1238 | 1234 | ||
1239 | BUG_ON(!(worker->flags & WORKER_IDLE)); | 1235 | BUG_ON(!(worker->flags & WORKER_IDLE)); |
1240 | worker_clr_flags(worker, WORKER_IDLE); | 1236 | worker_clr_flags(worker, WORKER_IDLE); |
1241 | gcwq->nr_idle--; | 1237 | pool->nr_idle--; |
1242 | list_del_init(&worker->entry); | 1238 | list_del_init(&worker->entry); |
1243 | } | 1239 | } |
1244 | 1240 | ||
@@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker)
1258 | * verbatim as it's best effort and blocking and gcwq may be | 1254 | * verbatim as it's best effort and blocking and gcwq may be |
1259 | * [dis]associated in the meantime. | 1255 | * [dis]associated in the meantime. |
1260 | * | 1256 | * |
1261 | * This function tries set_cpus_allowed() and locks gcwq and verifies | 1257 | * This function tries set_cpus_allowed() and locks gcwq and verifies the |
1262 | * the binding against GCWQ_DISASSOCIATED which is set during | 1258 | * binding against %GCWQ_DISASSOCIATED which is set during |
1263 | * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters | 1259 | * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker |
1264 | * idle state or fetches works without dropping lock, it can guarantee | 1260 | * enters idle state or fetches works without dropping lock, it can |
1265 | * the scheduling requirement described in the first paragraph. | 1261 | * guarantee the scheduling requirement described in the first paragraph. |
1266 | * | 1262 | * |
1267 | * CONTEXT: | 1263 | * CONTEXT: |
1268 | * Might sleep. Called without any lock but returns with gcwq->lock | 1264 | * Might sleep. Called without any lock but returns with gcwq->lock |
@@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker)
1275 | static bool worker_maybe_bind_and_lock(struct worker *worker) | 1271 | static bool worker_maybe_bind_and_lock(struct worker *worker) |
1276 | __acquires(&gcwq->lock) | 1272 | __acquires(&gcwq->lock) |
1277 | { | 1273 | { |
1278 | struct global_cwq *gcwq = worker->gcwq; | 1274 | struct global_cwq *gcwq = worker->pool->gcwq; |
1279 | struct task_struct *task = worker->task; | 1275 | struct task_struct *task = worker->task; |
1280 | 1276 | ||
1281 | while (true) { | 1277 | while (true) { |
@@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock)
1308 | } | 1304 | } |
1309 | } | 1305 | } |
1310 | 1306 | ||
1307 | struct idle_rebind { | ||
1308 | int cnt; /* # workers to be rebound */ | ||
1309 | struct completion done; /* all workers rebound */ | ||
1310 | }; | ||
1311 | |||
1312 | /* | ||
1313 | * Rebind an idle @worker to its CPU. During CPU onlining, this has to | ||
1314 | * happen synchronously for idle workers. worker_thread() will test | ||
1315 | * %WORKER_REBIND before leaving idle and call this function. | ||
1316 | */ | ||
1317 | static void idle_worker_rebind(struct worker *worker) | ||
1318 | { | ||
1319 | struct global_cwq *gcwq = worker->pool->gcwq; | ||
1320 | |||
1321 | /* CPU must be online at this point */ | ||
1322 | WARN_ON(!worker_maybe_bind_and_lock(worker)); | ||
1323 | if (!--worker->idle_rebind->cnt) | ||
1324 | complete(&worker->idle_rebind->done); | ||
1325 | spin_unlock_irq(&worker->pool->gcwq->lock); | ||
1326 | |||
1327 | /* we did our part, wait for rebind_workers() to finish up */ | ||
1328 | wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); | ||
1329 | } | ||
1330 | |||
1311 | /* | 1331 | /* |
1312 | * Function for worker->rebind_work used to rebind rogue busy workers | 1332 | * Function for @worker->rebind.work used to rebind unbound busy workers to |
1313 | * to the associated cpu which is coming back online. This is | 1333 | * the associated cpu which is coming back online. This is scheduled by |
1314 | * scheduled by cpu up but can race with other cpu hotplug operations | 1334 | * cpu up but can race with other cpu hotplug operations and may be |
1315 | * and may be executed twice without intervening cpu down. | 1335 | * executed twice without intervening cpu down. |
1316 | */ | 1336 | */ |
1317 | static void worker_rebind_fn(struct work_struct *work) | 1337 | static void busy_worker_rebind_fn(struct work_struct *work) |
1318 | { | 1338 | { |
1319 | struct worker *worker = container_of(work, struct worker, rebind_work); | 1339 | struct worker *worker = container_of(work, struct worker, rebind_work); |
1320 | struct global_cwq *gcwq = worker->gcwq; | 1340 | struct global_cwq *gcwq = worker->pool->gcwq; |
1321 | 1341 | ||
1322 | if (worker_maybe_bind_and_lock(worker)) | 1342 | if (worker_maybe_bind_and_lock(worker)) |
1323 | worker_clr_flags(worker, WORKER_REBIND); | 1343 | worker_clr_flags(worker, WORKER_REBIND); |
@@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work) | |||
1325 | spin_unlock_irq(&gcwq->lock); | 1345 | spin_unlock_irq(&gcwq->lock); |
1326 | } | 1346 | } |
1327 | 1347 | ||
1348 | /** | ||
1349 | * rebind_workers - rebind all workers of a gcwq to the associated CPU | ||
1350 | * @gcwq: gcwq of interest | ||
1351 | * | ||
1352 | * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding | ||
1353 | * is different for idle and busy ones. | ||
1354 | * | ||
1355 | * The idle ones should be rebound synchronously and idle rebinding should | ||
1356 | * be complete before any worker starts executing work items with | ||
1357 | * concurrency management enabled; otherwise, the scheduler may oops trying to | ||
1358 | * wake up a non-local idle worker from wq_worker_sleeping(). | ||
1359 | * | ||
1360 | * This is achieved by repeatedly requesting rebinding until all idle | ||
1361 | * workers are known to have been rebound under @gcwq->lock and holding all | ||
1362 | * idle workers from becoming busy until idle rebinding is complete. | ||
1363 | * | ||
1364 | * Once idle workers are rebound, busy workers can be rebound as they | ||
1365 | * finish executing their current work items. Queueing the rebind work at | ||
1366 | * the head of their scheduled lists is enough. Note that nr_running will | ||
1367 | * be properbly bumped as busy workers rebind. | ||
1368 | * | ||
1369 | * On return, all workers are guaranteed to either be bound or have a rebind | ||
1370 | * work item scheduled. | ||
1371 | */ | ||
1372 | static void rebind_workers(struct global_cwq *gcwq) | ||
1373 | __releases(&gcwq->lock) __acquires(&gcwq->lock) | ||
1374 | { | ||
1375 | struct idle_rebind idle_rebind; | ||
1376 | struct worker_pool *pool; | ||
1377 | struct worker *worker; | ||
1378 | struct hlist_node *pos; | ||
1379 | int i; | ||
1380 | |||
1381 | lockdep_assert_held(&gcwq->lock); | ||
1382 | |||
1383 | for_each_worker_pool(pool, gcwq) | ||
1384 | lockdep_assert_held(&pool->manager_mutex); | ||
1385 | |||
1386 | /* | ||
1387 | * Rebind idle workers. Interlocked both ways. We wait for | ||
1388 | * workers to rebind via @idle_rebind.done. Workers will wait for | ||
1389 | * us to finish up by watching %WORKER_REBIND. | ||
1390 | */ | ||
1391 | init_completion(&idle_rebind.done); | ||
1392 | retry: | ||
1393 | idle_rebind.cnt = 1; | ||
1394 | INIT_COMPLETION(idle_rebind.done); | ||
1395 | |||
1396 | /* set REBIND and kick idle ones, we'll wait for these later */ | ||
1397 | for_each_worker_pool(pool, gcwq) { | ||
1398 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
1399 | if (worker->flags & WORKER_REBIND) | ||
1400 | continue; | ||
1401 | |||
1402 | /* morph UNBOUND to REBIND */ | ||
1403 | worker->flags &= ~WORKER_UNBOUND; | ||
1404 | worker->flags |= WORKER_REBIND; | ||
1405 | |||
1406 | idle_rebind.cnt++; | ||
1407 | worker->idle_rebind = &idle_rebind; | ||
1408 | |||
1409 | /* worker_thread() will call idle_worker_rebind() */ | ||
1410 | wake_up_process(worker->task); | ||
1411 | } | ||
1412 | } | ||
1413 | |||
1414 | if (--idle_rebind.cnt) { | ||
1415 | spin_unlock_irq(&gcwq->lock); | ||
1416 | wait_for_completion(&idle_rebind.done); | ||
1417 | spin_lock_irq(&gcwq->lock); | ||
1418 | /* busy ones might have become idle while waiting, retry */ | ||
1419 | goto retry; | ||
1420 | } | ||
1421 | |||
1422 | /* | ||
1423 | * All idle workers are rebound and waiting for %WORKER_REBIND to | ||
1424 | * be cleared inside idle_worker_rebind(). Clear and release. | ||
1425 | * Clearing %WORKER_REBIND from this foreign context is safe | ||
1426 | * because these workers are still guaranteed to be idle. | ||
1427 | */ | ||
1428 | for_each_worker_pool(pool, gcwq) | ||
1429 | list_for_each_entry(worker, &pool->idle_list, entry) | ||
1430 | worker->flags &= ~WORKER_REBIND; | ||
1431 | |||
1432 | wake_up_all(&gcwq->rebind_hold); | ||
1433 | |||
1434 | /* rebind busy workers */ | ||
1435 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
1436 | struct work_struct *rebind_work = &worker->rebind_work; | ||
1437 | |||
1438 | /* morph UNBOUND to REBIND */ | ||
1439 | worker->flags &= ~WORKER_UNBOUND; | ||
1440 | worker->flags |= WORKER_REBIND; | ||
1441 | |||
1442 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | ||
1443 | work_data_bits(rebind_work))) | ||
1444 | continue; | ||
1445 | |||
1446 | /* wq doesn't matter, use the default one */ | ||
1447 | debug_work_activate(rebind_work); | ||
1448 | insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, | ||
1449 | worker->scheduled.next, | ||
1450 | work_color_to_flags(WORK_NO_COLOR)); | ||
1451 | } | ||
1452 | } | ||
1453 | |||
1328 | static struct worker *alloc_worker(void) | 1454 | static struct worker *alloc_worker(void) |
1329 | { | 1455 | { |
1330 | struct worker *worker; | 1456 | struct worker *worker; |
@@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void) | |||
1333 | if (worker) { | 1459 | if (worker) { |
1334 | INIT_LIST_HEAD(&worker->entry); | 1460 | INIT_LIST_HEAD(&worker->entry); |
1335 | INIT_LIST_HEAD(&worker->scheduled); | 1461 | INIT_LIST_HEAD(&worker->scheduled); |
1336 | INIT_WORK(&worker->rebind_work, worker_rebind_fn); | 1462 | INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); |
1337 | /* on creation a worker is in !idle && prep state */ | 1463 | /* on creation a worker is in !idle && prep state */ |
1338 | worker->flags = WORKER_PREP; | 1464 | worker->flags = WORKER_PREP; |
1339 | } | 1465 | } |
@@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void) | |||
1342 | 1468 | ||
1343 | /** | 1469 | /** |
1344 | * create_worker - create a new workqueue worker | 1470 | * create_worker - create a new workqueue worker |
1345 | * @gcwq: gcwq the new worker will belong to | 1471 | * @pool: pool the new worker will belong to |
1346 | * @bind: whether to set affinity to @cpu or not | ||
1347 | * | 1472 | * |
1348 | * Create a new worker which is bound to @gcwq. The returned worker | 1473 | * Create a new worker which is bound to @pool. The returned worker |
1349 | * can be started by calling start_worker() or destroyed using | 1474 | * can be started by calling start_worker() or destroyed using |
1350 | * destroy_worker(). | 1475 | * destroy_worker(). |
1351 | * | 1476 | * |
@@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void) | |||
1355 | * RETURNS: | 1480 | * RETURNS: |
1356 | * Pointer to the newly created worker. | 1481 | * Pointer to the newly created worker. |
1357 | */ | 1482 | */ |
1358 | static struct worker *create_worker(struct global_cwq *gcwq, bool bind) | 1483 | static struct worker *create_worker(struct worker_pool *pool) |
1359 | { | 1484 | { |
1360 | bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; | 1485 | struct global_cwq *gcwq = pool->gcwq; |
1486 | const char *pri = worker_pool_pri(pool) ? "H" : ""; | ||
1361 | struct worker *worker = NULL; | 1487 | struct worker *worker = NULL; |
1362 | int id = -1; | 1488 | int id = -1; |
1363 | 1489 | ||
1364 | spin_lock_irq(&gcwq->lock); | 1490 | spin_lock_irq(&gcwq->lock); |
1365 | while (ida_get_new(&gcwq->worker_ida, &id)) { | 1491 | while (ida_get_new(&pool->worker_ida, &id)) { |
1366 | spin_unlock_irq(&gcwq->lock); | 1492 | spin_unlock_irq(&gcwq->lock); |
1367 | if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) | 1493 | if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) |
1368 | goto fail; | 1494 | goto fail; |
1369 | spin_lock_irq(&gcwq->lock); | 1495 | spin_lock_irq(&gcwq->lock); |
1370 | } | 1496 | } |
@@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) | |||
1374 | if (!worker) | 1500 | if (!worker) |
1375 | goto fail; | 1501 | goto fail; |
1376 | 1502 | ||
1377 | worker->gcwq = gcwq; | 1503 | worker->pool = pool; |
1378 | worker->id = id; | 1504 | worker->id = id; |
1379 | 1505 | ||
1380 | if (!on_unbound_cpu) | 1506 | if (gcwq->cpu != WORK_CPU_UNBOUND) |
1381 | worker->task = kthread_create_on_node(worker_thread, | 1507 | worker->task = kthread_create_on_node(worker_thread, |
1382 | worker, | 1508 | worker, cpu_to_node(gcwq->cpu), |
1383 | cpu_to_node(gcwq->cpu), | 1509 | "kworker/%u:%d%s", gcwq->cpu, id, pri); |
1384 | "kworker/%u:%d", gcwq->cpu, id); | ||
1385 | else | 1510 | else |
1386 | worker->task = kthread_create(worker_thread, worker, | 1511 | worker->task = kthread_create(worker_thread, worker, |
1387 | "kworker/u:%d", id); | 1512 | "kworker/u:%d%s", id, pri); |
1388 | if (IS_ERR(worker->task)) | 1513 | if (IS_ERR(worker->task)) |
1389 | goto fail; | 1514 | goto fail; |
1390 | 1515 | ||
1516 | if (worker_pool_pri(pool)) | ||
1517 | set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); | ||
1518 | |||
1391 | /* | 1519 | /* |
1392 | * A rogue worker will become a regular one if CPU comes | 1520 | * Determine CPU binding of the new worker depending on |
1393 | * online later on. Make sure every worker has | 1521 | * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the |
1394 | * PF_THREAD_BOUND set. | 1522 | * flag remains stable across this function. See the comments |
1523 | * above the flag definition for details. | ||
1524 | * | ||
1525 | * As an unbound worker may later become a regular one if the CPU comes | ||
1526 | * online, make sure every worker has %PF_THREAD_BOUND set. | ||
1395 | */ | 1527 | */ |
1396 | if (bind && !on_unbound_cpu) | 1528 | if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { |
1397 | kthread_bind(worker->task, gcwq->cpu); | 1529 | kthread_bind(worker->task, gcwq->cpu); |
1398 | else { | 1530 | } else { |
1399 | worker->task->flags |= PF_THREAD_BOUND; | 1531 | worker->task->flags |= PF_THREAD_BOUND; |
1400 | if (on_unbound_cpu) | 1532 | worker->flags |= WORKER_UNBOUND; |
1401 | worker->flags |= WORKER_UNBOUND; | ||
1402 | } | 1533 | } |
1403 | 1534 | ||
1404 | return worker; | 1535 | return worker; |
1405 | fail: | 1536 | fail: |
1406 | if (id >= 0) { | 1537 | if (id >= 0) { |
1407 | spin_lock_irq(&gcwq->lock); | 1538 | spin_lock_irq(&gcwq->lock); |
1408 | ida_remove(&gcwq->worker_ida, id); | 1539 | ida_remove(&pool->worker_ida, id); |
1409 | spin_unlock_irq(&gcwq->lock); | 1540 | spin_unlock_irq(&gcwq->lock); |
1410 | } | 1541 | } |
1411 | kfree(worker); | 1542 | kfree(worker); |
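create_worker() above uses the standard sequence for spawning a CPU-affine kthread: create it stopped on the CPU's NUMA node, pin it with kthread_bind() before it has ever run, then wake it. A stripped-down, hypothetical sketch of that sequence (spawn_bound_thread() is illustrative, not part of workqueue.c):

    #include <linux/kthread.h>
    #include <linux/sched.h>
    #include <linux/err.h>
    #include <linux/topology.h>

    static struct task_struct *spawn_bound_thread(int (*fn)(void *), void *arg,
                                                  unsigned int cpu)
    {
            struct task_struct *task;

            /* created in a stopped state, allocated near @cpu */
            task = kthread_create_on_node(fn, arg, cpu_to_node(cpu),
                                          "example/%u", cpu);
            if (IS_ERR(task))
                    return task;

            /* binding is only legal while the thread has never run */
            kthread_bind(task, cpu);
            wake_up_process(task);
            return task;
    }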
@@ -1424,7 +1555,7 @@ fail: | |||
1424 | static void start_worker(struct worker *worker) | 1555 | static void start_worker(struct worker *worker) |
1425 | { | 1556 | { |
1426 | worker->flags |= WORKER_STARTED; | 1557 | worker->flags |= WORKER_STARTED; |
1427 | worker->gcwq->nr_workers++; | 1558 | worker->pool->nr_workers++; |
1428 | worker_enter_idle(worker); | 1559 | worker_enter_idle(worker); |
1429 | wake_up_process(worker->task); | 1560 | wake_up_process(worker->task); |
1430 | } | 1561 | } |
@@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker) | |||
1440 | */ | 1571 | */ |
1441 | static void destroy_worker(struct worker *worker) | 1572 | static void destroy_worker(struct worker *worker) |
1442 | { | 1573 | { |
1443 | struct global_cwq *gcwq = worker->gcwq; | 1574 | struct worker_pool *pool = worker->pool; |
1575 | struct global_cwq *gcwq = pool->gcwq; | ||
1444 | int id = worker->id; | 1576 | int id = worker->id; |
1445 | 1577 | ||
1446 | /* sanity check frenzy */ | 1578 | /* sanity check frenzy */ |
@@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker) | |||
1448 | BUG_ON(!list_empty(&worker->scheduled)); | 1580 | BUG_ON(!list_empty(&worker->scheduled)); |
1449 | 1581 | ||
1450 | if (worker->flags & WORKER_STARTED) | 1582 | if (worker->flags & WORKER_STARTED) |
1451 | gcwq->nr_workers--; | 1583 | pool->nr_workers--; |
1452 | if (worker->flags & WORKER_IDLE) | 1584 | if (worker->flags & WORKER_IDLE) |
1453 | gcwq->nr_idle--; | 1585 | pool->nr_idle--; |
1454 | 1586 | ||
1455 | list_del_init(&worker->entry); | 1587 | list_del_init(&worker->entry); |
1456 | worker->flags |= WORKER_DIE; | 1588 | worker->flags |= WORKER_DIE; |
@@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker) | |||
1461 | kfree(worker); | 1593 | kfree(worker); |
1462 | 1594 | ||
1463 | spin_lock_irq(&gcwq->lock); | 1595 | spin_lock_irq(&gcwq->lock); |
1464 | ida_remove(&gcwq->worker_ida, id); | 1596 | ida_remove(&pool->worker_ida, id); |
1465 | } | 1597 | } |
1466 | 1598 | ||
1467 | static void idle_worker_timeout(unsigned long __gcwq) | 1599 | static void idle_worker_timeout(unsigned long __pool) |
1468 | { | 1600 | { |
1469 | struct global_cwq *gcwq = (void *)__gcwq; | 1601 | struct worker_pool *pool = (void *)__pool; |
1602 | struct global_cwq *gcwq = pool->gcwq; | ||
1470 | 1603 | ||
1471 | spin_lock_irq(&gcwq->lock); | 1604 | spin_lock_irq(&gcwq->lock); |
1472 | 1605 | ||
1473 | if (too_many_workers(gcwq)) { | 1606 | if (too_many_workers(pool)) { |
1474 | struct worker *worker; | 1607 | struct worker *worker; |
1475 | unsigned long expires; | 1608 | unsigned long expires; |
1476 | 1609 | ||
1477 | /* idle_list is kept in LIFO order, check the last one */ | 1610 | /* idle_list is kept in LIFO order, check the last one */ |
1478 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); | 1611 | worker = list_entry(pool->idle_list.prev, struct worker, entry); |
1479 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | 1612 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; |
1480 | 1613 | ||
1481 | if (time_before(jiffies, expires)) | 1614 | if (time_before(jiffies, expires)) |
1482 | mod_timer(&gcwq->idle_timer, expires); | 1615 | mod_timer(&pool->idle_timer, expires); |
1483 | else { | 1616 | else { |
1484 | /* it's been idle for too long, wake up manager */ | 1617 | /* it's been idle for too long, wake up manager */ |
1485 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | 1618 | pool->flags |= POOL_MANAGE_WORKERS; |
1486 | wake_up_worker(gcwq); | 1619 | wake_up_worker(pool); |
1487 | } | 1620 | } |
1488 | } | 1621 | } |
1489 | 1622 | ||
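The idle timer now lives in the worker_pool, but its re-arm logic is untouched: look at the oldest idle entry and, if it has not been idle for the full timeout, push the timer out to that entry's expiry instead of reaping anything. A hypothetical standalone sketch of the same idiom with the timer API used in this tree (struct reaper and the timeout constant are made up):

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    #define EXAMPLE_IDLE_TIMEOUT    (300 * HZ)

    struct reaper {
            struct timer_list timer;
            unsigned long last_active;      /* jiffies of last activity */
    };

    static void reaper_timeout(unsigned long __r)
    {
            struct reaper *r = (struct reaper *)__r;
            unsigned long expires = r->last_active + EXAMPLE_IDLE_TIMEOUT;

            /* not idle long enough yet - re-arm for the remaining time */
            if (time_before(jiffies, expires)) {
                    mod_timer(&r->timer, expires);
                    return;
            }

            /* ... reclaim the idle resource here ... */
    }

    static void reaper_init(struct reaper *r)
    {
            r->last_active = jiffies;
            setup_timer(&r->timer, reaper_timeout, (unsigned long)r);
    }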
@@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work) | |||
1500 | return false; | 1633 | return false; |
1501 | 1634 | ||
1502 | /* mayday mayday mayday */ | 1635 | /* mayday mayday mayday */ |
1503 | cpu = cwq->gcwq->cpu; | 1636 | cpu = cwq->pool->gcwq->cpu; |
1504 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ | 1637 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ |
1505 | if (cpu == WORK_CPU_UNBOUND) | 1638 | if (cpu == WORK_CPU_UNBOUND) |
1506 | cpu = 0; | 1639 | cpu = 0; |
@@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work) | |||
1509 | return true; | 1642 | return true; |
1510 | } | 1643 | } |
1511 | 1644 | ||
1512 | static void gcwq_mayday_timeout(unsigned long __gcwq) | 1645 | static void gcwq_mayday_timeout(unsigned long __pool) |
1513 | { | 1646 | { |
1514 | struct global_cwq *gcwq = (void *)__gcwq; | 1647 | struct worker_pool *pool = (void *)__pool; |
1648 | struct global_cwq *gcwq = pool->gcwq; | ||
1515 | struct work_struct *work; | 1649 | struct work_struct *work; |
1516 | 1650 | ||
1517 | spin_lock_irq(&gcwq->lock); | 1651 | spin_lock_irq(&gcwq->lock); |
1518 | 1652 | ||
1519 | if (need_to_create_worker(gcwq)) { | 1653 | if (need_to_create_worker(pool)) { |
1520 | /* | 1654 | /* |
1521 | * We've been trying to create a new worker but | 1655 | * We've been trying to create a new worker but |
1522 | * haven't been successful. We might be hitting an | 1656 | * haven't been successful. We might be hitting an |
1523 | * allocation deadlock. Send distress signals to | 1657 | * allocation deadlock. Send distress signals to |
1524 | * rescuers. | 1658 | * rescuers. |
1525 | */ | 1659 | */ |
1526 | list_for_each_entry(work, &gcwq->worklist, entry) | 1660 | list_for_each_entry(work, &pool->worklist, entry) |
1527 | send_mayday(work); | 1661 | send_mayday(work); |
1528 | } | 1662 | } |
1529 | 1663 | ||
1530 | spin_unlock_irq(&gcwq->lock); | 1664 | spin_unlock_irq(&gcwq->lock); |
1531 | 1665 | ||
1532 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); | 1666 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); |
1533 | } | 1667 | } |
1534 | 1668 | ||
1535 | /** | 1669 | /** |
1536 | * maybe_create_worker - create a new worker if necessary | 1670 | * maybe_create_worker - create a new worker if necessary |
1537 | * @gcwq: gcwq to create a new worker for | 1671 | * @pool: pool to create a new worker for |
1538 | * | 1672 | * |
1539 | * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to | 1673 | * Create a new worker for @pool if necessary. @pool is guaranteed to |
1540 | * have at least one idle worker on return from this function. If | 1674 | * have at least one idle worker on return from this function. If |
1541 | * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is | 1675 | * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is |
1542 | * sent to all rescuers with works scheduled on @gcwq to resolve | 1676 | * sent to all rescuers with works scheduled on @pool to resolve |
1543 | * possible allocation deadlock. | 1677 | * possible allocation deadlock. |
1544 | * | 1678 | * |
1545 | * On return, need_to_create_worker() is guaranteed to be false and | 1679 | * On return, need_to_create_worker() is guaranteed to be false and |
@@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) | |||
1554 | * false if no action was taken and gcwq->lock stayed locked, true | 1688 | * false if no action was taken and gcwq->lock stayed locked, true |
1555 | * otherwise. | 1689 | * otherwise. |
1556 | */ | 1690 | */ |
1557 | static bool maybe_create_worker(struct global_cwq *gcwq) | 1691 | static bool maybe_create_worker(struct worker_pool *pool) |
1558 | __releases(&gcwq->lock) | 1692 | __releases(&gcwq->lock) |
1559 | __acquires(&gcwq->lock) | 1693 | __acquires(&gcwq->lock) |
1560 | { | 1694 | { |
1561 | if (!need_to_create_worker(gcwq)) | 1695 | struct global_cwq *gcwq = pool->gcwq; |
1696 | |||
1697 | if (!need_to_create_worker(pool)) | ||
1562 | return false; | 1698 | return false; |
1563 | restart: | 1699 | restart: |
1564 | spin_unlock_irq(&gcwq->lock); | 1700 | spin_unlock_irq(&gcwq->lock); |
1565 | 1701 | ||
1566 | /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ | 1702 | /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ |
1567 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); | 1703 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); |
1568 | 1704 | ||
1569 | while (true) { | 1705 | while (true) { |
1570 | struct worker *worker; | 1706 | struct worker *worker; |
1571 | 1707 | ||
1572 | worker = create_worker(gcwq, true); | 1708 | worker = create_worker(pool); |
1573 | if (worker) { | 1709 | if (worker) { |
1574 | del_timer_sync(&gcwq->mayday_timer); | 1710 | del_timer_sync(&pool->mayday_timer); |
1575 | spin_lock_irq(&gcwq->lock); | 1711 | spin_lock_irq(&gcwq->lock); |
1576 | start_worker(worker); | 1712 | start_worker(worker); |
1577 | BUG_ON(need_to_create_worker(gcwq)); | 1713 | BUG_ON(need_to_create_worker(pool)); |
1578 | return true; | 1714 | return true; |
1579 | } | 1715 | } |
1580 | 1716 | ||
1581 | if (!need_to_create_worker(gcwq)) | 1717 | if (!need_to_create_worker(pool)) |
1582 | break; | 1718 | break; |
1583 | 1719 | ||
1584 | __set_current_state(TASK_INTERRUPTIBLE); | 1720 | __set_current_state(TASK_INTERRUPTIBLE); |
1585 | schedule_timeout(CREATE_COOLDOWN); | 1721 | schedule_timeout(CREATE_COOLDOWN); |
1586 | 1722 | ||
1587 | if (!need_to_create_worker(gcwq)) | 1723 | if (!need_to_create_worker(pool)) |
1588 | break; | 1724 | break; |
1589 | } | 1725 | } |
1590 | 1726 | ||
1591 | del_timer_sync(&gcwq->mayday_timer); | 1727 | del_timer_sync(&pool->mayday_timer); |
1592 | spin_lock_irq(&gcwq->lock); | 1728 | spin_lock_irq(&gcwq->lock); |
1593 | if (need_to_create_worker(gcwq)) | 1729 | if (need_to_create_worker(pool)) |
1594 | goto restart; | 1730 | goto restart; |
1595 | return true; | 1731 | return true; |
1596 | } | 1732 | } |
1597 | 1733 | ||
1598 | /** | 1734 | /** |
1599 | * maybe_destroy_worker - destroy workers which have been idle for a while | 1735 | * maybe_destroy_worker - destroy workers which have been idle for a while |
1600 | * @gcwq: gcwq to destroy workers for | 1736 | * @pool: pool to destroy workers for |
1601 | * | 1737 | * |
1602 | * Destroy @gcwq workers which have been idle for longer than | 1738 | * Destroy @pool workers which have been idle for longer than |
1603 | * IDLE_WORKER_TIMEOUT. | 1739 | * IDLE_WORKER_TIMEOUT. |
1604 | * | 1740 | * |
1605 | * LOCKING: | 1741 | * LOCKING: |
@@ -1610,19 +1746,19 @@ restart: | |||
1610 | * false if no action was taken and gcwq->lock stayed locked, true | 1746 | * false if no action was taken and gcwq->lock stayed locked, true |
1611 | * otherwise. | 1747 | * otherwise. |
1612 | */ | 1748 | */ |
1613 | static bool maybe_destroy_workers(struct global_cwq *gcwq) | 1749 | static bool maybe_destroy_workers(struct worker_pool *pool) |
1614 | { | 1750 | { |
1615 | bool ret = false; | 1751 | bool ret = false; |
1616 | 1752 | ||
1617 | while (too_many_workers(gcwq)) { | 1753 | while (too_many_workers(pool)) { |
1618 | struct worker *worker; | 1754 | struct worker *worker; |
1619 | unsigned long expires; | 1755 | unsigned long expires; |
1620 | 1756 | ||
1621 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); | 1757 | worker = list_entry(pool->idle_list.prev, struct worker, entry); |
1622 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | 1758 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; |
1623 | 1759 | ||
1624 | if (time_before(jiffies, expires)) { | 1760 | if (time_before(jiffies, expires)) { |
1625 | mod_timer(&gcwq->idle_timer, expires); | 1761 | mod_timer(&pool->idle_timer, expires); |
1626 | break; | 1762 | break; |
1627 | } | 1763 | } |
1628 | 1764 | ||
@@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) | |||
1655 | */ | 1791 | */ |
1656 | static bool manage_workers(struct worker *worker) | 1792 | static bool manage_workers(struct worker *worker) |
1657 | { | 1793 | { |
1658 | struct global_cwq *gcwq = worker->gcwq; | 1794 | struct worker_pool *pool = worker->pool; |
1659 | bool ret = false; | 1795 | bool ret = false; |
1660 | 1796 | ||
1661 | if (gcwq->flags & GCWQ_MANAGING_WORKERS) | 1797 | if (!mutex_trylock(&pool->manager_mutex)) |
1662 | return ret; | 1798 | return ret; |
1663 | 1799 | ||
1664 | gcwq->flags &= ~GCWQ_MANAGE_WORKERS; | 1800 | pool->flags &= ~POOL_MANAGE_WORKERS; |
1665 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
1666 | 1801 | ||
1667 | /* | 1802 | /* |
1668 | * Destroy and then create so that may_start_working() is true | 1803 | * Destroy and then create so that may_start_working() is true |
1669 | * on return. | 1804 | * on return. |
1670 | */ | 1805 | */ |
1671 | ret |= maybe_destroy_workers(gcwq); | 1806 | ret |= maybe_destroy_workers(pool); |
1672 | ret |= maybe_create_worker(gcwq); | 1807 | ret |= maybe_create_worker(pool); |
1673 | |||
1674 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
1675 | |||
1676 | /* | ||
1677 | * The trustee might be waiting to take over the manager | ||
1678 | * position, tell it we're done. | ||
1679 | */ | ||
1680 | if (unlikely(gcwq->trustee)) | ||
1681 | wake_up_all(&gcwq->trustee_wait); | ||
1682 | 1808 | ||
1809 | mutex_unlock(&pool->manager_mutex); | ||
1683 | return ret; | 1810 | return ret; |
1684 | } | 1811 | } |
1685 | 1812 | ||
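manage_workers() now relies on mutex_trylock() of the per-pool manager_mutex instead of the old GCWQ_MANAGING_WORKERS flag, so a worker that loses the race simply goes back to processing work while the hotplug path can take the same mutex to exclude all managers. A hypothetical sketch of the non-blocking-manager idiom:

    #include <linux/mutex.h>
    #include <linux/types.h>

    static DEFINE_MUTEX(example_manager_mutex);

    /* Returns true if we acted as manager, false if someone else already was. */
    static bool example_try_manage(void)
    {
            if (!mutex_trylock(&example_manager_mutex))
                    return false;

            /* exclusive section: at most one manager runs this at a time */
            /* ... destroy surplus workers, create missing ones ... */

            mutex_unlock(&example_manager_mutex);
            return true;
    }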
@@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | |||
1728 | { | 1855 | { |
1729 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | 1856 | struct work_struct *work = list_first_entry(&cwq->delayed_works, |
1730 | struct work_struct, entry); | 1857 | struct work_struct, entry); |
1731 | struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); | ||
1732 | 1858 | ||
1733 | trace_workqueue_activate_work(work); | 1859 | trace_workqueue_activate_work(work); |
1734 | move_linked_works(work, pos, NULL); | 1860 | move_linked_works(work, &cwq->pool->worklist, NULL); |
1735 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | 1861 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); |
1736 | cwq->nr_active++; | 1862 | cwq->nr_active++; |
1737 | } | 1863 | } |
@@ -1804,7 +1930,8 @@ __releases(&gcwq->lock) | |||
1804 | __acquires(&gcwq->lock) | 1930 | __acquires(&gcwq->lock) |
1805 | { | 1931 | { |
1806 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | 1932 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); |
1807 | struct global_cwq *gcwq = cwq->gcwq; | 1933 | struct worker_pool *pool = worker->pool; |
1934 | struct global_cwq *gcwq = pool->gcwq; | ||
1808 | struct hlist_head *bwh = busy_worker_head(gcwq, work); | 1935 | struct hlist_head *bwh = busy_worker_head(gcwq, work); |
1809 | bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; | 1936 | bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; |
1810 | work_func_t f = work->func; | 1937 | work_func_t f = work->func; |
@@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock) | |||
1823 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); | 1950 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); |
1824 | #endif | 1951 | #endif |
1825 | /* | 1952 | /* |
1953 | * Ensure we're on the correct CPU. DISASSOCIATED test is | ||
1954 | * necessary to avoid spurious warnings from rescuers servicing the | ||
1955 | * unbound or a disassociated gcwq. | ||
1956 | */ | ||
1957 | WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && | ||
1958 | !(gcwq->flags & GCWQ_DISASSOCIATED) && | ||
1959 | raw_smp_processor_id() != gcwq->cpu); | ||
1960 | |||
1961 | /* | ||
1826 | * A single work shouldn't be executed concurrently by | 1962 | * A single work shouldn't be executed concurrently by |
1827 | * multiple workers on a single cpu. Check whether anyone is | 1963 | * multiple workers on a single cpu. Check whether anyone is |
1828 | * already processing the work. If so, defer the work to the | 1964 | * already processing the work. If so, defer the work to the |
@@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock) | |||
1846 | list_del_init(&work->entry); | 1982 | list_del_init(&work->entry); |
1847 | 1983 | ||
1848 | /* | 1984 | /* |
1849 | * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, | ||
1850 | * wake up another worker; otherwise, clear HIGHPRI_PENDING. | ||
1851 | */ | ||
1852 | if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { | ||
1853 | struct work_struct *nwork = list_first_entry(&gcwq->worklist, | ||
1854 | struct work_struct, entry); | ||
1855 | |||
1856 | if (!list_empty(&gcwq->worklist) && | ||
1857 | get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) | ||
1858 | wake_up_worker(gcwq); | ||
1859 | else | ||
1860 | gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; | ||
1861 | } | ||
1862 | |||
1863 | /* | ||
1864 | * CPU intensive works don't participate in concurrency | 1985 | * CPU intensive works don't participate in concurrency |
1865 | * management. They're the scheduler's responsibility. | 1986 | * management. They're the scheduler's responsibility. |
1866 | */ | 1987 | */ |
1867 | if (unlikely(cpu_intensive)) | 1988 | if (unlikely(cpu_intensive)) |
1868 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); | 1989 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); |
1869 | 1990 | ||
1991 | /* | ||
1992 | * Unbound gcwq isn't concurrency managed and work items should be | ||
1993 | * executed ASAP. Wake up another worker if necessary. | ||
1994 | */ | ||
1995 | if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) | ||
1996 | wake_up_worker(pool); | ||
1997 | |||
1870 | spin_unlock_irq(&gcwq->lock); | 1998 | spin_unlock_irq(&gcwq->lock); |
1871 | 1999 | ||
1872 | work_clear_pending(work); | 2000 | work_clear_pending(work); |
@@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker) | |||
1939 | static int worker_thread(void *__worker) | 2067 | static int worker_thread(void *__worker) |
1940 | { | 2068 | { |
1941 | struct worker *worker = __worker; | 2069 | struct worker *worker = __worker; |
1942 | struct global_cwq *gcwq = worker->gcwq; | 2070 | struct worker_pool *pool = worker->pool; |
2071 | struct global_cwq *gcwq = pool->gcwq; | ||
1943 | 2072 | ||
1944 | /* tell the scheduler that this is a workqueue worker */ | 2073 | /* tell the scheduler that this is a workqueue worker */ |
1945 | worker->task->flags |= PF_WQ_WORKER; | 2074 | worker->task->flags |= PF_WQ_WORKER; |
1946 | woke_up: | 2075 | woke_up: |
1947 | spin_lock_irq(&gcwq->lock); | 2076 | spin_lock_irq(&gcwq->lock); |
1948 | 2077 | ||
1949 | /* DIE can be set only while we're idle, checking here is enough */ | 2078 | /* |
1950 | if (worker->flags & WORKER_DIE) { | 2079 | * DIE can be set only while idle and REBIND set while busy has |
2080 | * @worker->rebind_work scheduled. Checking here is enough. | ||
2081 | */ | ||
2082 | if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { | ||
1951 | spin_unlock_irq(&gcwq->lock); | 2083 | spin_unlock_irq(&gcwq->lock); |
1952 | worker->task->flags &= ~PF_WQ_WORKER; | 2084 | |
1953 | return 0; | 2085 | if (worker->flags & WORKER_DIE) { |
2086 | worker->task->flags &= ~PF_WQ_WORKER; | ||
2087 | return 0; | ||
2088 | } | ||
2089 | |||
2090 | idle_worker_rebind(worker); | ||
2091 | goto woke_up; | ||
1954 | } | 2092 | } |
1955 | 2093 | ||
1956 | worker_leave_idle(worker); | 2094 | worker_leave_idle(worker); |
1957 | recheck: | 2095 | recheck: |
1958 | /* no more worker necessary? */ | 2096 | /* no more worker necessary? */ |
1959 | if (!need_more_worker(gcwq)) | 2097 | if (!need_more_worker(pool)) |
1960 | goto sleep; | 2098 | goto sleep; |
1961 | 2099 | ||
1962 | /* do we need to manage? */ | 2100 | /* do we need to manage? */ |
1963 | if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) | 2101 | if (unlikely(!may_start_working(pool)) && manage_workers(worker)) |
1964 | goto recheck; | 2102 | goto recheck; |
1965 | 2103 | ||
1966 | /* | 2104 | /* |
@@ -1979,7 +2117,7 @@ recheck: | |||
1979 | 2117 | ||
1980 | do { | 2118 | do { |
1981 | struct work_struct *work = | 2119 | struct work_struct *work = |
1982 | list_first_entry(&gcwq->worklist, | 2120 | list_first_entry(&pool->worklist, |
1983 | struct work_struct, entry); | 2121 | struct work_struct, entry); |
1984 | 2122 | ||
1985 | if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { | 2123 | if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { |
@@ -1991,11 +2129,11 @@ recheck: | |||
1991 | move_linked_works(work, &worker->scheduled, NULL); | 2129 | move_linked_works(work, &worker->scheduled, NULL); |
1992 | process_scheduled_works(worker); | 2130 | process_scheduled_works(worker); |
1993 | } | 2131 | } |
1994 | } while (keep_working(gcwq)); | 2132 | } while (keep_working(pool)); |
1995 | 2133 | ||
1996 | worker_set_flags(worker, WORKER_PREP, false); | 2134 | worker_set_flags(worker, WORKER_PREP, false); |
1997 | sleep: | 2135 | sleep: |
1998 | if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) | 2136 | if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) |
1999 | goto recheck; | 2137 | goto recheck; |
2000 | 2138 | ||
2001 | /* | 2139 | /* |
@@ -2053,14 +2191,15 @@ repeat: | |||
2053 | for_each_mayday_cpu(cpu, wq->mayday_mask) { | 2191 | for_each_mayday_cpu(cpu, wq->mayday_mask) { |
2054 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; | 2192 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; |
2055 | struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); | 2193 | struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); |
2056 | struct global_cwq *gcwq = cwq->gcwq; | 2194 | struct worker_pool *pool = cwq->pool; |
2195 | struct global_cwq *gcwq = pool->gcwq; | ||
2057 | struct work_struct *work, *n; | 2196 | struct work_struct *work, *n; |
2058 | 2197 | ||
2059 | __set_current_state(TASK_RUNNING); | 2198 | __set_current_state(TASK_RUNNING); |
2060 | mayday_clear_cpu(cpu, wq->mayday_mask); | 2199 | mayday_clear_cpu(cpu, wq->mayday_mask); |
2061 | 2200 | ||
2062 | /* migrate to the target cpu if possible */ | 2201 | /* migrate to the target cpu if possible */ |
2063 | rescuer->gcwq = gcwq; | 2202 | rescuer->pool = pool; |
2064 | worker_maybe_bind_and_lock(rescuer); | 2203 | worker_maybe_bind_and_lock(rescuer); |
2065 | 2204 | ||
2066 | /* | 2205 | /* |
@@ -2068,7 +2207,7 @@ repeat: | |||
2068 | * process'em. | 2207 | * process'em. |
2069 | */ | 2208 | */ |
2070 | BUG_ON(!list_empty(&rescuer->scheduled)); | 2209 | BUG_ON(!list_empty(&rescuer->scheduled)); |
2071 | list_for_each_entry_safe(work, n, &gcwq->worklist, entry) | 2210 | list_for_each_entry_safe(work, n, &pool->worklist, entry) |
2072 | if (get_work_cwq(work) == cwq) | 2211 | if (get_work_cwq(work) == cwq) |
2073 | move_linked_works(work, scheduled, &n); | 2212 | move_linked_works(work, scheduled, &n); |
2074 | 2213 | ||
@@ -2079,8 +2218,8 @@ repeat: | |||
2079 | * regular worker; otherwise, we end up with 0 concurrency | 2218 | * regular worker; otherwise, we end up with 0 concurrency |
2080 | * and stalling the execution. | 2219 | * and stalling the execution. |
2081 | */ | 2220 | */ |
2082 | if (keep_working(gcwq)) | 2221 | if (keep_working(pool)) |
2083 | wake_up_worker(gcwq); | 2222 | wake_up_worker(pool); |
2084 | 2223 | ||
2085 | spin_unlock_irq(&gcwq->lock); | 2224 | spin_unlock_irq(&gcwq->lock); |
2086 | } | 2225 | } |
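The rescuer now attaches to a worker_pool instead of a gcwq, but its role is unchanged: guarantee forward progress for workqueues used on the memory-reclaim path when no new worker can be created. From a driver's perspective that guarantee is still requested with WQ_MEM_RECLAIM alone; a hypothetical example:

    #include <linux/workqueue.h>
    #include <linux/errno.h>
    #include <linux/init.h>

    static struct workqueue_struct *example_reclaim_wq;

    static int __init example_reclaim_init(void)
    {
            /* WQ_MEM_RECLAIM makes the core create a dedicated rescuer thread */
            example_reclaim_wq = alloc_workqueue("example_reclaim",
                                                 WQ_MEM_RECLAIM, 1);
            if (!example_reclaim_wq)
                    return -ENOMEM;
            return 0;
    }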
@@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, | |||
2205 | 2344 | ||
2206 | for_each_cwq_cpu(cpu, wq) { | 2345 | for_each_cwq_cpu(cpu, wq) { |
2207 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 2346 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
2208 | struct global_cwq *gcwq = cwq->gcwq; | 2347 | struct global_cwq *gcwq = cwq->pool->gcwq; |
2209 | 2348 | ||
2210 | spin_lock_irq(&gcwq->lock); | 2349 | spin_lock_irq(&gcwq->lock); |
2211 | 2350 | ||
@@ -2421,9 +2560,9 @@ reflush: | |||
2421 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 2560 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
2422 | bool drained; | 2561 | bool drained; |
2423 | 2562 | ||
2424 | spin_lock_irq(&cwq->gcwq->lock); | 2563 | spin_lock_irq(&cwq->pool->gcwq->lock); |
2425 | drained = !cwq->nr_active && list_empty(&cwq->delayed_works); | 2564 | drained = !cwq->nr_active && list_empty(&cwq->delayed_works); |
2426 | spin_unlock_irq(&cwq->gcwq->lock); | 2565 | spin_unlock_irq(&cwq->pool->gcwq->lock); |
2427 | 2566 | ||
2428 | if (drained) | 2567 | if (drained) |
2429 | continue; | 2568 | continue; |
@@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | |||
2463 | */ | 2602 | */ |
2464 | smp_rmb(); | 2603 | smp_rmb(); |
2465 | cwq = get_work_cwq(work); | 2604 | cwq = get_work_cwq(work); |
2466 | if (unlikely(!cwq || gcwq != cwq->gcwq)) | 2605 | if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) |
2467 | goto already_gone; | 2606 | goto already_gone; |
2468 | } else if (wait_executing) { | 2607 | } else if (wait_executing) { |
2469 | worker = find_worker_executing_work(gcwq, work); | 2608 | worker = find_worker_executing_work(gcwq, work); |
@@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
2984 | if (flags & WQ_MEM_RECLAIM) | 3123 | if (flags & WQ_MEM_RECLAIM) |
2985 | flags |= WQ_RESCUER; | 3124 | flags |= WQ_RESCUER; |
2986 | 3125 | ||
2987 | /* | ||
2988 | * Unbound workqueues aren't concurrency managed and should be | ||
2989 | * dispatched to workers immediately. | ||
2990 | */ | ||
2991 | if (flags & WQ_UNBOUND) | ||
2992 | flags |= WQ_HIGHPRI; | ||
2993 | |||
2994 | max_active = max_active ?: WQ_DFL_ACTIVE; | 3126 | max_active = max_active ?: WQ_DFL_ACTIVE; |
2995 | max_active = wq_clamp_max_active(max_active, flags, wq->name); | 3127 | max_active = wq_clamp_max_active(max_active, flags, wq->name); |
2996 | 3128 | ||
@@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
3011 | for_each_cwq_cpu(cpu, wq) { | 3143 | for_each_cwq_cpu(cpu, wq) { |
3012 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3144 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3013 | struct global_cwq *gcwq = get_gcwq(cpu); | 3145 | struct global_cwq *gcwq = get_gcwq(cpu); |
3146 | int pool_idx = (bool)(flags & WQ_HIGHPRI); | ||
3014 | 3147 | ||
3015 | BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); | 3148 | BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); |
3016 | cwq->gcwq = gcwq; | 3149 | cwq->pool = &gcwq->pools[pool_idx]; |
3017 | cwq->wq = wq; | 3150 | cwq->wq = wq; |
3018 | cwq->flush_color = -1; | 3151 | cwq->flush_color = -1; |
3019 | cwq->max_active = max_active; | 3152 | cwq->max_active = max_active; |
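With the pool index derived from WQ_HIGHPRI here, a high-priority workqueue no longer reorders a shared worklist; its cwqs simply point at the second, dedicated pool of each gcwq, whose workers run at nice -20. Callers are unaffected and keep passing the flag as before; a hypothetical user might look like this:

    #include <linux/workqueue.h>
    #include <linux/errno.h>

    static struct workqueue_struct *example_hipri_wq;
    static struct work_struct example_work;

    static void example_work_fn(struct work_struct *work)
    {
            /* after this patch, runs on a worker from the highpri pool */
    }

    static int example_hipri_setup(void)
    {
            example_hipri_wq = alloc_workqueue("example_hipri", WQ_HIGHPRI, 0);
            if (!example_hipri_wq)
                    return -ENOMEM;

            INIT_WORK(&example_work, example_work_fn);
            queue_work(example_hipri_wq, &example_work);
            return 0;
    }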
@@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy); | |||
3225 | * gcwqs serve mix of short, long and very long running works making | 3358 | * gcwqs serve mix of short, long and very long running works making |
3226 | * blocked draining impractical. | 3359 | * blocked draining impractical. |
3227 | * | 3360 | * |
3228 | * This is solved by allowing a gcwq to be detached from CPU, running | 3361 | * This is solved by allowing a gcwq to be disassociated from the CPU, |
3229 | * it with unbound (rogue) workers and allowing it to be reattached | 3362 | * running as an unbound one, and allowing it to be reattached later if the |
3230 | * later if the cpu comes back online. A separate thread is created | 3363 | * cpu comes back online. |
3231 | * to govern a gcwq in such state and is called the trustee of the | ||
3232 | * gcwq. | ||
3233 | * | ||
3234 | * Trustee states and their descriptions. | ||
3235 | * | ||
3236 | * START Command state used on startup. On CPU_DOWN_PREPARE, a | ||
3237 | * new trustee is started with this state. | ||
3238 | * | ||
3239 | * IN_CHARGE Once started, trustee will enter this state after | ||
3240 | * assuming the manager role and making all existing | ||
3241 | * workers rogue. DOWN_PREPARE waits for trustee to | ||
3242 | * enter this state. After reaching IN_CHARGE, trustee | ||
3243 | * tries to execute the pending worklist until it's empty | ||
3244 | * and the state is set to BUTCHER, or the state is set | ||
3245 | * to RELEASE. | ||
3246 | * | ||
3247 | * BUTCHER Command state which is set by the cpu callback after | ||
3248 | * the cpu has went down. Once this state is set trustee | ||
3249 | * knows that there will be no new works on the worklist | ||
3250 | * and once the worklist is empty it can proceed to | ||
3251 | * killing idle workers. | ||
3252 | * | ||
3253 | * RELEASE Command state which is set by the cpu callback if the | ||
3254 | * cpu down has been canceled or it has come online | ||
3255 | * again. After recognizing this state, trustee stops | ||
3256 | * trying to drain or butcher and clears ROGUE, rebinds | ||
3257 | * all remaining workers back to the cpu and releases | ||
3258 | * manager role. | ||
3259 | * | ||
3260 | * DONE Trustee will enter this state after BUTCHER or RELEASE | ||
3261 | * is complete. | ||
3262 | * | ||
3263 | *          trustee                 CPU                draining | ||
3264 | *          took over               down               complete | ||
3265 | * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE | ||
3266 | *                        |                     |                  ^ | ||
3267 | *                        | CPU is back online  v   return workers | | ||
3268 | *                         ----------------> RELEASE -------------- | ||
3269 | */ | 3364 | */ |
3270 | 3365 | ||
3271 | /** | 3366 | /* claim manager positions of all pools */ |
3272 | * trustee_wait_event_timeout - timed event wait for trustee | 3367 | static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) |
3273 | * @cond: condition to wait for | ||
3274 | * @timeout: timeout in jiffies | ||
3275 | * | ||
3276 | * wait_event_timeout() for trustee to use. Handles locking and | ||
3277 | * checks for RELEASE request. | ||
3278 | * | ||
3279 | * CONTEXT: | ||
3280 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3281 | * multiple times. To be used by trustee. | ||
3282 | * | ||
3283 | * RETURNS: | ||
3284 | * Positive indicating left time if @cond is satisfied, 0 if timed | ||
3285 | * out, -1 if canceled. | ||
3286 | */ | ||
3287 | #define trustee_wait_event_timeout(cond, timeout) ({ \ | ||
3288 | long __ret = (timeout); \ | ||
3289 | while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ | ||
3290 | __ret) { \ | ||
3291 | spin_unlock_irq(&gcwq->lock); \ | ||
3292 | __wait_event_timeout(gcwq->trustee_wait, (cond) || \ | ||
3293 | (gcwq->trustee_state == TRUSTEE_RELEASE), \ | ||
3294 | __ret); \ | ||
3295 | spin_lock_irq(&gcwq->lock); \ | ||
3296 | } \ | ||
3297 | gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ | ||
3298 | }) | ||
3299 | |||
3300 | /** | ||
3301 | * trustee_wait_event - event wait for trustee | ||
3302 | * @cond: condition to wait for | ||
3303 | * | ||
3304 | * wait_event() for trustee to use. Automatically handles locking and | ||
3305 | * checks for CANCEL request. | ||
3306 | * | ||
3307 | * CONTEXT: | ||
3308 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3309 | * multiple times. To be used by trustee. | ||
3310 | * | ||
3311 | * RETURNS: | ||
3312 | * 0 if @cond is satisfied, -1 if canceled. | ||
3313 | */ | ||
3314 | #define trustee_wait_event(cond) ({ \ | ||
3315 | long __ret1; \ | ||
3316 | __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ | ||
3317 | __ret1 < 0 ? -1 : 0; \ | ||
3318 | }) | ||
3319 | |||
3320 | static int __cpuinit trustee_thread(void *__gcwq) | ||
3321 | { | 3368 | { |
3322 | struct global_cwq *gcwq = __gcwq; | 3369 | struct worker_pool *pool; |
3323 | struct worker *worker; | ||
3324 | struct work_struct *work; | ||
3325 | struct hlist_node *pos; | ||
3326 | long rc; | ||
3327 | int i; | ||
3328 | |||
3329 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
3330 | 3370 | ||
3371 | for_each_worker_pool(pool, gcwq) | ||
3372 | mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); | ||
3331 | spin_lock_irq(&gcwq->lock); | 3373 | spin_lock_irq(&gcwq->lock); |
3332 | /* | 3374 | } |
3333 | * Claim the manager position and make all workers rogue. | ||
3334 | * Trustee must be bound to the target cpu and can't be | ||
3335 | * cancelled. | ||
3336 | */ | ||
3337 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
3338 | rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); | ||
3339 | BUG_ON(rc < 0); | ||
3340 | |||
3341 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
3342 | |||
3343 | list_for_each_entry(worker, &gcwq->idle_list, entry) | ||
3344 | worker->flags |= WORKER_ROGUE; | ||
3345 | 3375 | ||
3346 | for_each_busy_worker(worker, i, pos, gcwq) | 3376 | /* release manager positions */ |
3347 | worker->flags |= WORKER_ROGUE; | 3377 | static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) |
3378 | { | ||
3379 | struct worker_pool *pool; | ||
3348 | 3380 | ||
3349 | /* | ||
3350 | * Call schedule() so that we cross rq->lock and thus can | ||
3351 | * guarantee sched callbacks see the rogue flag. This is | ||
3352 | * necessary as scheduler callbacks may be invoked from other | ||
3353 | * cpus. | ||
3354 | */ | ||
3355 | spin_unlock_irq(&gcwq->lock); | 3381 | spin_unlock_irq(&gcwq->lock); |
3356 | schedule(); | 3382 | for_each_worker_pool(pool, gcwq) |
3357 | spin_lock_irq(&gcwq->lock); | 3383 | mutex_unlock(&pool->manager_mutex); |
3384 | } | ||
3358 | 3385 | ||
3359 | /* | 3386 | static void gcwq_unbind_fn(struct work_struct *work) |
3360 | * Sched callbacks are disabled now. Zap nr_running. After | 3387 | { |
3361 | * this, nr_running stays zero and need_more_worker() and | 3388 | struct global_cwq *gcwq = get_gcwq(smp_processor_id()); |
3362 | * keep_working() are always true as long as the worklist is | 3389 | struct worker_pool *pool; |
3363 | * not empty. | 3390 | struct worker *worker; |
3364 | */ | 3391 | struct hlist_node *pos; |
3365 | atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); | 3392 | int i; |
3366 | 3393 | ||
3367 | spin_unlock_irq(&gcwq->lock); | 3394 | BUG_ON(gcwq->cpu != smp_processor_id()); |
3368 | del_timer_sync(&gcwq->idle_timer); | ||
3369 | spin_lock_irq(&gcwq->lock); | ||
3370 | 3395 | ||
3371 | /* | 3396 | gcwq_claim_management_and_lock(gcwq); |
3372 | * We're now in charge. Notify and proceed to drain. We need | ||
3373 | * to keep the gcwq running during the whole CPU down | ||
3374 | * procedure as other cpu hotunplug callbacks may need to | ||
3375 | * flush currently running tasks. | ||
3376 | */ | ||
3377 | gcwq->trustee_state = TRUSTEE_IN_CHARGE; | ||
3378 | wake_up_all(&gcwq->trustee_wait); | ||
3379 | 3397 | ||
3380 | /* | 3398 | /* |
3381 | * The original cpu is in the process of dying and may go away | 3399 | * We've claimed all manager positions. Make all workers unbound |
3382 | * anytime now. When that happens, we and all workers would | 3400 | * and set DISASSOCIATED. Before this, all workers except for the |
3383 | * be migrated to other cpus. Try draining any left work. We | 3401 | * ones which are still executing works from before the last CPU |
3384 | * want to get it over with ASAP - spam rescuers, wake up as | 3402 | * down must be on the cpu. After this, they may become diasporas. |
3385 | * many idlers as necessary and create new ones till the | ||
3386 | * worklist is empty. Note that if the gcwq is frozen, there | ||
3387 | * may be frozen works in freezable cwqs. Don't declare | ||
3388 | * completion while frozen. | ||
3389 | */ | 3403 | */ |
3390 | while (gcwq->nr_workers != gcwq->nr_idle || | 3404 | for_each_worker_pool(pool, gcwq) |
3391 | gcwq->flags & GCWQ_FREEZING || | 3405 | list_for_each_entry(worker, &pool->idle_list, entry) |
3392 | gcwq->trustee_state == TRUSTEE_IN_CHARGE) { | 3406 | worker->flags |= WORKER_UNBOUND; |
3393 | int nr_works = 0; | ||
3394 | |||
3395 | list_for_each_entry(work, &gcwq->worklist, entry) { | ||
3396 | send_mayday(work); | ||
3397 | nr_works++; | ||
3398 | } | ||
3399 | 3407 | ||
3400 | list_for_each_entry(worker, &gcwq->idle_list, entry) { | 3408 | for_each_busy_worker(worker, i, pos, gcwq) |
3401 | if (!nr_works--) | 3409 | worker->flags |= WORKER_UNBOUND; |
3402 | break; | ||
3403 | wake_up_process(worker->task); | ||
3404 | } | ||
3405 | 3410 | ||
3406 | if (need_to_create_worker(gcwq)) { | 3411 | gcwq->flags |= GCWQ_DISASSOCIATED; |
3407 | spin_unlock_irq(&gcwq->lock); | ||
3408 | worker = create_worker(gcwq, false); | ||
3409 | spin_lock_irq(&gcwq->lock); | ||
3410 | if (worker) { | ||
3411 | worker->flags |= WORKER_ROGUE; | ||
3412 | start_worker(worker); | ||
3413 | } | ||
3414 | } | ||
3415 | 3412 | ||
3416 | /* give a breather */ | 3413 | gcwq_release_management_and_unlock(gcwq); |
3417 | if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) | ||
3418 | break; | ||
3419 | } | ||
3420 | 3414 | ||
3421 | /* | 3415 | /* |
3422 | * Either all works have been scheduled and cpu is down, or | 3416 | * Call schedule() so that we cross rq->lock and thus can guarantee |
3423 | * cpu down has already been canceled. Wait for and butcher | 3417 | * sched callbacks see the %WORKER_UNBOUND flag. This is necessary |
3424 | * all workers till we're canceled. | 3418 | * as scheduler callbacks may be invoked from other cpus. |
3425 | */ | 3419 | */ |
3426 | do { | 3420 | schedule(); |
3427 | rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); | ||
3428 | while (!list_empty(&gcwq->idle_list)) | ||
3429 | destroy_worker(list_first_entry(&gcwq->idle_list, | ||
3430 | struct worker, entry)); | ||
3431 | } while (gcwq->nr_workers && rc >= 0); | ||
3432 | 3421 | ||
3433 | /* | 3422 | /* |
3434 | * At this point, either draining has completed and no worker | 3423 | * Sched callbacks are disabled now. Zap nr_running. After this, |
3435 | * is left, or cpu down has been canceled or the cpu is being | 3424 | * nr_running stays zero and need_more_worker() and keep_working() |
3436 | * brought back up. There shouldn't be any idle one left. | 3425 | * are always true as long as the worklist is not empty. @gcwq now |
3437 | * Tell the remaining busy ones to rebind once it finishes the | 3426 | * behaves as an unbound (in terms of concurrency management) gcwq |
3438 | * currently scheduled works by scheduling the rebind_work. | 3427 | * which is served by workers tied to the CPU. |
3428 | * | ||
3429 | * On return from this function, the current worker would trigger | ||
3430 | * unbound chain execution of pending work items if other workers | ||
3431 | * didn't already. | ||
3439 | */ | 3432 | */ |
3440 | WARN_ON(!list_empty(&gcwq->idle_list)); | 3433 | for_each_worker_pool(pool, gcwq) |
3441 | 3434 | atomic_set(get_pool_nr_running(pool), 0); | |
3442 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
3443 | struct work_struct *rebind_work = &worker->rebind_work; | ||
3444 | |||
3445 | /* | ||
3446 | * Rebind_work may race with future cpu hotplug | ||
3447 | * operations. Use a separate flag to mark that | ||
3448 | * rebinding is scheduled. | ||
3449 | */ | ||
3450 | worker->flags |= WORKER_REBIND; | ||
3451 | worker->flags &= ~WORKER_ROGUE; | ||
3452 | |||
3453 | /* queue rebind_work, wq doesn't matter, use the default one */ | ||
3454 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | ||
3455 | work_data_bits(rebind_work))) | ||
3456 | continue; | ||
3457 | |||
3458 | debug_work_activate(rebind_work); | ||
3459 | insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, | ||
3460 | worker->scheduled.next, | ||
3461 | work_color_to_flags(WORK_NO_COLOR)); | ||
3462 | } | ||
3463 | |||
3464 | /* relinquish manager role */ | ||
3465 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
3466 | |||
3467 | /* notify completion */ | ||
3468 | gcwq->trustee = NULL; | ||
3469 | gcwq->trustee_state = TRUSTEE_DONE; | ||
3470 | wake_up_all(&gcwq->trustee_wait); | ||
3471 | spin_unlock_irq(&gcwq->lock); | ||
3472 | return 0; | ||
3473 | } | 3435 | } |
3474 | 3436 | ||
3475 | /** | 3437 | /* |
3476 | * wait_trustee_state - wait for trustee to enter the specified state | 3438 | * Workqueues should be brought up before normal priority CPU notifiers. |
3477 | * @gcwq: gcwq the trustee of interest belongs to | 3439 | * This will be registered as a high priority CPU notifier. |
3478 | * @state: target state to wait for | ||
3479 | * | ||
3480 | * Wait for the trustee to reach @state. DONE is already matched. | ||
3481 | * | ||
3482 | * CONTEXT: | ||
3483 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3484 | * multiple times. To be used by cpu_callback. | ||
3485 | */ | 3440 | */ |
3486 | static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) | 3441 | static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, |
3487 | __releases(&gcwq->lock) | 3442 | unsigned long action, |
3488 | __acquires(&gcwq->lock) | 3443 | void *hcpu) |
3489 | { | ||
3490 | if (!(gcwq->trustee_state == state || | ||
3491 | gcwq->trustee_state == TRUSTEE_DONE)) { | ||
3492 | spin_unlock_irq(&gcwq->lock); | ||
3493 | __wait_event(gcwq->trustee_wait, | ||
3494 | gcwq->trustee_state == state || | ||
3495 | gcwq->trustee_state == TRUSTEE_DONE); | ||
3496 | spin_lock_irq(&gcwq->lock); | ||
3497 | } | ||
3498 | } | ||
3499 | |||
3500 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | ||
3501 | unsigned long action, | ||
3502 | void *hcpu) | ||
3503 | { | 3444 | { |
3504 | unsigned int cpu = (unsigned long)hcpu; | 3445 | unsigned int cpu = (unsigned long)hcpu; |
3505 | struct global_cwq *gcwq = get_gcwq(cpu); | 3446 | struct global_cwq *gcwq = get_gcwq(cpu); |
3506 | struct task_struct *new_trustee = NULL; | 3447 | struct worker_pool *pool; |
3507 | struct worker *uninitialized_var(new_worker); | ||
3508 | unsigned long flags; | ||
3509 | |||
3510 | action &= ~CPU_TASKS_FROZEN; | ||
3511 | 3448 | ||
3512 | switch (action) { | 3449 | switch (action & ~CPU_TASKS_FROZEN) { |
3513 | case CPU_DOWN_PREPARE: | ||
3514 | new_trustee = kthread_create(trustee_thread, gcwq, | ||
3515 | "workqueue_trustee/%d\n", cpu); | ||
3516 | if (IS_ERR(new_trustee)) | ||
3517 | return notifier_from_errno(PTR_ERR(new_trustee)); | ||
3518 | kthread_bind(new_trustee, cpu); | ||
3519 | /* fall through */ | ||
3520 | case CPU_UP_PREPARE: | 3450 | case CPU_UP_PREPARE: |
3521 | BUG_ON(gcwq->first_idle); | 3451 | for_each_worker_pool(pool, gcwq) { |
3522 | new_worker = create_worker(gcwq, false); | 3452 | struct worker *worker; |
3523 | if (!new_worker) { | ||
3524 | if (new_trustee) | ||
3525 | kthread_stop(new_trustee); | ||
3526 | return NOTIFY_BAD; | ||
3527 | } | ||
3528 | } | ||
3529 | |||
3530 | /* some are called w/ irq disabled, don't disturb irq status */ | ||
3531 | spin_lock_irqsave(&gcwq->lock, flags); | ||
3532 | 3453 | ||
3533 | switch (action) { | 3454 | if (pool->nr_workers) |
3534 | case CPU_DOWN_PREPARE: | 3455 | continue; |
3535 | /* initialize trustee and tell it to acquire the gcwq */ | ||
3536 | BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); | ||
3537 | gcwq->trustee = new_trustee; | ||
3538 | gcwq->trustee_state = TRUSTEE_START; | ||
3539 | wake_up_process(gcwq->trustee); | ||
3540 | wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); | ||
3541 | /* fall through */ | ||
3542 | case CPU_UP_PREPARE: | ||
3543 | BUG_ON(gcwq->first_idle); | ||
3544 | gcwq->first_idle = new_worker; | ||
3545 | break; | ||
3546 | 3456 | ||
3547 | case CPU_DYING: | 3457 | worker = create_worker(pool); |
3548 | /* | 3458 | if (!worker) |
3549 | * Before this, the trustee and all workers except for | 3459 | return NOTIFY_BAD; |
3550 | * the ones which are still executing works from | ||
3551 | * before the last CPU down must be on the cpu. After | ||
3552 | * this, they'll all be diasporas. | ||
3553 | */ | ||
3554 | gcwq->flags |= GCWQ_DISASSOCIATED; | ||
3555 | break; | ||
3556 | 3460 | ||
3557 | case CPU_POST_DEAD: | 3461 | spin_lock_irq(&gcwq->lock); |
3558 | gcwq->trustee_state = TRUSTEE_BUTCHER; | 3462 | start_worker(worker); |
3559 | /* fall through */ | 3463 | spin_unlock_irq(&gcwq->lock); |
3560 | case CPU_UP_CANCELED: | 3464 | } |
3561 | destroy_worker(gcwq->first_idle); | ||
3562 | gcwq->first_idle = NULL; | ||
3563 | break; | 3465 | break; |
3564 | 3466 | ||
3565 | case CPU_DOWN_FAILED: | 3467 | case CPU_DOWN_FAILED: |
3566 | case CPU_ONLINE: | 3468 | case CPU_ONLINE: |
3469 | gcwq_claim_management_and_lock(gcwq); | ||
3567 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | 3470 | gcwq->flags &= ~GCWQ_DISASSOCIATED; |
3568 | if (gcwq->trustee_state != TRUSTEE_DONE) { | 3471 | rebind_workers(gcwq); |
3569 | gcwq->trustee_state = TRUSTEE_RELEASE; | 3472 | gcwq_release_management_and_unlock(gcwq); |
3570 | wake_up_process(gcwq->trustee); | ||
3571 | wait_trustee_state(gcwq, TRUSTEE_DONE); | ||
3572 | } | ||
3573 | |||
3574 | /* | ||
3575 | * Trustee is done and there might be no worker left. | ||
3576 | * Put the first_idle in and request a real manager to | ||
3577 | * take a look. | ||
3578 | */ | ||
3579 | spin_unlock_irq(&gcwq->lock); | ||
3580 | kthread_bind(gcwq->first_idle->task, cpu); | ||
3581 | spin_lock_irq(&gcwq->lock); | ||
3582 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | ||
3583 | start_worker(gcwq->first_idle); | ||
3584 | gcwq->first_idle = NULL; | ||
3585 | break; | 3473 | break; |
3586 | } | 3474 | } |
3475 | return NOTIFY_OK; | ||
3476 | } | ||
3587 | 3477 | ||
3588 | spin_unlock_irqrestore(&gcwq->lock, flags); | 3478 | /* |
3479 | * Workqueues should be brought down after normal priority CPU notifiers. | ||
3480 | * This will be registered as a low priority CPU notifier. | ||
3481 | */ | ||
3482 | static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, | ||
3483 | unsigned long action, | ||
3484 | void *hcpu) | ||
3485 | { | ||
3486 | unsigned int cpu = (unsigned long)hcpu; | ||
3487 | struct work_struct unbind_work; | ||
3589 | 3488 | ||
3590 | return notifier_from_errno(0); | 3489 | switch (action & ~CPU_TASKS_FROZEN) { |
3490 | case CPU_DOWN_PREPARE: | ||
3491 | /* unbinding should happen on the local CPU */ | ||
3492 | INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); | ||
3493 | schedule_work_on(cpu, &unbind_work); | ||
3494 | flush_work(&unbind_work); | ||
3495 | break; | ||
3496 | } | ||
3497 | return NOTIFY_OK; | ||
3591 | } | 3498 | } |
3592 | 3499 | ||
3593 | #ifdef CONFIG_SMP | 3500 | #ifdef CONFIG_SMP |
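The hunk above replaces the old trustee state machine with two plain CPU hotplug notifiers: online handling stays in a high-priority callback, while CPU_DOWN_PREPARE is handled by a separate low-priority callback that runs an on-stack unbind work item on the outgoing CPU and waits for it. Below is a minimal, self-contained sketch of that split-notifier pattern; my_cpu_up_callback(), my_cpu_down_callback(), my_unbind_fn() and the MY_PRI_* priorities are hypothetical stand-ins, not the kernel's actual symbols.

/*
 * Sketch of the split CPU-notifier pattern shown in the hunk above.
 * All my_* symbols and the MY_PRI_* values are made up for this example;
 * only the overall structure mirrors the new workqueue code.
 */
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/workqueue.h>

#define MY_PRI_UP	 5	/* run early for CPU_UP_PREPARE/CPU_ONLINE */
#define MY_PRI_DOWN	-5	/* run after normal-priority notifiers on down */

static void my_unbind_fn(struct work_struct *work)
{
	/* executes on the CPU that is going down; detach per-CPU state here */
}

static int my_cpu_up_callback(struct notifier_block *nfb,
			      unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
	case CPU_DOWN_FAILED:
	case CPU_ONLINE:
		/* (re)create or rebind per-CPU resources for the CPU in hcpu */
		break;
	}
	return NOTIFY_OK;
}

static int my_cpu_down_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct work_struct unbind_work;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/* run the unbinding on the CPU that is about to leave */
		INIT_WORK_ONSTACK(&unbind_work, my_unbind_fn);
		schedule_work_on(cpu, &unbind_work);
		flush_work(&unbind_work);
		break;
	}
	return NOTIFY_OK;
}

static int __init my_hotplug_notifiers_init(void)
{
	cpu_notifier(my_cpu_up_callback, MY_PRI_UP);
	cpu_notifier(my_cpu_down_callback, MY_PRI_DOWN);
	return 0;
}

Queuing the unbind work with schedule_work_on() guarantees it executes on the CPU that is being taken down, which is why the callback flushes it before returning.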
@@ -3746,6 +3653,7 @@ void thaw_workqueues(void) | |||
3746 | 3653 | ||
3747 | for_each_gcwq_cpu(cpu) { | 3654 | for_each_gcwq_cpu(cpu) { |
3748 | struct global_cwq *gcwq = get_gcwq(cpu); | 3655 | struct global_cwq *gcwq = get_gcwq(cpu); |
3656 | struct worker_pool *pool; | ||
3749 | struct workqueue_struct *wq; | 3657 | struct workqueue_struct *wq; |
3750 | 3658 | ||
3751 | spin_lock_irq(&gcwq->lock); | 3659 | spin_lock_irq(&gcwq->lock); |
@@ -3767,7 +3675,8 @@ void thaw_workqueues(void) | |||
3767 | cwq_activate_first_delayed(cwq); | 3675 | cwq_activate_first_delayed(cwq); |
3768 | } | 3676 | } |
3769 | 3677 | ||
3770 | wake_up_worker(gcwq); | 3678 | for_each_worker_pool(pool, gcwq) |
3679 | wake_up_worker(pool); | ||
3771 | 3680 | ||
3772 | spin_unlock_irq(&gcwq->lock); | 3681 | spin_unlock_irq(&gcwq->lock); |
3773 | } | 3682 | } |
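In the thaw path above, waking workers is now done per pool via for_each_worker_pool() instead of once per gcwq. A rough sketch of how a gcwq could carry its pools and how such an iterator might be defined follows; the value of NR_WORKER_POOLS (2: one normal and one highpri pool) and every field besides pools[] are assumptions for illustration, not copied from the patch.

/*
 * Sketch only: an assumed gcwq layout with a fixed array of worker pools
 * and an iterator in the spirit of for_each_worker_pool().
 */
#include <linux/list.h>
#include <linux/spinlock.h>

#define NR_WORKER_POOLS	2	/* assumed: one normal + one highpri pool */

struct global_cwq;

struct worker_pool {
	struct global_cwq	*gcwq;		/* owning gcwq */
	struct list_head	worklist;	/* pending work items */
	struct list_head	idle_list;	/* idle workers */
	/* idle/mayday timers, manager mutex, worker ida, ... */
};

struct global_cwq {
	spinlock_t		lock;
	unsigned int		cpu;
	struct worker_pool	pools[NR_WORKER_POOLS];
	/* busy hash, flags, rebind_hold, ... */
};

#define for_each_worker_pool(pool, gcwq)				\
	for ((pool) = &(gcwq)->pools[0];				\
	     (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)

With a fixed-size array like this, the iterator is plain pointer arithmetic over pools[], so loops such as the wake-up above add no allocation or locking of their own.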
@@ -3783,46 +3692,57 @@ static int __init init_workqueues(void) | |||
3783 | unsigned int cpu; | 3692 | unsigned int cpu; |
3784 | int i; | 3693 | int i; |
3785 | 3694 | ||
3786 | cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); | 3695 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); |
3696 | cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); | ||
3787 | 3697 | ||
3788 | /* initialize gcwqs */ | 3698 | /* initialize gcwqs */ |
3789 | for_each_gcwq_cpu(cpu) { | 3699 | for_each_gcwq_cpu(cpu) { |
3790 | struct global_cwq *gcwq = get_gcwq(cpu); | 3700 | struct global_cwq *gcwq = get_gcwq(cpu); |
3701 | struct worker_pool *pool; | ||
3791 | 3702 | ||
3792 | spin_lock_init(&gcwq->lock); | 3703 | spin_lock_init(&gcwq->lock); |
3793 | INIT_LIST_HEAD(&gcwq->worklist); | ||
3794 | gcwq->cpu = cpu; | 3704 | gcwq->cpu = cpu; |
3795 | gcwq->flags |= GCWQ_DISASSOCIATED; | 3705 | gcwq->flags |= GCWQ_DISASSOCIATED; |
3796 | 3706 | ||
3797 | INIT_LIST_HEAD(&gcwq->idle_list); | ||
3798 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) | 3707 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) |
3799 | INIT_HLIST_HEAD(&gcwq->busy_hash[i]); | 3708 | INIT_HLIST_HEAD(&gcwq->busy_hash[i]); |
3800 | 3709 | ||
3801 | init_timer_deferrable(&gcwq->idle_timer); | 3710 | for_each_worker_pool(pool, gcwq) { |
3802 | gcwq->idle_timer.function = idle_worker_timeout; | 3711 | pool->gcwq = gcwq; |
3803 | gcwq->idle_timer.data = (unsigned long)gcwq; | 3712 | INIT_LIST_HEAD(&pool->worklist); |
3713 | INIT_LIST_HEAD(&pool->idle_list); | ||
3714 | |||
3715 | init_timer_deferrable(&pool->idle_timer); | ||
3716 | pool->idle_timer.function = idle_worker_timeout; | ||
3717 | pool->idle_timer.data = (unsigned long)pool; | ||
3804 | 3718 | ||
3805 | setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, | 3719 | setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, |
3806 | (unsigned long)gcwq); | 3720 | (unsigned long)pool); |
3807 | 3721 | ||
3808 | ida_init(&gcwq->worker_ida); | 3722 | mutex_init(&pool->manager_mutex); |
3723 | ida_init(&pool->worker_ida); | ||
3724 | } | ||
3809 | 3725 | ||
3810 | gcwq->trustee_state = TRUSTEE_DONE; | 3726 | init_waitqueue_head(&gcwq->rebind_hold); |
3811 | init_waitqueue_head(&gcwq->trustee_wait); | ||
3812 | } | 3727 | } |
3813 | 3728 | ||
3814 | /* create the initial worker */ | 3729 | /* create the initial worker */ |
3815 | for_each_online_gcwq_cpu(cpu) { | 3730 | for_each_online_gcwq_cpu(cpu) { |
3816 | struct global_cwq *gcwq = get_gcwq(cpu); | 3731 | struct global_cwq *gcwq = get_gcwq(cpu); |
3817 | struct worker *worker; | 3732 | struct worker_pool *pool; |
3818 | 3733 | ||
3819 | if (cpu != WORK_CPU_UNBOUND) | 3734 | if (cpu != WORK_CPU_UNBOUND) |
3820 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | 3735 | gcwq->flags &= ~GCWQ_DISASSOCIATED; |
3821 | worker = create_worker(gcwq, true); | 3736 | |
3822 | BUG_ON(!worker); | 3737 | for_each_worker_pool(pool, gcwq) { |
3823 | spin_lock_irq(&gcwq->lock); | 3738 | struct worker *worker; |
3824 | start_worker(worker); | 3739 | |
3825 | spin_unlock_irq(&gcwq->lock); | 3740 | worker = create_worker(pool); |
3741 | BUG_ON(!worker); | ||
3742 | spin_lock_irq(&gcwq->lock); | ||
3743 | start_worker(worker); | ||
3744 | spin_unlock_irq(&gcwq->lock); | ||
3745 | } | ||
3826 | } | 3746 | } |
3827 | 3747 | ||
3828 | system_wq = alloc_workqueue("events", 0, 0); | 3748 | system_wq = alloc_workqueue("events", 0, 0); |
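The initialization above ends by creating the system workqueues with alloc_workqueue(). For comparison, a driver that wants its work items served by the dedicated highpri worker pool introduced by this series would pass WQ_HIGHPRI at allocation time; the queue name, work function and init routine below are illustrative only.

/*
 * Illustrative use of alloc_workqueue() with WQ_HIGHPRI from driver code;
 * "my_hipri", my_work_fn() and my_driver_init() are made-up names.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_hipri_wq;

static void my_work_fn(struct work_struct *work)
{
	/* served by a worker from the highpri pool of the queueing CPU */
}

static DECLARE_WORK(my_work, my_work_fn);

static int __init my_driver_init(void)
{
	/* max_active of 0 selects the default per-CPU limit */
	my_hipri_wq = alloc_workqueue("my_hipri", WQ_HIGHPRI, 0);
	if (!my_hipri_wq)
		return -ENOMEM;

	queue_work(my_hipri_wq, &my_work);
	return 0;
}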