diff options
author | Tejun Heo <tj@kernel.org> | 2013-04-01 22:08:06 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2013-04-01 22:08:06 -0400 |
commit | 839a8e8660b6777e7fe4e80af1a048aebe2b5977 (patch) | |
tree | 80398cd4dd8ebc4c51be20725c0cc427bfe321b3 /mm | |
parent | 181387da2d64c3129e5b5186c4dd388bc5041d53 (diff) |
writeback: replace custom worker pool implementation with unbound workqueue
Writeback implements its own worker pool - each bdi can be associated
with a worker thread which is created and destroyed dynamically. The
worker thread for the default bdi is always present and serves as the
"forker" thread which forks off worker threads for other bdis.
There's no reason for writeback to implement its own worker pool when
using unbound workqueue instead is much simpler and more efficient.
This patch replaces custom worker pool implementation in writeback
with an unbound workqueue.
The conversion isn't too complicated, but the following points are worth
mentioning.
* bdi_writeback->last_active, task and wakeup_timer are removed.
delayed_work ->dwork is added instead. Explicit timer handling is
no longer necessary. Everything works by either queueing / modding
/ flushing / canceling the delayed_work item.
* bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off
bdi_writeback->dwork. On each execution, it processes
bdi->work_list and reschedules itself if there are more things to
do.
The function also handles low-mem condition, which used to be
handled by the forker thread. If the function is running off a
rescuer thread, it only writes out a limited number of pages so that
the rescuer can serve other bdis too. This preserves the flusher
creation failure behavior of the forker thread.
* INIT_LIST_HEAD(&bdi->bdi_list) is used to tell
bdi_writeback_workfn() about on-going bdi unregistration so that it
always drains work_list even if it's running off the rescuer. Note
that the original code was broken in this regard. Under memory
pressure, a bdi could finish unregistration with non-empty
work_list.
* The default bdi is no longer special. It now is treated the same as
any other bdi and bdi_cap_flush_forker() is removed.
* BDI_pending is no longer used. Removed.
* Some tracepoints become non-applicable. The following TPs are
removed - writeback_nothread, writeback_wake_thread,
writeback_wake_forker_thread, writeback_thread_start,
writeback_thread_stop.
Everything, including devices coming and going away and rescuer
operation under simulated memory pressure, seems to work fine in my
test setup.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 255 |
1 files changed, 28 insertions, 227 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 657569b3fcf6..2857d4f6bca4 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -37,6 +37,9 @@ static struct class *bdi_class; | |||
37 | DEFINE_SPINLOCK(bdi_lock); | 37 | DEFINE_SPINLOCK(bdi_lock); |
38 | LIST_HEAD(bdi_list); | 38 | LIST_HEAD(bdi_list); |
39 | 39 | ||
40 | /* bdi_wq serves all asynchronous writeback tasks */ | ||
41 | struct workqueue_struct *bdi_wq; | ||
42 | |||
40 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | 43 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) |
41 | { | 44 | { |
42 | if (wb1 < wb2) { | 45 | if (wb1 < wb2) { |
@@ -255,6 +258,11 @@ static int __init default_bdi_init(void) | |||
255 | { | 258 | { |
256 | int err; | 259 | int err; |
257 | 260 | ||
261 | bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | | ||
262 | WQ_UNBOUND, 0); | ||
263 | if (!bdi_wq) | ||
264 | return -ENOMEM; | ||
265 | |||
258 | err = bdi_init(&default_backing_dev_info); | 266 | err = bdi_init(&default_backing_dev_info); |
259 | if (!err) | 267 | if (!err) |
260 | bdi_register(&default_backing_dev_info, NULL, "default"); | 268 | bdi_register(&default_backing_dev_info, NULL, "default"); |
@@ -269,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) | |||
269 | return wb_has_dirty_io(&bdi->wb); | 277 | return wb_has_dirty_io(&bdi->wb); |
270 | } | 278 | } |
271 | 279 | ||
272 | static void wakeup_timer_fn(unsigned long data) | ||
273 | { | ||
274 | struct backing_dev_info *bdi = (struct backing_dev_info *)data; | ||
275 | |||
276 | spin_lock_bh(&bdi->wb_lock); | ||
277 | if (bdi->wb.task) { | ||
278 | trace_writeback_wake_thread(bdi); | ||
279 | wake_up_process(bdi->wb.task); | ||
280 | } else if (bdi->dev) { | ||
281 | /* | ||
282 | * When bdi tasks are inactive for long time, they are killed. | ||
283 | * In this case we have to wake-up the forker thread which | ||
284 | * should create and run the bdi thread. | ||
285 | */ | ||
286 | trace_writeback_wake_forker_thread(bdi); | ||
287 | wake_up_process(default_backing_dev_info.wb.task); | ||
288 | } | ||
289 | spin_unlock_bh(&bdi->wb_lock); | ||
290 | } | ||
291 | |||
292 | /* | 280 | /* |
293 | * This function is used when the first inode for this bdi is marked dirty. It | 281 | * This function is used when the first inode for this bdi is marked dirty. It |
294 | * wakes-up the corresponding bdi thread which should then take care of the | 282 | * wakes-up the corresponding bdi thread which should then take care of the |
@@ -305,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) | |||
305 | unsigned long timeout; | 293 | unsigned long timeout; |
306 | 294 | ||
307 | timeout = msecs_to_jiffies(dirty_writeback_interval * 10); | 295 | timeout = msecs_to_jiffies(dirty_writeback_interval * 10); |
308 | mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); | 296 | mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); |
309 | } | ||
310 | |||
311 | /* | ||
312 | * Calculate the longest interval (jiffies) bdi threads are allowed to be | ||
313 | * inactive. | ||
314 | */ | ||
315 | static unsigned long bdi_longest_inactive(void) | ||
316 | { | ||
317 | unsigned long interval; | ||
318 | |||
319 | interval = msecs_to_jiffies(dirty_writeback_interval * 10); | ||
320 | return max(5UL * 60 * HZ, interval); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * Clear pending bit and wakeup anybody waiting for flusher thread creation or | ||
325 | * shutdown | ||
326 | */ | ||
327 | static void bdi_clear_pending(struct backing_dev_info *bdi) | ||
328 | { | ||
329 | clear_bit(BDI_pending, &bdi->state); | ||
330 | smp_mb__after_clear_bit(); | ||
331 | wake_up_bit(&bdi->state, BDI_pending); | ||
332 | } | ||
333 | |||
334 | static int bdi_forker_thread(void *ptr) | ||
335 | { | ||
336 | struct bdi_writeback *me = ptr; | ||
337 | |||
338 | current->flags |= PF_SWAPWRITE; | ||
339 | set_freezable(); | ||
340 | |||
341 | /* | ||
342 | * Our parent may run at a different priority, just set us to normal | ||
343 | */ | ||
344 | set_user_nice(current, 0); | ||
345 | |||
346 | for (;;) { | ||
347 | struct task_struct *task = NULL; | ||
348 | struct backing_dev_info *bdi; | ||
349 | enum { | ||
350 | NO_ACTION, /* Nothing to do */ | ||
351 | FORK_THREAD, /* Fork bdi thread */ | ||
352 | KILL_THREAD, /* Kill inactive bdi thread */ | ||
353 | } action = NO_ACTION; | ||
354 | |||
355 | /* | ||
356 | * Temporary measure, we want to make sure we don't see | ||
357 | * dirty data on the default backing_dev_info | ||
358 | */ | ||
359 | if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { | ||
360 | del_timer(&me->wakeup_timer); | ||
361 | wb_do_writeback(me, 0); | ||
362 | } | ||
363 | |||
364 | spin_lock_bh(&bdi_lock); | ||
365 | /* | ||
366 | * In the following loop we are going to check whether we have | ||
367 | * some work to do without any synchronization with tasks | ||
368 | * waking us up to do work for them. Set the task state here | ||
369 | * so that we don't miss wakeups after verifying conditions. | ||
370 | */ | ||
371 | set_current_state(TASK_INTERRUPTIBLE); | ||
372 | |||
373 | list_for_each_entry(bdi, &bdi_list, bdi_list) { | ||
374 | bool have_dirty_io; | ||
375 | |||
376 | if (!bdi_cap_writeback_dirty(bdi) || | ||
377 | bdi_cap_flush_forker(bdi)) | ||
378 | continue; | ||
379 | |||
380 | WARN(!test_bit(BDI_registered, &bdi->state), | ||
381 | "bdi %p/%s is not registered!\n", bdi, bdi->name); | ||
382 | |||
383 | have_dirty_io = !list_empty(&bdi->work_list) || | ||
384 | wb_has_dirty_io(&bdi->wb); | ||
385 | |||
386 | /* | ||
387 | * If the bdi has work to do, but the thread does not | ||
388 | * exist - create it. | ||
389 | */ | ||
390 | if (!bdi->wb.task && have_dirty_io) { | ||
391 | /* | ||
392 | * Set the pending bit - if someone will try to | ||
393 | * unregister this bdi - it'll wait on this bit. | ||
394 | */ | ||
395 | set_bit(BDI_pending, &bdi->state); | ||
396 | action = FORK_THREAD; | ||
397 | break; | ||
398 | } | ||
399 | |||
400 | spin_lock(&bdi->wb_lock); | ||
401 | |||
402 | /* | ||
403 | * If there is no work to do and the bdi thread was | ||
404 | * inactive long enough - kill it. The wb_lock is taken | ||
405 | * to make sure no-one adds more work to this bdi and | ||
406 | * wakes the bdi thread up. | ||
407 | */ | ||
408 | if (bdi->wb.task && !have_dirty_io && | ||
409 | time_after(jiffies, bdi->wb.last_active + | ||
410 | bdi_longest_inactive())) { | ||
411 | task = bdi->wb.task; | ||
412 | bdi->wb.task = NULL; | ||
413 | spin_unlock(&bdi->wb_lock); | ||
414 | set_bit(BDI_pending, &bdi->state); | ||
415 | action = KILL_THREAD; | ||
416 | break; | ||
417 | } | ||
418 | spin_unlock(&bdi->wb_lock); | ||
419 | } | ||
420 | spin_unlock_bh(&bdi_lock); | ||
421 | |||
422 | /* Keep working if default bdi still has things to do */ | ||
423 | if (!list_empty(&me->bdi->work_list)) | ||
424 | __set_current_state(TASK_RUNNING); | ||
425 | |||
426 | switch (action) { | ||
427 | case FORK_THREAD: | ||
428 | __set_current_state(TASK_RUNNING); | ||
429 | task = kthread_create(bdi_writeback_thread, &bdi->wb, | ||
430 | "flush-%s", dev_name(bdi->dev)); | ||
431 | if (IS_ERR(task)) { | ||
432 | /* | ||
433 | * If thread creation fails, force writeout of | ||
434 | * the bdi from the thread. Hopefully 1024 is | ||
435 | * large enough for efficient IO. | ||
436 | */ | ||
437 | writeback_inodes_wb(&bdi->wb, 1024, | ||
438 | WB_REASON_FORKER_THREAD); | ||
439 | } else { | ||
440 | /* | ||
441 | * The spinlock makes sure we do not lose | ||
442 | * wake-ups when racing with 'bdi_queue_work()'. | ||
443 | * And as soon as the bdi thread is visible, we | ||
444 | * can start it. | ||
445 | */ | ||
446 | spin_lock_bh(&bdi->wb_lock); | ||
447 | bdi->wb.task = task; | ||
448 | spin_unlock_bh(&bdi->wb_lock); | ||
449 | wake_up_process(task); | ||
450 | } | ||
451 | bdi_clear_pending(bdi); | ||
452 | break; | ||
453 | |||
454 | case KILL_THREAD: | ||
455 | __set_current_state(TASK_RUNNING); | ||
456 | kthread_stop(task); | ||
457 | bdi_clear_pending(bdi); | ||
458 | break; | ||
459 | |||
460 | case NO_ACTION: | ||
461 | if (!wb_has_dirty_io(me) || !dirty_writeback_interval) | ||
462 | /* | ||
463 | * There are no dirty data. The only thing we | ||
464 | * should now care about is checking for | ||
465 | * inactive bdi threads and killing them. Thus, | ||
466 | * let's sleep for longer time, save energy and | ||
467 | * be friendly for battery-driven devices. | ||
468 | */ | ||
469 | schedule_timeout(bdi_longest_inactive()); | ||
470 | else | ||
471 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); | ||
472 | try_to_freeze(); | ||
473 | break; | ||
474 | } | ||
475 | } | ||
476 | |||
477 | return 0; | ||
478 | } | 297 | } |
479 | 298 | ||
480 | /* | 299 | /* |
@@ -487,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) | |||
487 | spin_unlock_bh(&bdi_lock); | 306 | spin_unlock_bh(&bdi_lock); |
488 | 307 | ||
489 | synchronize_rcu_expedited(); | 308 | synchronize_rcu_expedited(); |
309 | |||
310 | /* bdi_list is now unused, clear it to mark @bdi dying */ | ||
311 | INIT_LIST_HEAD(&bdi->bdi_list); | ||
490 | } | 312 | } |
491 | 313 | ||
492 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 314 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, |
@@ -506,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, | |||
506 | 328 | ||
507 | bdi->dev = dev; | 329 | bdi->dev = dev; |
508 | 330 | ||
509 | /* | ||
510 | * Just start the forker thread for our default backing_dev_info, | ||
511 | * and add other bdi's to the list. They will get a thread created | ||
512 | * on-demand when they need it. | ||
513 | */ | ||
514 | if (bdi_cap_flush_forker(bdi)) { | ||
515 | struct bdi_writeback *wb = &bdi->wb; | ||
516 | |||
517 | wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", | ||
518 | dev_name(dev)); | ||
519 | if (IS_ERR(wb->task)) | ||
520 | return PTR_ERR(wb->task); | ||
521 | } | ||
522 | |||
523 | bdi_debug_register(bdi, dev_name(dev)); | 331 | bdi_debug_register(bdi, dev_name(dev)); |
524 | set_bit(BDI_registered, &bdi->state); | 332 | set_bit(BDI_registered, &bdi->state); |
525 | 333 | ||
@@ -543,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev); | |||
543 | */ | 351 | */ |
544 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) | 352 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) |
545 | { | 353 | { |
546 | struct task_struct *task; | ||
547 | |||
548 | if (!bdi_cap_writeback_dirty(bdi)) | 354 | if (!bdi_cap_writeback_dirty(bdi)) |
549 | return; | 355 | return; |
550 | 356 | ||
@@ -554,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) | |||
554 | bdi_remove_from_list(bdi); | 360 | bdi_remove_from_list(bdi); |
555 | 361 | ||
556 | /* | 362 | /* |
557 | * If setup is pending, wait for that to complete first | 363 | * Drain work list and shutdown the delayed_work. At this point, |
364 | * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi | ||
365 | * is dying and its work_list needs to be drained no matter what. | ||
558 | */ | 366 | */ |
559 | wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, | 367 | mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); |
560 | TASK_UNINTERRUPTIBLE); | 368 | flush_delayed_work(&bdi->wb.dwork); |
369 | WARN_ON(!list_empty(&bdi->work_list)); | ||
561 | 370 | ||
562 | /* | 371 | /* |
563 | * Finally, kill the kernel thread. We don't need to be RCU | 372 | * This shouldn't be necessary unless @bdi for some reason has |
564 | * safe anymore, since the bdi is gone from visibility. | 373 | * unflushed dirty IO after work_list is drained. Do it anyway |
374 | * just in case. | ||
565 | */ | 375 | */ |
566 | spin_lock_bh(&bdi->wb_lock); | 376 | cancel_delayed_work_sync(&bdi->wb.dwork); |
567 | task = bdi->wb.task; | ||
568 | bdi->wb.task = NULL; | ||
569 | spin_unlock_bh(&bdi->wb_lock); | ||
570 | |||
571 | if (task) | ||
572 | kthread_stop(task); | ||
573 | } | 377 | } |
574 | 378 | ||
575 | /* | 379 | /* |
@@ -595,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi) | |||
595 | bdi_set_min_ratio(bdi, 0); | 399 | bdi_set_min_ratio(bdi, 0); |
596 | trace_writeback_bdi_unregister(bdi); | 400 | trace_writeback_bdi_unregister(bdi); |
597 | bdi_prune_sb(bdi); | 401 | bdi_prune_sb(bdi); |
598 | del_timer_sync(&bdi->wb.wakeup_timer); | ||
599 | 402 | ||
600 | if (!bdi_cap_flush_forker(bdi)) | 403 | bdi_wb_shutdown(bdi); |
601 | bdi_wb_shutdown(bdi); | ||
602 | bdi_debug_unregister(bdi); | 404 | bdi_debug_unregister(bdi); |
603 | 405 | ||
604 | spin_lock_bh(&bdi->wb_lock); | 406 | spin_lock_bh(&bdi->wb_lock); |
@@ -620,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | |||
620 | INIT_LIST_HEAD(&wb->b_io); | 422 | INIT_LIST_HEAD(&wb->b_io); |
621 | INIT_LIST_HEAD(&wb->b_more_io); | 423 | INIT_LIST_HEAD(&wb->b_more_io); |
622 | spin_lock_init(&wb->list_lock); | 424 | spin_lock_init(&wb->list_lock); |
623 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); | 425 | INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); |
624 | } | 426 | } |
625 | 427 | ||
626 | /* | 428 | /* |
@@ -693,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
693 | bdi_unregister(bdi); | 495 | bdi_unregister(bdi); |
694 | 496 | ||
695 | /* | 497 | /* |
696 | * If bdi_unregister() had already been called earlier, the | 498 | * If bdi_unregister() had already been called earlier, the dwork |
697 | * wakeup_timer could still be armed because bdi_prune_sb() | 499 | * could still be pending because bdi_prune_sb() can race with the |
698 | * can race with the bdi_wakeup_thread_delayed() calls from | 500 | * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). |
699 | * __mark_inode_dirty(). | ||
700 | */ | 501 | */ |
701 | del_timer_sync(&bdi->wb.wakeup_timer); | 502 | cancel_delayed_work_sync(&bdi->wb.dwork); |
702 | 503 | ||
703 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 504 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
704 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 505 | percpu_counter_destroy(&bdi->bdi_stat[i]); |