aboutsummaryrefslogtreecommitdiffstats
path: root/mm/backing-dev.c
diff options
context:
space:
mode:
author Tejun Heo <tj@kernel.org> 2013-04-01 22:08:06 -0400
committer Tejun Heo <tj@kernel.org> 2013-04-01 22:08:06 -0400
commit 839a8e8660b6777e7fe4e80af1a048aebe2b5977 (patch)
tree 80398cd4dd8ebc4c51be20725c0cc427bfe321b3 /mm/backing-dev.c
parent 181387da2d64c3129e5b5186c4dd388bc5041d53 (diff)
writeback: replace custom worker pool implementation with unbound workqueue
Writeback implements its own worker pool - each bdi can be associated with a worker thread which is created and destroyed dynamically. The worker thread for the default bdi is always present and serves as the "forker" thread which forks off worker threads for other bdis. There's no reason for writeback to implement its own worker pool when using an unbound workqueue instead is much simpler and more efficient. This patch replaces the custom worker pool implementation in writeback with an unbound workqueue. The conversion isn't too complicated but the following points are worth mentioning. * bdi_writeback->last_active, task and wakeup_timer are removed. delayed_work ->dwork is added instead. Explicit timer handling is no longer necessary. Everything works by either queueing / modding / flushing / canceling the delayed_work item. * bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off bdi_writeback->dwork. On each execution, it processes bdi->work_list and reschedules itself if there are more things to do. The function also handles the low-mem condition, which used to be handled by the forker thread. If the function is running off a rescuer thread, it only writes out a limited number of pages so that the rescuer can serve other bdis too. This preserves the flusher creation failure behavior of the forker thread. * INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn() about on-going bdi unregistration so that it always drains work_list even if it's running off the rescuer. Note that the original code was broken in this regard. Under memory pressure, a bdi could finish unregistration with a non-empty work_list. * The default bdi is no longer special. It is now treated the same as any other bdi and bdi_cap_flush_forker() is removed. * BDI_pending is no longer used. Removed. * Some tracepoints become non-applicable. 
The following TPs are removed - writeback_nothread, writeback_wake_thread, writeback_wake_forker_thread, writeback_thread_start, writeback_thread_stop. Everything, including devices coming and going away and rescuer operation under simulated memory pressure, seems to work fine in my test setup. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Jan Kara <jack@suse.cz> Cc: Jens Axboe <axboe@kernel.dk> Cc: Fengguang Wu <fengguang.wu@intel.com> Cc: Jeff Moyer <jmoyer@redhat.com>
Diffstat (limited to 'mm/backing-dev.c')
-rw-r--r--mm/backing-dev.c255
1 files changed, 28 insertions, 227 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 657569b3fcf6..2857d4f6bca4 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -37,6 +37,9 @@ static struct class *bdi_class;
37DEFINE_SPINLOCK(bdi_lock); 37DEFINE_SPINLOCK(bdi_lock);
38LIST_HEAD(bdi_list); 38LIST_HEAD(bdi_list);
39 39
40/* bdi_wq serves all asynchronous writeback tasks */
41struct workqueue_struct *bdi_wq;
42
40void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) 43void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
41{ 44{
42 if (wb1 < wb2) { 45 if (wb1 < wb2) {
@@ -255,6 +258,11 @@ static int __init default_bdi_init(void)
255{ 258{
256 int err; 259 int err;
257 260
261 bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
262 WQ_UNBOUND, 0);
263 if (!bdi_wq)
264 return -ENOMEM;
265
258 err = bdi_init(&default_backing_dev_info); 266 err = bdi_init(&default_backing_dev_info);
259 if (!err) 267 if (!err)
260 bdi_register(&default_backing_dev_info, NULL, "default"); 268 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -269,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
269 return wb_has_dirty_io(&bdi->wb); 277 return wb_has_dirty_io(&bdi->wb);
270} 278}
271 279
272static void wakeup_timer_fn(unsigned long data)
273{
274 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
275
276 spin_lock_bh(&bdi->wb_lock);
277 if (bdi->wb.task) {
278 trace_writeback_wake_thread(bdi);
279 wake_up_process(bdi->wb.task);
280 } else if (bdi->dev) {
281 /*
282 * When bdi tasks are inactive for long time, they are killed.
283 * In this case we have to wake-up the forker thread which
284 * should create and run the bdi thread.
285 */
286 trace_writeback_wake_forker_thread(bdi);
287 wake_up_process(default_backing_dev_info.wb.task);
288 }
289 spin_unlock_bh(&bdi->wb_lock);
290}
291
292/* 280/*
293 * This function is used when the first inode for this bdi is marked dirty. It 281 * This function is used when the first inode for this bdi is marked dirty. It
294 * wakes-up the corresponding bdi thread which should then take care of the 282 * wakes-up the corresponding bdi thread which should then take care of the
@@ -305,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
305 unsigned long timeout; 293 unsigned long timeout;
306 294
307 timeout = msecs_to_jiffies(dirty_writeback_interval * 10); 295 timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
308 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); 296 mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
309}
310
311/*
312 * Calculate the longest interval (jiffies) bdi threads are allowed to be
313 * inactive.
314 */
315static unsigned long bdi_longest_inactive(void)
316{
317 unsigned long interval;
318
319 interval = msecs_to_jiffies(dirty_writeback_interval * 10);
320 return max(5UL * 60 * HZ, interval);
321}
322
323/*
324 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
325 * shutdown
326 */
327static void bdi_clear_pending(struct backing_dev_info *bdi)
328{
329 clear_bit(BDI_pending, &bdi->state);
330 smp_mb__after_clear_bit();
331 wake_up_bit(&bdi->state, BDI_pending);
332}
333
334static int bdi_forker_thread(void *ptr)
335{
336 struct bdi_writeback *me = ptr;
337
338 current->flags |= PF_SWAPWRITE;
339 set_freezable();
340
341 /*
342 * Our parent may run at a different priority, just set us to normal
343 */
344 set_user_nice(current, 0);
345
346 for (;;) {
347 struct task_struct *task = NULL;
348 struct backing_dev_info *bdi;
349 enum {
350 NO_ACTION, /* Nothing to do */
351 FORK_THREAD, /* Fork bdi thread */
352 KILL_THREAD, /* Kill inactive bdi thread */
353 } action = NO_ACTION;
354
355 /*
356 * Temporary measure, we want to make sure we don't see
357 * dirty data on the default backing_dev_info
358 */
359 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
360 del_timer(&me->wakeup_timer);
361 wb_do_writeback(me, 0);
362 }
363
364 spin_lock_bh(&bdi_lock);
365 /*
366 * In the following loop we are going to check whether we have
367 * some work to do without any synchronization with tasks
368 * waking us up to do work for them. Set the task state here
369 * so that we don't miss wakeups after verifying conditions.
370 */
371 set_current_state(TASK_INTERRUPTIBLE);
372
373 list_for_each_entry(bdi, &bdi_list, bdi_list) {
374 bool have_dirty_io;
375
376 if (!bdi_cap_writeback_dirty(bdi) ||
377 bdi_cap_flush_forker(bdi))
378 continue;
379
380 WARN(!test_bit(BDI_registered, &bdi->state),
381 "bdi %p/%s is not registered!\n", bdi, bdi->name);
382
383 have_dirty_io = !list_empty(&bdi->work_list) ||
384 wb_has_dirty_io(&bdi->wb);
385
386 /*
387 * If the bdi has work to do, but the thread does not
388 * exist - create it.
389 */
390 if (!bdi->wb.task && have_dirty_io) {
391 /*
392 * Set the pending bit - if someone will try to
393 * unregister this bdi - it'll wait on this bit.
394 */
395 set_bit(BDI_pending, &bdi->state);
396 action = FORK_THREAD;
397 break;
398 }
399
400 spin_lock(&bdi->wb_lock);
401
402 /*
403 * If there is no work to do and the bdi thread was
404 * inactive long enough - kill it. The wb_lock is taken
405 * to make sure no-one adds more work to this bdi and
406 * wakes the bdi thread up.
407 */
408 if (bdi->wb.task && !have_dirty_io &&
409 time_after(jiffies, bdi->wb.last_active +
410 bdi_longest_inactive())) {
411 task = bdi->wb.task;
412 bdi->wb.task = NULL;
413 spin_unlock(&bdi->wb_lock);
414 set_bit(BDI_pending, &bdi->state);
415 action = KILL_THREAD;
416 break;
417 }
418 spin_unlock(&bdi->wb_lock);
419 }
420 spin_unlock_bh(&bdi_lock);
421
422 /* Keep working if default bdi still has things to do */
423 if (!list_empty(&me->bdi->work_list))
424 __set_current_state(TASK_RUNNING);
425
426 switch (action) {
427 case FORK_THREAD:
428 __set_current_state(TASK_RUNNING);
429 task = kthread_create(bdi_writeback_thread, &bdi->wb,
430 "flush-%s", dev_name(bdi->dev));
431 if (IS_ERR(task)) {
432 /*
433 * If thread creation fails, force writeout of
434 * the bdi from the thread. Hopefully 1024 is
435 * large enough for efficient IO.
436 */
437 writeback_inodes_wb(&bdi->wb, 1024,
438 WB_REASON_FORKER_THREAD);
439 } else {
440 /*
441 * The spinlock makes sure we do not lose
442 * wake-ups when racing with 'bdi_queue_work()'.
443 * And as soon as the bdi thread is visible, we
444 * can start it.
445 */
446 spin_lock_bh(&bdi->wb_lock);
447 bdi->wb.task = task;
448 spin_unlock_bh(&bdi->wb_lock);
449 wake_up_process(task);
450 }
451 bdi_clear_pending(bdi);
452 break;
453
454 case KILL_THREAD:
455 __set_current_state(TASK_RUNNING);
456 kthread_stop(task);
457 bdi_clear_pending(bdi);
458 break;
459
460 case NO_ACTION:
461 if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
462 /*
463 * There are no dirty data. The only thing we
464 * should now care about is checking for
465 * inactive bdi threads and killing them. Thus,
466 * let's sleep for longer time, save energy and
467 * be friendly for battery-driven devices.
468 */
469 schedule_timeout(bdi_longest_inactive());
470 else
471 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
472 try_to_freeze();
473 break;
474 }
475 }
476
477 return 0;
478} 297}
479 298
480/* 299/*
@@ -487,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
487 spin_unlock_bh(&bdi_lock); 306 spin_unlock_bh(&bdi_lock);
488 307
489 synchronize_rcu_expedited(); 308 synchronize_rcu_expedited();
309
310 /* bdi_list is now unused, clear it to mark @bdi dying */
311 INIT_LIST_HEAD(&bdi->bdi_list);
490} 312}
491 313
492int bdi_register(struct backing_dev_info *bdi, struct device *parent, 314int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -506,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
506 328
507 bdi->dev = dev; 329 bdi->dev = dev;
508 330
509 /*
510 * Just start the forker thread for our default backing_dev_info,
511 * and add other bdi's to the list. They will get a thread created
512 * on-demand when they need it.
513 */
514 if (bdi_cap_flush_forker(bdi)) {
515 struct bdi_writeback *wb = &bdi->wb;
516
517 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
518 dev_name(dev));
519 if (IS_ERR(wb->task))
520 return PTR_ERR(wb->task);
521 }
522
523 bdi_debug_register(bdi, dev_name(dev)); 331 bdi_debug_register(bdi, dev_name(dev));
524 set_bit(BDI_registered, &bdi->state); 332 set_bit(BDI_registered, &bdi->state);
525 333
@@ -543,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
543 */ 351 */
544static void bdi_wb_shutdown(struct backing_dev_info *bdi) 352static void bdi_wb_shutdown(struct backing_dev_info *bdi)
545{ 353{
546 struct task_struct *task;
547
548 if (!bdi_cap_writeback_dirty(bdi)) 354 if (!bdi_cap_writeback_dirty(bdi))
549 return; 355 return;
550 356
@@ -554,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
554 bdi_remove_from_list(bdi); 360 bdi_remove_from_list(bdi);
555 361
556 /* 362 /*
557 * If setup is pending, wait for that to complete first 363 * Drain work list and shutdown the delayed_work. At this point,
364 * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
365 * is dying and its work_list needs to be drained no matter what.
558 */ 366 */
559 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, 367 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
560 TASK_UNINTERRUPTIBLE); 368 flush_delayed_work(&bdi->wb.dwork);
369 WARN_ON(!list_empty(&bdi->work_list));
561 370
562 /* 371 /*
563 * Finally, kill the kernel thread. We don't need to be RCU 372 * This shouldn't be necessary unless @bdi for some reason has
564 * safe anymore, since the bdi is gone from visibility. 373 * unflushed dirty IO after work_list is drained. Do it anyway
374 * just in case.
565 */ 375 */
566 spin_lock_bh(&bdi->wb_lock); 376 cancel_delayed_work_sync(&bdi->wb.dwork);
567 task = bdi->wb.task;
568 bdi->wb.task = NULL;
569 spin_unlock_bh(&bdi->wb_lock);
570
571 if (task)
572 kthread_stop(task);
573} 377}
574 378
575/* 379/*
@@ -595,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
595 bdi_set_min_ratio(bdi, 0); 399 bdi_set_min_ratio(bdi, 0);
596 trace_writeback_bdi_unregister(bdi); 400 trace_writeback_bdi_unregister(bdi);
597 bdi_prune_sb(bdi); 401 bdi_prune_sb(bdi);
598 del_timer_sync(&bdi->wb.wakeup_timer);
599 402
600 if (!bdi_cap_flush_forker(bdi)) 403 bdi_wb_shutdown(bdi);
601 bdi_wb_shutdown(bdi);
602 bdi_debug_unregister(bdi); 404 bdi_debug_unregister(bdi);
603 405
604 spin_lock_bh(&bdi->wb_lock); 406 spin_lock_bh(&bdi->wb_lock);
@@ -620,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
620 INIT_LIST_HEAD(&wb->b_io); 422 INIT_LIST_HEAD(&wb->b_io);
621 INIT_LIST_HEAD(&wb->b_more_io); 423 INIT_LIST_HEAD(&wb->b_more_io);
622 spin_lock_init(&wb->list_lock); 424 spin_lock_init(&wb->list_lock);
623 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 425 INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
624} 426}
625 427
626/* 428/*
@@ -693,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
693 bdi_unregister(bdi); 495 bdi_unregister(bdi);
694 496
695 /* 497 /*
696 * If bdi_unregister() had already been called earlier, the 498 * If bdi_unregister() had already been called earlier, the dwork
697 * wakeup_timer could still be armed because bdi_prune_sb() 499 * could still be pending because bdi_prune_sb() can race with the
698 * can race with the bdi_wakeup_thread_delayed() calls from 500 * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
699 * __mark_inode_dirty().
700 */ 501 */
701 del_timer_sync(&bdi->wb.wakeup_timer); 502 cancel_delayed_work_sync(&bdi->wb.dwork);
702 503
703 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 504 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
704 percpu_counter_destroy(&bdi->bdi_stat[i]); 505 percpu_counter_destroy(&bdi->bdi_stat[i]);