author    Tejun Heo <tj@kernel.org>    2013-04-01 22:08:06 -0400
committer Tejun Heo <tj@kernel.org>    2013-04-01 22:08:06 -0400
commit    839a8e8660b6777e7fe4e80af1a048aebe2b5977
tree      80398cd4dd8ebc4c51be20725c0cc427bfe321b3  /fs/fs-writeback.c
parent    181387da2d64c3129e5b5186c4dd388bc5041d53
writeback: replace custom worker pool implementation with unbound workqueue
Writeback implements its own worker pool - each bdi can be associated
with a worker thread which is created and destroyed dynamically.  The
worker thread for the default bdi is always present and serves as the
"forker" thread which forks off worker threads for other bdis.

There's no reason for writeback to implement its own worker pool when
using an unbound workqueue instead is much simpler and more efficient.
This patch replaces the custom worker pool implementation in writeback
with an unbound workqueue.

The conversion isn't too complicated but the following points are worth
mentioning.

* bdi_writeback->last_active, task and wakeup_timer are removed.
  delayed_work ->dwork is added instead.  Explicit timer handling is no
  longer necessary.  Everything works by either queueing / modding /
  flushing / canceling the delayed_work item.

* bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off
  bdi_writeback->dwork.  On each execution, it processes
  bdi->work_list and reschedules itself if there are more things to do.

  The function also handles the low-mem condition, which used to be
  handled by the forker thread.  If the function is running off a
  rescuer thread, it only writes out a limited number of pages so that
  the rescuer can serve other bdis too.  This preserves the flusher
  creation failure behavior of the forker thread.

* INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn()
  about on-going bdi unregistration so that it always drains work_list
  even if it's running off the rescuer.  Note that the original code
  was broken in this regard.  Under memory pressure, a bdi could finish
  unregistration with a non-empty work_list.

* The default bdi is no longer special.  It now is treated the same as
  any other bdi and bdi_cap_flush_forker() is removed.

* BDI_pending is no longer used.  Removed.

* Some tracepoints become non-applicable.  The following TPs are
  removed - writeback_nothread, writeback_wake_thread,
  writeback_wake_forker_thread, writeback_thread_start,
  writeback_thread_stop.

Everything, including devices coming and going away and rescuer
operation under simulated memory pressure, seems to work fine in my
test setup.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
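What replaces the per-bdi threads is the stock workqueue machinery: a
single unbound workqueue (bdi_wq) serving a per-bdi delayed_work
(wb->dwork).  A minimal sketch of that pattern follows; the actual
allocation lives in mm/backing-dev.c, outside this diffstat, so the
flags and function names below are illustrative assumptions rather than
the patch's code.

/* Sketch only: bdi_wq allocation is in mm/backing-dev.c, not in
 * fs/fs-writeback.c; flags and function names here are assumptions. */
#include <linux/workqueue.h>
#include <linux/backing-dev.h>

struct workqueue_struct *bdi_wq;        /* serves all bdi flusher work */

static int __init bdi_wq_init_sketch(void)
{
        /* WQ_UNBOUND: workers are not bound to a CPU, matching the old
         * per-bdi flusher threads.  WQ_MEM_RECLAIM: the queue gets a
         * rescuer thread so writeback can make forward progress even
         * when new workers cannot be forked under memory pressure. */
        bdi_wq = alloc_workqueue("writeback",
                                 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
        return bdi_wq ? 0 : -ENOMEM;
}

static void bdi_wb_setup_sketch(struct bdi_writeback *wb)
{
        /* One delayed_work replaces wb->task, wb->wakeup_timer and
         * wb->last_active; bdi_writeback_workfn() is the work function
         * added by this patch. */
        INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}

With that in place, waking the flusher from this file reduces to
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0), which the hunks below
substitute for the old wake_up_process() calls.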
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--    fs/fs-writeback.c    102
1 file changed, 32 insertions(+), 70 deletions(-)
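Two different delayed_work calls show up in the diff that follows,
matching the commit message's "queueing / modding" wording:
mod_delayed_work() (re)arms the work with the given delay even if it is
already pending, so a delay of 0 pulls any pending timer forward and
runs the flusher immediately, whereas queue_delayed_work() is a no-op
when the work is already pending.  A small contrast sketch, with
hypothetical helper names that are not part of the patch:

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>

static void wb_kick_now(struct bdi_writeback *wb)
{
        /* New work was queued or background writeback was requested:
         * run the flusher as soon as possible, overriding any pending
         * periodic timer. */
        mod_delayed_work(bdi_wq, &wb->dwork, 0);
}

static void wb_rearm_periodic(struct bdi_writeback *wb)
{
        /* Re-arm kupdated-style flushing: only queues if dwork is not
         * already pending, so an earlier wakeup is never pushed back. */
        queue_delayed_work(bdi_wq, &wb->dwork,
                           msecs_to_jiffies(dirty_writeback_interval * 10));
}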
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 21f46fb3a101..8067d3719e94 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
-#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head)
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
 
-/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
-{
-        if (bdi->wb.task) {
-                wake_up_process(bdi->wb.task);
-        } else {
-                /*
-                 * The bdi thread isn't there, wake up the forker thread which
-                 * will create and run it.
-                 */
-                wake_up_process(default_backing_dev_info.wb.task);
-        }
-}
-
 static void bdi_queue_work(struct backing_dev_info *bdi,
                            struct wb_writeback_work *work)
 {
@@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 
         spin_lock_bh(&bdi->wb_lock);
         list_add_tail(&work->list, &bdi->work_list);
-        if (!bdi->wb.task)
-                trace_writeback_nothread(bdi, work);
-        bdi_wakeup_flusher(bdi);
         spin_unlock_bh(&bdi->wb_lock);
+
+        mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 static void
@@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
          */
         work = kzalloc(sizeof(*work), GFP_ATOMIC);
         if (!work) {
-                if (bdi->wb.task) {
-                        trace_writeback_nowork(bdi);
-                        wake_up_process(bdi->wb.task);
-                }
+                trace_writeback_nowork(bdi);
+                mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
                 return;
         }
 
@@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
          * writeback as soon as there is no other work to do.
          */
         trace_writeback_wake_background(bdi);
-        spin_lock_bh(&bdi->wb_lock);
-        bdi_wakeup_flusher(bdi);
-        spin_unlock_bh(&bdi->wb_lock);
+        mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 /*
@@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 /*
  * Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
+ * reschedules periodically and does kupdated style flushing.
  */
-int bdi_writeback_thread(void *data)
+void bdi_writeback_workfn(struct work_struct *work)
 {
-        struct bdi_writeback *wb = data;
+        struct bdi_writeback *wb = container_of(to_delayed_work(work),
+                                                struct bdi_writeback, dwork);
         struct backing_dev_info *bdi = wb->bdi;
         long pages_written;
 
         current->flags |= PF_SWAPWRITE;
-        set_freezable();
-        wb->last_active = jiffies;
-
-        /*
-         * Our parent may run at a different priority, just set us to normal
-         */
-        set_user_nice(current, 0);
-
-        trace_writeback_thread_start(bdi);
 
-        while (!kthread_freezable_should_stop(NULL)) {
+        if (likely(!current_is_workqueue_rescuer() ||
+                   list_empty(&bdi->bdi_list))) {
                 /*
-                 * Remove own delayed wake-up timer, since we are already awake
-                 * and we'll take care of the periodic write-back.
+                 * The normal path.  Keep writing back @bdi until its
+                 * work_list is empty.  Note that this path is also taken
+                 * if @bdi is shutting down even when we're running off the
+                 * rescuer as work_list needs to be drained.
                  */
-                del_timer(&wb->wakeup_timer);
-
-                pages_written = wb_do_writeback(wb, 0);
-
+                do {
+                        pages_written = wb_do_writeback(wb, 0);
+                        trace_writeback_pages_written(pages_written);
+                } while (!list_empty(&bdi->work_list));
+        } else {
+                /*
+                 * bdi_wq can't get enough workers and we're running off
+                 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
+                 * enough for efficient IO.
+                 */
+                pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+                                                    WB_REASON_FORKER_THREAD);
                 trace_writeback_pages_written(pages_written);
-
-                if (pages_written)
-                        wb->last_active = jiffies;
-
-                set_current_state(TASK_INTERRUPTIBLE);
-                if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
-                        __set_current_state(TASK_RUNNING);
-                        continue;
-                }
-
-                if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-                        schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-                else {
-                        /*
-                         * We have nothing to do, so can go sleep without any
-                         * timeout and save power. When a work is queued or
-                         * something is made dirty - we will be woken up.
-                         */
-                        schedule();
-                }
         }
 
-        /* Flush any work that raced with us exiting */
-        if (!list_empty(&bdi->work_list))
-                wb_do_writeback(wb, 1);
+        if (!list_empty(&bdi->work_list) ||
+            (wb_has_dirty_io(wb) && dirty_writeback_interval))
+                queue_delayed_work(bdi_wq, &wb->dwork,
+                        msecs_to_jiffies(dirty_writeback_interval * 10));
 
-        trace_writeback_thread_stop(bdi);
-        return 0;
+        current->flags &= ~PF_SWAPWRITE;
 }
 
-
 /*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.