author		Tejun Heo <tj@kernel.org>	2013-04-01 22:08:06 -0400
committer	Tejun Heo <tj@kernel.org>	2013-04-01 22:08:06 -0400
commit		839a8e8660b6777e7fe4e80af1a048aebe2b5977
tree		80398cd4dd8ebc4c51be20725c0cc427bfe321b3
parent		181387da2d64c3129e5b5186c4dd388bc5041d53
writeback: replace custom worker pool implementation with unbound workqueue
Writeback implements its own worker pool - each bdi can be associated
with a worker thread which is created and destroyed dynamically. The
worker thread for the default bdi is always present and serves as the
"forker" thread which forks off worker threads for other bdis.

There's no reason for writeback to implement its own worker pool when
using an unbound workqueue instead is much simpler and more efficient.
This patch replaces the custom worker pool implementation in writeback
with an unbound workqueue.

The conversion isn't too complicated but the following points are worth
mentioning.

* bdi_writeback->last_active, task and wakeup_timer are removed.
  delayed_work ->dwork is added instead. Explicit timer handling is no
  longer necessary. Everything works by either queueing / modding /
  flushing / canceling the delayed_work item.

* bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off
  bdi_writeback->dwork. On each execution, it processes bdi->work_list
  and reschedules itself if there are more things to do.

  The function also handles low-mem condition, which used to be handled
  by the forker thread. If the function is running off a rescuer
  thread, it only writes out a limited number of pages so that the
  rescuer can serve other bdis too. This preserves the flusher creation
  failure behavior of the forker thread.

* INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn()
  about on-going bdi unregistration so that it always drains work_list
  even if it's running off the rescuer. Note that the original code was
  broken in this regard. Under memory pressure, a bdi could finish
  unregistration with non-empty work_list.

* The default bdi is no longer special. It now is treated the same as
  any other bdi and bdi_cap_flush_forker() is removed.

* BDI_pending is no longer used. Removed.

* Some tracepoints become non-applicable. The following TPs are
  removed - writeback_nothread, writeback_wake_thread,
  writeback_wake_forker_thread, writeback_thread_start,
  writeback_thread_stop.

Everything, including devices coming and going away and rescuer
operation under simulated memory pressure, seems to work fine in my
test setup.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
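For readers less familiar with the workqueue API, the following is a minimal,
self-contained sketch (not part of this patch) of the pattern the message
describes: a delayed_work on an unbound WQ_MEM_RECLAIM workqueue that kicks
itself via mod_delayed_work()/queue_delayed_work() instead of a dedicated
kthread. The demo_* names, the stop flag and the 5-second interval are
hypothetical illustrations, not code from the patch.

/* Sketch of the delayed_work-on-unbound-workqueue pattern (hypothetical). */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

static struct workqueue_struct *demo_wq;
static bool demo_stopping;

struct demo_flusher {
	struct delayed_work dwork;	/* replaces a per-device kthread */
};

static struct demo_flusher demo_wb;

static void demo_workfn(struct work_struct *work)
{
	struct demo_flusher *f = container_of(to_delayed_work(work),
					      struct demo_flusher, dwork);

	/* one round of flushing-style work for @f would go here */
	pr_info("demo flusher %p ran\n", f);

	/* reschedule ourselves, as bdi_writeback_workfn() does */
	if (!READ_ONCE(demo_stopping))
		queue_delayed_work(demo_wq, &f->dwork, msecs_to_jiffies(5000));
}

static int __init demo_init(void)
{
	demo_wq = alloc_workqueue("demo_writeback",
				  WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, 0);
	if (!demo_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&demo_wb.dwork, demo_workfn);
	/* mod_delayed_work() with a zero delay means "kick the flusher now" */
	mod_delayed_work(demo_wq, &demo_wb.dwork, 0);
	return 0;
}

static void __exit demo_exit(void)
{
	WRITE_ONCE(demo_stopping, true);
	/* roughly the bdi_wb_shutdown() order: flush, then cancel */
	flush_delayed_work(&demo_wb.dwork);
	cancel_delayed_work_sync(&demo_wb.dwork);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The stop flag is only needed because this toy work item rearms itself
unconditionally; the real bdi_writeback_workfn() rearms only while there is
pending work or dirty IO, and shutdown marks the bdi dying instead.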
-rw-r--r--	fs/fs-writeback.c			| 102
-rw-r--r--	include/linux/backing-dev.h		|  15
-rw-r--r--	include/trace/events/writeback.h	|   5
-rw-r--r--	mm/backing-dev.c			| 255
4 files changed, 65 insertions, 312 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 21f46fb3a101..8067d3719e94 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
-#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head)
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
 
-/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
-{
-	if (bdi->wb.task) {
-		wake_up_process(bdi->wb.task);
-	} else {
-		/*
-		 * The bdi thread isn't there, wake up the forker thread which
-		 * will create and run it.
-		 */
-		wake_up_process(default_backing_dev_info.wb.task);
-	}
-}
-
 static void bdi_queue_work(struct backing_dev_info *bdi,
 			   struct wb_writeback_work *work)
 {
@@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 
 	spin_lock_bh(&bdi->wb_lock);
 	list_add_tail(&work->list, &bdi->work_list);
-	if (!bdi->wb.task)
-		trace_writeback_nothread(bdi, work);
-	bdi_wakeup_flusher(bdi);
 	spin_unlock_bh(&bdi->wb_lock);
+
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 static void
@@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		if (bdi->wb.task) {
-			trace_writeback_nowork(bdi);
-			wake_up_process(bdi->wb.task);
-		}
+		trace_writeback_nowork(bdi);
+		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 		return;
 	}
 
@@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 	 * writeback as soon as there is no other work to do.
 	 */
 	trace_writeback_wake_background(bdi);
-	spin_lock_bh(&bdi->wb_lock);
-	bdi_wakeup_flusher(bdi);
-	spin_unlock_bh(&bdi->wb_lock);
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 /*
@@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 /*
  * Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
+ * reschedules periodically and does kupdated style flushing.
  */
-int bdi_writeback_thread(void *data)
+void bdi_writeback_workfn(struct work_struct *work)
 {
-	struct bdi_writeback *wb = data;
+	struct bdi_writeback *wb = container_of(to_delayed_work(work),
+						struct bdi_writeback, dwork);
 	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
 	current->flags |= PF_SWAPWRITE;
-	set_freezable();
-	wb->last_active = jiffies;
-
-	/*
-	 * Our parent may run at a different priority, just set us to normal
-	 */
-	set_user_nice(current, 0);
-
-	trace_writeback_thread_start(bdi);
 
-	while (!kthread_freezable_should_stop(NULL)) {
+	if (likely(!current_is_workqueue_rescuer() ||
+		   list_empty(&bdi->bdi_list))) {
 		/*
-		 * Remove own delayed wake-up timer, since we are already awake
-		 * and we'll take care of the periodic write-back.
+		 * The normal path. Keep writing back @bdi until its
+		 * work_list is empty. Note that this path is also taken
+		 * if @bdi is shutting down even when we're running off the
+		 * rescuer as work_list needs to be drained.
 		 */
-		del_timer(&wb->wakeup_timer);
-
-		pages_written = wb_do_writeback(wb, 0);
-
+		do {
+			pages_written = wb_do_writeback(wb, 0);
+			trace_writeback_pages_written(pages_written);
+		} while (!list_empty(&bdi->work_list));
+	} else {
+		/*
+		 * bdi_wq can't get enough workers and we're running off
+		 * the emergency worker. Don't hog it. Hopefully, 1024 is
+		 * enough for efficient IO.
+		 */
+		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+						    WB_REASON_FORKER_THREAD);
 		trace_writeback_pages_written(pages_written);
-
-		if (pages_written)
-			wb->last_active = jiffies;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
-			__set_current_state(TASK_RUNNING);
-			continue;
-		}
-
-		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-		else {
-			/*
-			 * We have nothing to do, so can go sleep without any
-			 * timeout and save power. When a work is queued or
-			 * something is made dirty - we will be woken up.
-			 */
-			schedule();
-		}
 	}
 
-	/* Flush any work that raced with us exiting */
-	if (!list_empty(&bdi->work_list))
-		wb_do_writeback(wb, 1);
+	if (!list_empty(&bdi->work_list) ||
+	    (wb_has_dirty_io(wb) && dirty_writeback_interval))
+		queue_delayed_work(bdi_wq, &wb->dwork,
+				   msecs_to_jiffies(dirty_writeback_interval * 10));
 
-	trace_writeback_thread_stop(bdi);
-	return 0;
+	current->flags &= ~PF_SWAPWRITE;
 }
 
-
 /*
  * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
  * the whole world.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index a5ef27f5411a..c3881553f7d1 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/atomic.h>
 #include <linux/sysctl.h>
+#include <linux/workqueue.h>
 
 struct page;
 struct device;
@@ -27,7 +28,6 @@ struct dentry;
  * Bits in backing_dev_info.state
  */
 enum bdi_state {
-	BDI_pending,		/* On its way to being activated */
 	BDI_wb_alloc,		/* Default embedded wb allocated */
 	BDI_async_congested,	/* The async (write) queue is getting full */
 	BDI_sync_congested,	/* The sync queue is getting full */
@@ -53,10 +53,8 @@ struct bdi_writeback {
 	unsigned int nr;
 
 	unsigned long last_old_flush;	/* last old data flush */
-	unsigned long last_active;	/* last time bdi thread was active */
 
-	struct task_struct *task;	/* writeback thread */
-	struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
+	struct delayed_work dwork;	/* work item used for writeback */
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
@@ -123,7 +121,7 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
-int bdi_writeback_thread(void *data);
+void bdi_writeback_workfn(struct work_struct *work);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
@@ -131,6 +129,8 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
+extern struct workqueue_struct *bdi_wq;
+
 static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 {
 	return !list_empty(&wb->b_dirty) ||
@@ -335,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
 	return bdi->capabilities & BDI_CAP_SWAP_BACKED;
 }
 
-static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
-{
-	return bdi == &default_backing_dev_info;
-}
-
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 6a16fd2e70ed..464ea82e10db 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 DEFINE_EVENT(writeback_work_class, name, \
 	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
 	TP_ARGS(bdi, work))
-DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
-DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
-DEFINE_WRITEBACK_EVENT(writeback_thread_start);
-DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
 
 DECLARE_EVENT_CLASS(wbc_class,
 	TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 657569b3fcf6..2857d4f6bca4 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -37,6 +37,9 @@ static struct class *bdi_class;
 DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
 
+/* bdi_wq serves all asynchronous writeback tasks */
+struct workqueue_struct *bdi_wq;
+
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
 {
 	if (wb1 < wb2) {
@@ -255,6 +258,11 @@ static int __init default_bdi_init(void)
 {
 	int err;
 
+	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					      WQ_UNBOUND, 0);
+	if (!bdi_wq)
+		return -ENOMEM;
+
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
 		bdi_register(&default_backing_dev_info, NULL, "default");
@@ -269,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
 	return wb_has_dirty_io(&bdi->wb);
 }
 
-static void wakeup_timer_fn(unsigned long data)
-{
-	struct backing_dev_info *bdi = (struct backing_dev_info *)data;
-
-	spin_lock_bh(&bdi->wb_lock);
-	if (bdi->wb.task) {
-		trace_writeback_wake_thread(bdi);
-		wake_up_process(bdi->wb.task);
-	} else if (bdi->dev) {
-		/*
-		 * When bdi tasks are inactive for long time, they are killed.
-		 * In this case we have to wake-up the forker thread which
-		 * should create and run the bdi thread.
-		 */
-		trace_writeback_wake_forker_thread(bdi);
-		wake_up_process(default_backing_dev_info.wb.task);
-	}
-	spin_unlock_bh(&bdi->wb_lock);
-}
-
 /*
  * This function is used when the first inode for this bdi is marked dirty. It
  * wakes-up the corresponding bdi thread which should then take care of the
@@ -305,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
 	unsigned long timeout;
 
 	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
-}
-
-/*
- * Calculate the longest interval (jiffies) bdi threads are allowed to be
- * inactive.
- */
-static unsigned long bdi_longest_inactive(void)
-{
-	unsigned long interval;
-
-	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
-	return max(5UL * 60 * HZ, interval);
-}
-
-/*
- * Clear pending bit and wakeup anybody waiting for flusher thread creation or
- * shutdown
- */
-static void bdi_clear_pending(struct backing_dev_info *bdi)
-{
-	clear_bit(BDI_pending, &bdi->state);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&bdi->state, BDI_pending);
-}
-
-static int bdi_forker_thread(void *ptr)
-{
-	struct bdi_writeback *me = ptr;
-
-	current->flags |= PF_SWAPWRITE;
-	set_freezable();
-
-	/*
-	 * Our parent may run at a different priority, just set us to normal
-	 */
-	set_user_nice(current, 0);
-
-	for (;;) {
-		struct task_struct *task = NULL;
-		struct backing_dev_info *bdi;
-		enum {
-			NO_ACTION,   /* Nothing to do */
-			FORK_THREAD, /* Fork bdi thread */
-			KILL_THREAD, /* Kill inactive bdi thread */
-		} action = NO_ACTION;
-
-		/*
-		 * Temporary measure, we want to make sure we don't see
-		 * dirty data on the default backing_dev_info
-		 */
-		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
-			del_timer(&me->wakeup_timer);
-			wb_do_writeback(me, 0);
-		}
-
-		spin_lock_bh(&bdi_lock);
-		/*
-		 * In the following loop we are going to check whether we have
-		 * some work to do without any synchronization with tasks
-		 * waking us up to do work for them. Set the task state here
-		 * so that we don't miss wakeups after verifying conditions.
-		 */
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		list_for_each_entry(bdi, &bdi_list, bdi_list) {
-			bool have_dirty_io;
-
-			if (!bdi_cap_writeback_dirty(bdi) ||
-			     bdi_cap_flush_forker(bdi))
-				continue;
-
-			WARN(!test_bit(BDI_registered, &bdi->state),
-			     "bdi %p/%s is not registered!\n", bdi, bdi->name);
-
-			have_dirty_io = !list_empty(&bdi->work_list) ||
-					wb_has_dirty_io(&bdi->wb);
-
-			/*
-			 * If the bdi has work to do, but the thread does not
-			 * exist - create it.
-			 */
-			if (!bdi->wb.task && have_dirty_io) {
-				/*
-				 * Set the pending bit - if someone will try to
-				 * unregister this bdi - it'll wait on this bit.
-				 */
-				set_bit(BDI_pending, &bdi->state);
-				action = FORK_THREAD;
-				break;
-			}
-
-			spin_lock(&bdi->wb_lock);
-
-			/*
-			 * If there is no work to do and the bdi thread was
-			 * inactive long enough - kill it. The wb_lock is taken
-			 * to make sure no-one adds more work to this bdi and
-			 * wakes the bdi thread up.
-			 */
-			if (bdi->wb.task && !have_dirty_io &&
-			    time_after(jiffies, bdi->wb.last_active +
-						bdi_longest_inactive())) {
-				task = bdi->wb.task;
-				bdi->wb.task = NULL;
-				spin_unlock(&bdi->wb_lock);
-				set_bit(BDI_pending, &bdi->state);
-				action = KILL_THREAD;
-				break;
-			}
-			spin_unlock(&bdi->wb_lock);
-		}
-		spin_unlock_bh(&bdi_lock);
-
-		/* Keep working if default bdi still has things to do */
-		if (!list_empty(&me->bdi->work_list))
-			__set_current_state(TASK_RUNNING);
-
-		switch (action) {
-		case FORK_THREAD:
-			__set_current_state(TASK_RUNNING);
-			task = kthread_create(bdi_writeback_thread, &bdi->wb,
-					      "flush-%s", dev_name(bdi->dev));
-			if (IS_ERR(task)) {
-				/*
-				 * If thread creation fails, force writeout of
-				 * the bdi from the thread. Hopefully 1024 is
-				 * large enough for efficient IO.
-				 */
-				writeback_inodes_wb(&bdi->wb, 1024,
-						    WB_REASON_FORKER_THREAD);
-			} else {
-				/*
-				 * The spinlock makes sure we do not lose
-				 * wake-ups when racing with 'bdi_queue_work()'.
-				 * And as soon as the bdi thread is visible, we
-				 * can start it.
-				 */
-				spin_lock_bh(&bdi->wb_lock);
-				bdi->wb.task = task;
-				spin_unlock_bh(&bdi->wb_lock);
-				wake_up_process(task);
-			}
-			bdi_clear_pending(bdi);
-			break;
-
-		case KILL_THREAD:
-			__set_current_state(TASK_RUNNING);
-			kthread_stop(task);
-			bdi_clear_pending(bdi);
-			break;
-
-		case NO_ACTION:
-			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
-				/*
-				 * There are no dirty data. The only thing we
-				 * should now care about is checking for
-				 * inactive bdi threads and killing them. Thus,
-				 * let's sleep for longer time, save energy and
-				 * be friendly for battery-driven devices.
-				 */
-				schedule_timeout(bdi_longest_inactive());
-			else
-				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-			try_to_freeze();
-			break;
-		}
-	}
-
-	return 0;
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
 }
 
 /*
@@ -487,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
 	spin_unlock_bh(&bdi_lock);
 
 	synchronize_rcu_expedited();
+
+	/* bdi_list is now unused, clear it to mark @bdi dying */
+	INIT_LIST_HEAD(&bdi->bdi_list);
 }
 
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -506,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
 	bdi->dev = dev;
 
-	/*
-	 * Just start the forker thread for our default backing_dev_info,
-	 * and add other bdi's to the list. They will get a thread created
-	 * on-demand when they need it.
-	 */
-	if (bdi_cap_flush_forker(bdi)) {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
-					dev_name(dev));
-		if (IS_ERR(wb->task))
-			return PTR_ERR(wb->task);
-	}
-
 	bdi_debug_register(bdi, dev_name(dev));
 	set_bit(BDI_registered, &bdi->state);
 
@@ -543,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
  */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
-	struct task_struct *task;
-
 	if (!bdi_cap_writeback_dirty(bdi))
 		return;
 
@@ -554,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	bdi_remove_from_list(bdi);
 
 	/*
-	 * If setup is pending, wait for that to complete first
+	 * Drain work list and shutdown the delayed_work. At this point,
+	 * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
+	 * is dying and its work_list needs to be drained no matter what.
 	 */
-	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
-			TASK_UNINTERRUPTIBLE);
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+	flush_delayed_work(&bdi->wb.dwork);
+	WARN_ON(!list_empty(&bdi->work_list));
 
 	/*
-	 * Finally, kill the kernel thread. We don't need to be RCU
-	 * safe anymore, since the bdi is gone from visibility.
+	 * This shouldn't be necessary unless @bdi for some reason has
+	 * unflushed dirty IO after work_list is drained. Do it anyway
+	 * just in case.
 	 */
-	spin_lock_bh(&bdi->wb_lock);
-	task = bdi->wb.task;
-	bdi->wb.task = NULL;
-	spin_unlock_bh(&bdi->wb_lock);
-
-	if (task)
-		kthread_stop(task);
+	cancel_delayed_work_sync(&bdi->wb.dwork);
 }
 
 /*
@@ -595,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
 	bdi_set_min_ratio(bdi, 0);
 	trace_writeback_bdi_unregister(bdi);
 	bdi_prune_sb(bdi);
-	del_timer_sync(&bdi->wb.wakeup_timer);
 
-	if (!bdi_cap_flush_forker(bdi))
-		bdi_wb_shutdown(bdi);
+	bdi_wb_shutdown(bdi);
 	bdi_debug_unregister(bdi);
 
 	spin_lock_bh(&bdi->wb_lock);
@@ -620,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
 	spin_lock_init(&wb->list_lock);
-	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
 
 /*
@@ -693,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	bdi_unregister(bdi);
 
 	/*
-	 * If bdi_unregister() had already been called earlier, the
-	 * wakeup_timer could still be armed because bdi_prune_sb()
-	 * can race with the bdi_wakeup_thread_delayed() calls from
-	 * __mark_inode_dirty().
+	 * If bdi_unregister() had already been called earlier, the dwork
+	 * could still be pending because bdi_prune_sb() can race with the
+	 * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
 	 */
-	del_timer_sync(&bdi->wb.wakeup_timer);
+	cancel_delayed_work_sync(&bdi->wb.dwork);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);