author    Tejun Heo <tj@kernel.org>    2013-04-01 22:08:06 -0400
committer Tejun Heo <tj@kernel.org>    2013-04-01 22:08:06 -0400
commit    839a8e8660b6777e7fe4e80af1a048aebe2b5977
tree      80398cd4dd8ebc4c51be20725c0cc427bfe321b3  /fs/fs-writeback.c
parent    181387da2d64c3129e5b5186c4dd388bc5041d53
writeback: replace custom worker pool implementation with unbound workqueue
Writeback implements its own worker pool - each bdi can be associated
with a worker thread which is created and destroyed dynamically.  The
worker thread for the default bdi is always present and serves as the
"forker" thread which forks off worker threads for other bdis.

There's no reason for writeback to implement its own worker pool when
using an unbound workqueue instead is much simpler and more efficient.
This patch replaces the custom worker pool implementation in writeback
with an unbound workqueue.

The conversion isn't too complicated but the following points are worth
mentioning.

* bdi_writeback->last_active, task and wakeup_timer are removed.
  delayed_work ->dwork is added instead.  Explicit timer handling is no
  longer necessary.  Everything works by either queueing / modding /
  flushing / canceling the delayed_work item.

* bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off
  bdi_writeback->dwork.  On each execution, it processes
  bdi->work_list and reschedules itself if there are more things to do.

  The function also handles the low-mem condition, which used to be
  handled by the forker thread.  If the function is running off a
  rescuer thread, it only writes out a limited number of pages so that
  the rescuer can serve other bdis too.  This preserves the flusher
  creation failure behavior of the forker thread.

* INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn()
  about on-going bdi unregistration so that it always drains work_list
  even if it's running off the rescuer.  Note that the original code
  was broken in this regard.  Under memory pressure, a bdi could finish
  unregistration with a non-empty work_list.

* The default bdi is no longer special.  It now is treated the same as
  any other bdi and bdi_cap_flush_forker() is removed.

* BDI_pending is no longer used.  Removed.

* Some tracepoints become non-applicable.  The following TPs are
  removed - writeback_nothread, writeback_wake_thread,
  writeback_wake_forker_thread, writeback_thread_start,
  writeback_thread_stop.

Everything, including devices coming and going away and rescuer
operation under simulated memory pressure, seems to work fine in my
test setup.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
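What replaces the per-bdi threads is the stock workqueue machinery: a
single unbound workqueue (bdi_wq) serving a per-bdi delayed_work
(wb->dwork).  A minimal sketch of that pattern follows; the actual
allocation lives in mm/backing-dev.c, outside this diffstat, so the
flags and function names below are illustrative assumptions rather than
the patch's code.

/* Sketch only: bdi_wq allocation is in mm/backing-dev.c, not in
 * fs/fs-writeback.c; flags and function names here are assumptions. */
#include <linux/workqueue.h>
#include <linux/backing-dev.h>

struct workqueue_struct *bdi_wq;        /* serves all bdi flusher work */

static int __init bdi_wq_init_sketch(void)
{
        /* WQ_UNBOUND: workers are not bound to a CPU, matching the old
         * per-bdi flusher threads.  WQ_MEM_RECLAIM: the queue gets a
         * rescuer thread so writeback can make forward progress even
         * when new workers cannot be forked under memory pressure. */
        bdi_wq = alloc_workqueue("writeback",
                                 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
        return bdi_wq ? 0 : -ENOMEM;
}

static void bdi_wb_setup_sketch(struct bdi_writeback *wb)
{
        /* One delayed_work replaces wb->task, wb->wakeup_timer and
         * wb->last_active; bdi_writeback_workfn() is the work function
         * added by this patch. */
        INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}

With that in place, waking the flusher from this file reduces to
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0), which the hunks below
substitute for the old wake_up_process() calls.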
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--    fs/fs-writeback.c    102
1 file changed, 32 insertions(+), 70 deletions(-)
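Two different delayed_work calls show up in the diff that follows,
matching the commit message's "queueing / modding" wording:
mod_delayed_work() (re)arms the work with the given delay even if it is
already pending, so a delay of 0 pulls any pending timer forward and
runs the flusher immediately, whereas queue_delayed_work() is a no-op
when the work is already pending.  A small contrast sketch, with
hypothetical helper names that are not part of the patch:

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>

static void wb_kick_now(struct bdi_writeback *wb)
{
        /* New work was queued or background writeback was requested:
         * run the flusher as soon as possible, overriding any pending
         * periodic timer. */
        mod_delayed_work(bdi_wq, &wb->dwork, 0);
}

static void wb_rearm_periodic(struct bdi_writeback *wb)
{
        /* Re-arm kupdated-style flushing: only queues if dwork is not
         * already pending, so an earlier wakeup is never pushed back. */
        queue_delayed_work(bdi_wq, &wb->dwork,
                           msecs_to_jiffies(dirty_writeback_interval * 10));
}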
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 21f46fb3a101..8067d3719e94 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
-#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head)
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
 
-/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
-{
-        if (bdi->wb.task) {
-                wake_up_process(bdi->wb.task);
-        } else {
-                /*
-                 * The bdi thread isn't there, wake up the forker thread which
-                 * will create and run it.
-                 */
-                wake_up_process(default_backing_dev_info.wb.task);
-        }
-}
-
 static void bdi_queue_work(struct backing_dev_info *bdi,
                            struct wb_writeback_work *work)
 {
@@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 
         spin_lock_bh(&bdi->wb_lock);
         list_add_tail(&work->list, &bdi->work_list);
-        if (!bdi->wb.task)
-                trace_writeback_nothread(bdi, work);
-        bdi_wakeup_flusher(bdi);
         spin_unlock_bh(&bdi->wb_lock);
+
+        mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 static void
@@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
          */
         work = kzalloc(sizeof(*work), GFP_ATOMIC);
         if (!work) {
-                if (bdi->wb.task) {
-                        trace_writeback_nowork(bdi);
-                        wake_up_process(bdi->wb.task);
-                }
+                trace_writeback_nowork(bdi);
+                mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
                 return;
         }
 
@@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
          * writeback as soon as there is no other work to do.
          */
         trace_writeback_wake_background(bdi);
-        spin_lock_bh(&bdi->wb_lock);
-        bdi_wakeup_flusher(bdi);
-        spin_unlock_bh(&bdi->wb_lock);
+        mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 /*
@@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 /*
  * Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
+ * reschedules periodically and does kupdated style flushing.
  */
-int bdi_writeback_thread(void *data)
+void bdi_writeback_workfn(struct work_struct *work)
 {
-        struct bdi_writeback *wb = data;
+        struct bdi_writeback *wb = container_of(to_delayed_work(work),
+                                                struct bdi_writeback, dwork);
         struct backing_dev_info *bdi = wb->bdi;
         long pages_written;
 
         current->flags |= PF_SWAPWRITE;
-        set_freezable();
-        wb->last_active = jiffies;
-
-        /*
-         * Our parent may run at a different priority, just set us to normal
-         */
-        set_user_nice(current, 0);
-
-        trace_writeback_thread_start(bdi);
 
-        while (!kthread_freezable_should_stop(NULL)) {
+        if (likely(!current_is_workqueue_rescuer() ||
+                   list_empty(&bdi->bdi_list))) {
                 /*
-                 * Remove own delayed wake-up timer, since we are already awake
-                 * and we'll take care of the periodic write-back.
+                 * The normal path.  Keep writing back @bdi until its
+                 * work_list is empty.  Note that this path is also taken
+                 * if @bdi is shutting down even when we're running off the
+                 * rescuer as work_list needs to be drained.
                  */
-                del_timer(&wb->wakeup_timer);
-
-                pages_written = wb_do_writeback(wb, 0);
-
+                do {
+                        pages_written = wb_do_writeback(wb, 0);
+                        trace_writeback_pages_written(pages_written);
+                } while (!list_empty(&bdi->work_list));
+        } else {
+                /*
+                 * bdi_wq can't get enough workers and we're running off
+                 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
+                 * enough for efficient IO.
+                 */
+                pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+                                                    WB_REASON_FORKER_THREAD);
                 trace_writeback_pages_written(pages_written);
-
-                if (pages_written)
-                        wb->last_active = jiffies;
-
-                set_current_state(TASK_INTERRUPTIBLE);
-                if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
-                        __set_current_state(TASK_RUNNING);
-                        continue;
-                }
-
-                if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-                        schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-                else {
-                        /*
-                         * We have nothing to do, so can go sleep without any
-                         * timeout and save power. When a work is queued or
-                         * something is made dirty - we will be woken up.
-                         */
-                        schedule();
-                }
         }
 
-        /* Flush any work that raced with us exiting */
-        if (!list_empty(&bdi->work_list))
-                wb_do_writeback(wb, 1);
+        if (!list_empty(&bdi->work_list) ||
+            (wb_has_dirty_io(wb) && dirty_writeback_interval))
+                queue_delayed_work(bdi_wq, &wb->dwork,
+                        msecs_to_jiffies(dirty_writeback_interval * 10));
 
-        trace_writeback_thread_stop(bdi);
-        return 0;
+        current->flags &= ~PF_SWAPWRITE;
 }
 
-
 /*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.