Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--  fs/fs-writeback.c  641
1 file changed, 279 insertions(+), 362 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b37f7cea4dd..2f76c4a081a2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,62 +26,38 @@
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/tracepoint.h>
29#include "internal.h" 30#include "internal.h"
30 31
31#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
32
33/*
34 * We don't actually have pdflush, but this one is exported though /proc...
35 */
36int nr_pdflush_threads;
37
38/* 32/*
39 * Passed into wb_writeback(), essentially a subset of writeback_control 33 * Passed into wb_writeback(), essentially a subset of writeback_control
40 */ 34 */
41struct wb_writeback_args { 35struct wb_writeback_work {
42 long nr_pages; 36 long nr_pages;
43 struct super_block *sb; 37 struct super_block *sb;
44 enum writeback_sync_modes sync_mode; 38 enum writeback_sync_modes sync_mode;
45 int for_kupdate:1; 39 unsigned int for_kupdate:1;
46 int range_cyclic:1; 40 unsigned int range_cyclic:1;
47 int for_background:1; 41 unsigned int for_background:1;
48};
49 42
50/*
51 * Work items for the bdi_writeback threads
52 */
53struct bdi_work {
54 struct list_head list; /* pending work list */ 43 struct list_head list; /* pending work list */
55 struct rcu_head rcu_head; /* for RCU free/clear of work */ 44 struct completion *done; /* set if the caller waits */
56
57 unsigned long seen; /* threads that have seen this work */
58 atomic_t pending; /* number of threads still to do work */
59
60 struct wb_writeback_args args; /* writeback arguments */
61
62 unsigned long state; /* flag bits, see WS_* */
63};
64
65enum {
66 WS_USED_B = 0,
67 WS_ONSTACK_B,
68}; 45};
69 46
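The new wb_writeback_work above replaces the old bdi_work machinery: a plain list_head queued under bdi->wb_lock, plus an optional completion for callers that must wait for the work to finish. As a reading aid (a sketch mirroring sync_inodes_sb() further down in this patch, not additional code from the commit), the synchronous caller pattern it enables looks like:

	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.done		= &done,
	};

	bdi_queue_work(sb->s_bdi, &work);	/* wakes the bdi flusher thread */
	wait_for_completion(&done);		/* the thread signals ->done when finished */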
70#define WS_USED (1 << WS_USED_B) 47/*
71#define WS_ONSTACK (1 << WS_ONSTACK_B) 48 * Include the creation of the trace points after defining the
49 * wb_writeback_work structure so that the definition remains local to this
50 * file.
51 */
52#define CREATE_TRACE_POINTS
53#include <trace/events/writeback.h>
72 54
73static inline bool bdi_work_on_stack(struct bdi_work *work) 55#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
74{
75 return test_bit(WS_ONSTACK_B, &work->state);
76}
77 56
78static inline void bdi_work_init(struct bdi_work *work, 57/*
 79 struct wb_writeback_args *args) 58 * We don't actually have pdflush, but this one is exported through /proc...
80{ 59 */
81 INIT_RCU_HEAD(&work->rcu_head); 60int nr_pdflush_threads;
82 work->args = *args;
83 work->state = WS_USED;
84}
85 61
86/** 62/**
87 * writeback_in_progress - determine whether there is writeback in progress 63 * writeback_in_progress - determine whether there is writeback in progress
@@ -95,183 +71,81 @@ int writeback_in_progress(struct backing_dev_info *bdi)
95 return !list_empty(&bdi->work_list); 71 return !list_empty(&bdi->work_list);
96} 72}
97 73
98static void bdi_work_clear(struct bdi_work *work) 74static void bdi_queue_work(struct backing_dev_info *bdi,
99{ 75 struct wb_writeback_work *work)
100 clear_bit(WS_USED_B, &work->state);
101 smp_mb__after_clear_bit();
102 /*
103 * work can have disappeared at this point. bit waitq functions
104 * should be able to tolerate this, provided bdi_sched_wait does
105 * not dereference it's pointer argument.
106 */
107 wake_up_bit(&work->state, WS_USED_B);
108}
109
110static void bdi_work_free(struct rcu_head *head)
111{
112 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
113
114 if (!bdi_work_on_stack(work))
115 kfree(work);
116 else
117 bdi_work_clear(work);
118}
119
120static void wb_work_complete(struct bdi_work *work)
121{
122 const enum writeback_sync_modes sync_mode = work->args.sync_mode;
123 int onstack = bdi_work_on_stack(work);
124
125 /*
126 * For allocated work, we can clear the done/seen bit right here.
127 * For on-stack work, we need to postpone both the clear and free
128 * to after the RCU grace period, since the stack could be invalidated
129 * as soon as bdi_work_clear() has done the wakeup.
130 */
131 if (!onstack)
132 bdi_work_clear(work);
133 if (sync_mode == WB_SYNC_NONE || onstack)
134 call_rcu(&work->rcu_head, bdi_work_free);
135}
136
137static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
138{
139 /*
140 * The caller has retrieved the work arguments from this work,
141 * drop our reference. If this is the last ref, delete and free it
142 */
143 if (atomic_dec_and_test(&work->pending)) {
144 struct backing_dev_info *bdi = wb->bdi;
145
146 spin_lock(&bdi->wb_lock);
147 list_del_rcu(&work->list);
148 spin_unlock(&bdi->wb_lock);
149
150 wb_work_complete(work);
151 }
152}
153
154static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
155{ 76{
156 work->seen = bdi->wb_mask; 77 trace_writeback_queue(bdi, work);
157 BUG_ON(!work->seen);
158 atomic_set(&work->pending, bdi->wb_cnt);
159 BUG_ON(!bdi->wb_cnt);
160 78
161 /* 79 spin_lock_bh(&bdi->wb_lock);
162 * list_add_tail_rcu() contains the necessary barriers to 80 list_add_tail(&work->list, &bdi->work_list);
163 * make sure the above stores are seen before the item is 81 if (bdi->wb.task) {
164 * noticed on the list 82 wake_up_process(bdi->wb.task);
165 */ 83 } else {
166 spin_lock(&bdi->wb_lock); 84 /*
167 list_add_tail_rcu(&work->list, &bdi->work_list); 85 * The bdi thread isn't there, wake up the forker thread which
168 spin_unlock(&bdi->wb_lock); 86 * will create and run it.
169 87 */
170 /* 88 trace_writeback_nothread(bdi, work);
171 * If the default thread isn't there, make sure we add it. When
172 * it gets created and wakes up, we'll run this work.
173 */
174 if (unlikely(list_empty_careful(&bdi->wb_list)))
175 wake_up_process(default_backing_dev_info.wb.task); 89 wake_up_process(default_backing_dev_info.wb.task);
176 else {
177 struct bdi_writeback *wb = &bdi->wb;
178
179 if (wb->task)
180 wake_up_process(wb->task);
181 } 90 }
91 spin_unlock_bh(&bdi->wb_lock);
182} 92}
183 93
184/* 94static void
185 * Used for on-stack allocated work items. The caller needs to wait until 95__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
186 * the wb threads have acked the work before it's safe to continue. 96 bool range_cyclic, bool for_background)
187 */
188static void bdi_wait_on_work_clear(struct bdi_work *work)
189{
190 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
191 TASK_UNINTERRUPTIBLE);
192}
193
194static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
195 struct wb_writeback_args *args)
196{ 97{
197 struct bdi_work *work; 98 struct wb_writeback_work *work;
198 99
199 /* 100 /*
200 * This is WB_SYNC_NONE writeback, so if allocation fails just 101 * This is WB_SYNC_NONE writeback, so if allocation fails just
201 * wakeup the thread for old dirty data writeback 102 * wakeup the thread for old dirty data writeback
202 */ 103 */
203 work = kmalloc(sizeof(*work), GFP_ATOMIC); 104 work = kzalloc(sizeof(*work), GFP_ATOMIC);
204 if (work) { 105 if (!work) {
205 bdi_work_init(work, args); 106 if (bdi->wb.task) {
206 bdi_queue_work(bdi, work); 107 trace_writeback_nowork(bdi);
207 } else { 108 wake_up_process(bdi->wb.task);
208 struct bdi_writeback *wb = &bdi->wb; 109 }
209 110 return;
210 if (wb->task)
211 wake_up_process(wb->task);
212 } 111 }
112
113 work->sync_mode = WB_SYNC_NONE;
114 work->nr_pages = nr_pages;
115 work->range_cyclic = range_cyclic;
116 work->for_background = for_background;
117
118 bdi_queue_work(bdi, work);
213} 119}
214 120
215/** 121/**
216 * bdi_sync_writeback - start and wait for writeback 122 * bdi_start_writeback - start writeback
217 * @bdi: the backing device to write from 123 * @bdi: the backing device to write from
218 * @sb: write inodes from this super_block 124 * @nr_pages: the number of pages to write
219 * 125 *
220 * Description: 126 * Description:
221 * This does WB_SYNC_ALL data integrity writeback and waits for the 127 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
 222 * IO to complete. Callers must hold the sb s_umount semaphore for 128 * started when this function returns, we make no guarantees on
223 * reading, to avoid having the super disappear before we are done. 129 * completion. Caller need not hold sb s_umount semaphore.
130 *
224 */ 131 */
225static void bdi_sync_writeback(struct backing_dev_info *bdi, 132void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
226 struct super_block *sb)
227{ 133{
228 struct wb_writeback_args args = { 134 __bdi_start_writeback(bdi, nr_pages, true, false);
229 .sb = sb,
230 .sync_mode = WB_SYNC_ALL,
231 .nr_pages = LONG_MAX,
232 .range_cyclic = 0,
233 };
234 struct bdi_work work;
235
236 bdi_work_init(&work, &args);
237 work.state |= WS_ONSTACK;
238
239 bdi_queue_work(bdi, &work);
240 bdi_wait_on_work_clear(&work);
241} 135}
242 136
243/** 137/**
244 * bdi_start_writeback - start writeback 138 * bdi_start_background_writeback - start background writeback
245 * @bdi: the backing device to write from 139 * @bdi: the backing device to write from
246 * @sb: write inodes from this super_block
247 * @nr_pages: the number of pages to write
248 * 140 *
249 * Description: 141 * Description:
250 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 142 * This does WB_SYNC_NONE background writeback. The IO is only
 251 * started when this function returns, we make no guarantees on 143 * started when this function returns, we make no guarantees on
252 * completion. Caller need not hold sb s_umount semaphore. 144 * completion. Caller need not hold sb s_umount semaphore.
253 *
254 */ 145 */
255void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 146void bdi_start_background_writeback(struct backing_dev_info *bdi)
256 long nr_pages)
257{ 147{
258 struct wb_writeback_args args = { 148 __bdi_start_writeback(bdi, LONG_MAX, true, true);
259 .sb = sb,
260 .sync_mode = WB_SYNC_NONE,
261 .nr_pages = nr_pages,
262 .range_cyclic = 1,
263 };
264
265 /*
266 * We treat @nr_pages=0 as the special case to do background writeback,
267 * ie. to sync pages until the background dirty threshold is reached.
268 */
269 if (!nr_pages) {
270 args.nr_pages = LONG_MAX;
271 args.for_background = 1;
272 }
273
274 bdi_alloc_queue_work(bdi, &args);
275} 149}
276 150
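On the asynchronous side, __bdi_start_writeback() above allocates the descriptor with kzalloc(GFP_ATOMIC); if that fails it simply wakes the flusher thread so old dirty data still gets written. Callers therefore fire and forget, roughly (the caller context here is illustrative, not taken from this diff):

	/* opportunistic writeback of about nr_pages pages, returns immediately */
	bdi_start_writeback(bdi, nr_pages);

	/* keep writing until we drop below the background dirty threshold */
	bdi_start_background_writeback(bdi);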
277/* 151/*
@@ -398,11 +272,11 @@ static void inode_wait_for_writeback(struct inode *inode)
398 wait_queue_head_t *wqh; 272 wait_queue_head_t *wqh;
399 273
400 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 274 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
401 do { 275 while (inode->i_state & I_SYNC) {
402 spin_unlock(&inode_lock); 276 spin_unlock(&inode_lock);
403 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 277 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
404 spin_lock(&inode_lock); 278 spin_lock(&inode_lock);
405 } while (inode->i_state & I_SYNC); 279 }
406} 280}
407 281
408/* 282/*
@@ -452,11 +326,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
452 326
453 BUG_ON(inode->i_state & I_SYNC); 327 BUG_ON(inode->i_state & I_SYNC);
454 328
455 /* Set I_SYNC, reset I_DIRTY */ 329 /* Set I_SYNC, reset I_DIRTY_PAGES */
456 dirty = inode->i_state & I_DIRTY;
457 inode->i_state |= I_SYNC; 330 inode->i_state |= I_SYNC;
458 inode->i_state &= ~I_DIRTY; 331 inode->i_state &= ~I_DIRTY_PAGES;
459
460 spin_unlock(&inode_lock); 332 spin_unlock(&inode_lock);
461 333
462 ret = do_writepages(mapping, wbc); 334 ret = do_writepages(mapping, wbc);
@@ -472,6 +344,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
472 ret = err; 344 ret = err;
473 } 345 }
474 346
347 /*
348 * Some filesystems may redirty the inode during the writeback
349 * due to delalloc, clear dirty metadata flags right before
350 * write_inode()
351 */
352 spin_lock(&inode_lock);
353 dirty = inode->i_state & I_DIRTY;
354 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
355 spin_unlock(&inode_lock);
475 /* Don't write the inode if only I_DIRTY_PAGES was set */ 356 /* Don't write the inode if only I_DIRTY_PAGES was set */
476 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 357 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
477 int err = write_inode(inode, wbc); 358 int err = write_inode(inode, wbc);
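Re-sampling i_state under inode_lock right before write_inode(), as added above, matters for delalloc filesystems: block allocation inside ->writepages() can dirty the inode's metadata again, and with the old ordering that fresh dirt was judged against the stale 'dirty' value sampled before do_writepages(), so it had to wait for a later writeback pass. An illustrative interleaving (not part of the patch):

	/*
	 *  writeback_single_inode()           filesystem, inside ->writepages()
	 *  ------------------------           ---------------------------------
	 *  dirty = i_state & I_DIRTY   (old)
	 *  i_state &= ~I_DIRTY         (old)
	 *  do_writepages()  ---------------->  delalloc allocation marks the
	 *                                      inode metadata dirty again
	 *  write_inode()?   <---------------   decided on the stale 'dirty',
	 *                                      so the new metadata was missed
	 */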
@@ -481,7 +362,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
481 362
482 spin_lock(&inode_lock); 363 spin_lock(&inode_lock);
483 inode->i_state &= ~I_SYNC; 364 inode->i_state &= ~I_SYNC;
484 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { 365 if (!(inode->i_state & I_FREEING)) {
485 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { 366 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
486 /* 367 /*
487 * More pages get dirtied by a fast dirtier. 368 * More pages get dirtied by a fast dirtier.
@@ -554,75 +435,69 @@ select_queue:
554 return ret; 435 return ret;
555} 436}
556 437
557static void unpin_sb_for_writeback(struct super_block *sb)
558{
559 up_read(&sb->s_umount);
560 put_super(sb);
561}
562
563enum sb_pin_state {
564 SB_PINNED,
565 SB_NOT_PINNED,
566 SB_PIN_FAILED
567};
568
569/* 438/*
570 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 439 * For background writeback the caller does not have the sb pinned
571 * before calling writeback. So make sure that we do pin it, so it doesn't 440 * before calling writeback. So make sure that we do pin it, so it doesn't
572 * go away while we are writing inodes from it. 441 * go away while we are writing inodes from it.
573 */ 442 */
574static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, 443static bool pin_sb_for_writeback(struct super_block *sb)
575 struct super_block *sb)
576{ 444{
577 /*
578 * Caller must already hold the ref for this
579 */
580 if (wbc->sync_mode == WB_SYNC_ALL) {
581 WARN_ON(!rwsem_is_locked(&sb->s_umount));
582 return SB_NOT_PINNED;
583 }
584 spin_lock(&sb_lock); 445 spin_lock(&sb_lock);
446 if (list_empty(&sb->s_instances)) {
447 spin_unlock(&sb_lock);
448 return false;
449 }
450
585 sb->s_count++; 451 sb->s_count++;
452 spin_unlock(&sb_lock);
453
586 if (down_read_trylock(&sb->s_umount)) { 454 if (down_read_trylock(&sb->s_umount)) {
587 if (sb->s_root) { 455 if (sb->s_root)
588 spin_unlock(&sb_lock); 456 return true;
589 return SB_PINNED;
590 }
591 /*
592 * umounted, drop rwsem again and fall through to failure
593 */
594 up_read(&sb->s_umount); 457 up_read(&sb->s_umount);
595 } 458 }
596 sb->s_count--; 459
597 spin_unlock(&sb_lock); 460 put_super(sb);
598 return SB_PIN_FAILED; 461 return false;
599} 462}
600 463
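pin_sb_for_writeback() now takes a passive reference (s_count++) plus a read lock on s_umount, and bails out early if the superblock is already being torn down (empty s_instances). The matching release is the existing drop_super(), which does up_read(&sb->s_umount) followed by put_super(); the pairing, as used a few hunks below in writeback_inodes_wb():

	if (!pin_sb_for_writeback(sb)) {
		requeue_io(inode);	/* sb is going away, revisit the inode later */
		continue;
	}
	ret = writeback_sb_inodes(sb, wb, wbc, false);
	drop_super(sb);			/* up_read(&sb->s_umount) + put_super(sb) */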
601/* 464/*
602 * Write a portion of b_io inodes which belong to @sb. 465 * Write a portion of b_io inodes which belong to @sb.
603 * If @wbc->sb != NULL, then find and write all such 466 *
467 * If @only_this_sb is true, then find and write all such
604 * inodes. Otherwise write only ones which go sequentially 468 * inodes. Otherwise write only ones which go sequentially
605 * in reverse order. 469 * in reverse order.
470 *
606 * Return 1, if the caller writeback routine should be 471 * Return 1, if the caller writeback routine should be
607 * interrupted. Otherwise return 0. 472 * interrupted. Otherwise return 0.
608 */ 473 */
609static int writeback_sb_inodes(struct super_block *sb, 474static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
610 struct bdi_writeback *wb, 475 struct writeback_control *wbc, bool only_this_sb)
611 struct writeback_control *wbc)
612{ 476{
613 while (!list_empty(&wb->b_io)) { 477 while (!list_empty(&wb->b_io)) {
614 long pages_skipped; 478 long pages_skipped;
615 struct inode *inode = list_entry(wb->b_io.prev, 479 struct inode *inode = list_entry(wb->b_io.prev,
616 struct inode, i_list); 480 struct inode, i_list);
617 if (wbc->sb && sb != inode->i_sb) { 481
618 /* super block given and doesn't 482 if (inode->i_sb != sb) {
619 match, skip this inode */ 483 if (only_this_sb) {
620 redirty_tail(inode); 484 /*
621 continue; 485 * We only want to write back data for this
622 } 486 * superblock, move all inodes not belonging
623 if (sb != inode->i_sb) 487 * to it back onto the dirty list.
624 /* finish with this superblock */ 488 */
489 redirty_tail(inode);
490 continue;
491 }
492
493 /*
494 * The inode belongs to a different superblock.
495 * Bounce back to the caller to unpin this and
496 * pin the next superblock.
497 */
625 return 0; 498 return 0;
499 }
500
626 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 501 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
627 requeue_io(inode); 502 requeue_io(inode);
628 continue; 503 continue;
@@ -634,7 +509,7 @@ static int writeback_sb_inodes(struct super_block *sb,
634 if (inode_dirtied_after(inode, wbc->wb_start)) 509 if (inode_dirtied_after(inode, wbc->wb_start))
635 return 1; 510 return 1;
636 511
637 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 512 BUG_ON(inode->i_state & I_FREEING);
638 __iget(inode); 513 __iget(inode);
639 pages_skipped = wbc->pages_skipped; 514 pages_skipped = wbc->pages_skipped;
640 writeback_single_inode(inode, wbc); 515 writeback_single_inode(inode, wbc);
@@ -660,12 +535,13 @@ static int writeback_sb_inodes(struct super_block *sb,
660 return 1; 535 return 1;
661} 536}
662 537
663static void writeback_inodes_wb(struct bdi_writeback *wb, 538void writeback_inodes_wb(struct bdi_writeback *wb,
664 struct writeback_control *wbc) 539 struct writeback_control *wbc)
665{ 540{
666 int ret = 0; 541 int ret = 0;
667 542
668 wbc->wb_start = jiffies; /* livelock avoidance */ 543 if (!wbc->wb_start)
544 wbc->wb_start = jiffies; /* livelock avoidance */
669 spin_lock(&inode_lock); 545 spin_lock(&inode_lock);
670 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 546 if (!wbc->for_kupdate || list_empty(&wb->b_io))
671 queue_io(wb, wbc->older_than_this); 547 queue_io(wb, wbc->older_than_this);
@@ -674,24 +550,14 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
674 struct inode *inode = list_entry(wb->b_io.prev, 550 struct inode *inode = list_entry(wb->b_io.prev,
675 struct inode, i_list); 551 struct inode, i_list);
676 struct super_block *sb = inode->i_sb; 552 struct super_block *sb = inode->i_sb;
677 enum sb_pin_state state;
678
679 if (wbc->sb && sb != wbc->sb) {
680 /* super block given and doesn't
681 match, skip this inode */
682 redirty_tail(inode);
683 continue;
684 }
685 state = pin_sb_for_writeback(wbc, sb);
686 553
687 if (state == SB_PIN_FAILED) { 554 if (!pin_sb_for_writeback(sb)) {
688 requeue_io(inode); 555 requeue_io(inode);
689 continue; 556 continue;
690 } 557 }
691 ret = writeback_sb_inodes(sb, wb, wbc); 558 ret = writeback_sb_inodes(sb, wb, wbc, false);
559 drop_super(sb);
692 560
693 if (state == SB_PINNED)
694 unpin_sb_for_writeback(sb);
695 if (ret) 561 if (ret)
696 break; 562 break;
697 } 563 }
@@ -699,11 +565,16 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
699 /* Leave any unwritten inodes on b_io */ 565 /* Leave any unwritten inodes on b_io */
700} 566}
701 567
702void writeback_inodes_wbc(struct writeback_control *wbc) 568static void __writeback_inodes_sb(struct super_block *sb,
569 struct bdi_writeback *wb, struct writeback_control *wbc)
703{ 570{
704 struct backing_dev_info *bdi = wbc->bdi; 571 WARN_ON(!rwsem_is_locked(&sb->s_umount));
705 572
706 writeback_inodes_wb(&bdi->wb, wbc); 573 spin_lock(&inode_lock);
574 if (!wbc->for_kupdate || list_empty(&wb->b_io))
575 queue_io(wb, wbc->older_than_this);
576 writeback_sb_inodes(sb, wb, wbc, true);
577 spin_unlock(&inode_lock);
707} 578}
708 579
709/* 580/*
@@ -741,16 +612,14 @@ static inline bool over_bground_thresh(void)
741 * all dirty pages if they are all attached to "old" mappings. 612 * all dirty pages if they are all attached to "old" mappings.
742 */ 613 */
743static long wb_writeback(struct bdi_writeback *wb, 614static long wb_writeback(struct bdi_writeback *wb,
744 struct wb_writeback_args *args) 615 struct wb_writeback_work *work)
745{ 616{
746 struct writeback_control wbc = { 617 struct writeback_control wbc = {
747 .bdi = wb->bdi, 618 .sync_mode = work->sync_mode,
748 .sb = args->sb,
749 .sync_mode = args->sync_mode,
750 .older_than_this = NULL, 619 .older_than_this = NULL,
751 .for_kupdate = args->for_kupdate, 620 .for_kupdate = work->for_kupdate,
752 .for_background = args->for_background, 621 .for_background = work->for_background,
753 .range_cyclic = args->range_cyclic, 622 .range_cyclic = work->range_cyclic,
754 }; 623 };
755 unsigned long oldest_jif; 624 unsigned long oldest_jif;
756 long wrote = 0; 625 long wrote = 0;
@@ -766,25 +635,33 @@ static long wb_writeback(struct bdi_writeback *wb,
766 wbc.range_end = LLONG_MAX; 635 wbc.range_end = LLONG_MAX;
767 } 636 }
768 637
638 wbc.wb_start = jiffies; /* livelock avoidance */
769 for (;;) { 639 for (;;) {
770 /* 640 /*
771 * Stop writeback when nr_pages has been consumed 641 * Stop writeback when nr_pages has been consumed
772 */ 642 */
773 if (args->nr_pages <= 0) 643 if (work->nr_pages <= 0)
774 break; 644 break;
775 645
776 /* 646 /*
777 * For background writeout, stop when we are below the 647 * For background writeout, stop when we are below the
778 * background dirty threshold 648 * background dirty threshold
779 */ 649 */
780 if (args->for_background && !over_bground_thresh()) 650 if (work->for_background && !over_bground_thresh())
781 break; 651 break;
782 652
783 wbc.more_io = 0; 653 wbc.more_io = 0;
784 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 654 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
785 wbc.pages_skipped = 0; 655 wbc.pages_skipped = 0;
786 writeback_inodes_wb(wb, &wbc); 656
787 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 657 trace_wbc_writeback_start(&wbc, wb->bdi);
658 if (work->sb)
659 __writeback_inodes_sb(work->sb, wb, &wbc);
660 else
661 writeback_inodes_wb(wb, &wbc);
662 trace_wbc_writeback_written(&wbc, wb->bdi);
663
664 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
788 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 665 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
789 666
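The bookkeeping above is easier to read with a concrete number. MAX_WRITEBACK_PAGES is 1024 in this file, so a pass that comes back with wbc.nr_to_write == 200 accounts for 824 written pages (the 200 is only an example value):

	long written = MAX_WRITEBACK_PAGES - wbc.nr_to_write;	/* 1024 - 200 = 824 */

	work->nr_pages -= written;	/* budget left on this work item */
	wrote += written;		/* total reported back to wb_do_writeback() */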
790 /* 667 /*
@@ -811,6 +688,7 @@ static long wb_writeback(struct bdi_writeback *wb,
811 if (!list_empty(&wb->b_more_io)) { 688 if (!list_empty(&wb->b_more_io)) {
812 inode = list_entry(wb->b_more_io.prev, 689 inode = list_entry(wb->b_more_io.prev,
813 struct inode, i_list); 690 struct inode, i_list);
691 trace_wbc_writeback_wait(&wbc, wb->bdi);
814 inode_wait_for_writeback(inode); 692 inode_wait_for_writeback(inode);
815 } 693 }
816 spin_unlock(&inode_lock); 694 spin_unlock(&inode_lock);
@@ -820,31 +698,21 @@ static long wb_writeback(struct bdi_writeback *wb,
820} 698}
821 699
822/* 700/*
823 * Return the next bdi_work struct that hasn't been processed by this 701 * Return the next wb_writeback_work struct that hasn't been processed yet.
824 * wb thread yet. ->seen is initially set for each thread that exists
825 * for this device, when a thread first notices a piece of work it
826 * clears its bit. Depending on writeback type, the thread will notify
827 * completion on either receiving the work (WB_SYNC_NONE) or after
828 * it is done (WB_SYNC_ALL).
829 */ 702 */
830static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, 703static struct wb_writeback_work *
831 struct bdi_writeback *wb) 704get_next_work_item(struct backing_dev_info *bdi)
832{ 705{
833 struct bdi_work *work, *ret = NULL; 706 struct wb_writeback_work *work = NULL;
834
835 rcu_read_lock();
836
837 list_for_each_entry_rcu(work, &bdi->work_list, list) {
838 if (!test_bit(wb->nr, &work->seen))
839 continue;
840 clear_bit(wb->nr, &work->seen);
841 707
842 ret = work; 708 spin_lock_bh(&bdi->wb_lock);
843 break; 709 if (!list_empty(&bdi->work_list)) {
710 work = list_entry(bdi->work_list.next,
711 struct wb_writeback_work, list);
712 list_del_init(&work->list);
844 } 713 }
845 714 spin_unlock_bh(&bdi->wb_lock);
846 rcu_read_unlock(); 715 return work;
847 return ret;
848} 716}
849 717
850static long wb_check_old_data_flush(struct bdi_writeback *wb) 718static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -852,6 +720,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
852 unsigned long expired; 720 unsigned long expired;
853 long nr_pages; 721 long nr_pages;
854 722
723 /*
724 * When set to zero, disable periodic writeback
725 */
726 if (!dirty_writeback_interval)
727 return 0;
728
855 expired = wb->last_old_flush + 729 expired = wb->last_old_flush +
856 msecs_to_jiffies(dirty_writeback_interval * 10); 730 msecs_to_jiffies(dirty_writeback_interval * 10);
857 if (time_before(jiffies, expired)) 731 if (time_before(jiffies, expired))
@@ -863,14 +737,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
863 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 737 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
864 738
865 if (nr_pages) { 739 if (nr_pages) {
866 struct wb_writeback_args args = { 740 struct wb_writeback_work work = {
867 .nr_pages = nr_pages, 741 .nr_pages = nr_pages,
868 .sync_mode = WB_SYNC_NONE, 742 .sync_mode = WB_SYNC_NONE,
869 .for_kupdate = 1, 743 .for_kupdate = 1,
870 .range_cyclic = 1, 744 .range_cyclic = 1,
871 }; 745 };
872 746
873 return wb_writeback(wb, &args); 747 return wb_writeback(wb, &work);
874 } 748 }
875 749
876 return 0; 750 return 0;
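dirty_writeback_interval is expressed in centiseconds (it backs /proc/sys/vm/dirty_writeback_centisecs, default 500), hence the multiplication by 10 before converting to jiffies; with the new check, writing 0 to that sysctl now disables the kupdate-style periodic flush entirely:

	/* default 500 centiseconds: 500 * 10 = 5000 ms between periodic flushes */
	unsigned long interval = msecs_to_jiffies(dirty_writeback_interval * 10);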
@@ -882,33 +756,29 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
882long wb_do_writeback(struct bdi_writeback *wb, int force_wait) 756long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
883{ 757{
884 struct backing_dev_info *bdi = wb->bdi; 758 struct backing_dev_info *bdi = wb->bdi;
885 struct bdi_work *work; 759 struct wb_writeback_work *work;
886 long wrote = 0; 760 long wrote = 0;
887 761
888 while ((work = get_next_work_item(bdi, wb)) != NULL) { 762 while ((work = get_next_work_item(bdi)) != NULL) {
889 struct wb_writeback_args args = work->args;
890
891 /* 763 /*
892 * Override sync mode, in case we must wait for completion 764 * Override sync mode, in case we must wait for completion
765 * because this thread is exiting now.
893 */ 766 */
894 if (force_wait) 767 if (force_wait)
895 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 768 work->sync_mode = WB_SYNC_ALL;
896 769
897 /* 770 trace_writeback_exec(bdi, work);
898 * If this isn't a data integrity operation, just notify
899 * that we have seen this work and we are now starting it.
900 */
901 if (args.sync_mode == WB_SYNC_NONE)
902 wb_clear_pending(wb, work);
903 771
904 wrote += wb_writeback(wb, &args); 772 wrote += wb_writeback(wb, work);
905 773
906 /* 774 /*
907 * This is a data integrity writeback, so only do the 775 * Notify the caller of completion if this is a synchronous
908 * notification when we have completed the work. 776 * work item, otherwise just free it.
909 */ 777 */
910 if (args.sync_mode == WB_SYNC_ALL) 778 if (work->done)
911 wb_clear_pending(wb, work); 779 complete(work->done);
780 else
781 kfree(work);
912 } 782 }
913 783
914 /* 784 /*
@@ -923,75 +793,88 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
923 * Handle writeback of dirty data for the device backed by this bdi. Also 793 * Handle writeback of dirty data for the device backed by this bdi. Also
924 * wakes up periodically and does kupdated style flushing. 794 * wakes up periodically and does kupdated style flushing.
925 */ 795 */
926int bdi_writeback_task(struct bdi_writeback *wb) 796int bdi_writeback_thread(void *data)
927{ 797{
928 unsigned long last_active = jiffies; 798 struct bdi_writeback *wb = data;
929 unsigned long wait_jiffies = -1UL; 799 struct backing_dev_info *bdi = wb->bdi;
930 long pages_written; 800 long pages_written;
931 801
802 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
803 set_freezable();
804 wb->last_active = jiffies;
805
806 /*
807 * Our parent may run at a different priority, just set us to normal
808 */
809 set_user_nice(current, 0);
810
811 trace_writeback_thread_start(bdi);
812
932 while (!kthread_should_stop()) { 813 while (!kthread_should_stop()) {
814 /*
815 * Remove own delayed wake-up timer, since we are already awake
 816 * and we'll take care of the periodic write-back.
817 */
818 del_timer(&wb->wakeup_timer);
819
933 pages_written = wb_do_writeback(wb, 0); 820 pages_written = wb_do_writeback(wb, 0);
934 821
822 trace_writeback_pages_written(pages_written);
823
935 if (pages_written) 824 if (pages_written)
936 last_active = jiffies; 825 wb->last_active = jiffies;
937 else if (wait_jiffies != -1UL) { 826
938 unsigned long max_idle; 827 set_current_state(TASK_INTERRUPTIBLE);
828 if (!list_empty(&bdi->work_list)) {
829 __set_current_state(TASK_RUNNING);
830 continue;
831 }
939 832
833 if (wb_has_dirty_io(wb) && dirty_writeback_interval)
834 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
835 else {
940 /* 836 /*
941 * Longest period of inactivity that we tolerate. If we 837 * We have nothing to do, so can go sleep without any
942 * see dirty data again later, the task will get 838 * timeout and save power. When a work is queued or
943 * recreated automatically. 839 * something is made dirty - we will be woken up.
944 */ 840 */
945 max_idle = max(5UL * 60 * HZ, wait_jiffies); 841 schedule();
946 if (time_after(jiffies, max_idle + last_active))
947 break;
948 } 842 }
949 843
950 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
951 schedule_timeout_interruptible(wait_jiffies);
952 try_to_freeze(); 844 try_to_freeze();
953 } 845 }
954 846
847 /* Flush any work that raced with us exiting */
848 if (!list_empty(&bdi->work_list))
849 wb_do_writeback(wb, 1);
850
851 trace_writeback_thread_stop(bdi);
955 return 0; 852 return 0;
956} 853}
957 854
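The flusher is now an ordinary kthread, created on demand by the forker thread in mm/backing-dev.c, which bdi_queue_work() earlier in this patch wakes whenever bdi->wb.task is NULL. Roughly, and only as a sketch of that other file from the same series, the forker starts it like this:

	struct task_struct *task;

	task = kthread_run(bdi_writeback_thread, &bdi->wb,
			   "flush-%s", dev_name(bdi->dev));
	if (!IS_ERR(task))
		bdi->wb.task = task;	/* otherwise the forker retries later */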
855
958/* 856/*
959 * Schedule writeback for all backing devices. This does WB_SYNC_NONE 857 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
960 * writeback, for integrity writeback see bdi_sync_writeback(). 858 * the whole world.
961 */ 859 */
962static void bdi_writeback_all(struct super_block *sb, long nr_pages) 860void wakeup_flusher_threads(long nr_pages)
963{ 861{
964 struct wb_writeback_args args = {
965 .sb = sb,
966 .nr_pages = nr_pages,
967 .sync_mode = WB_SYNC_NONE,
968 };
969 struct backing_dev_info *bdi; 862 struct backing_dev_info *bdi;
970 863
971 rcu_read_lock(); 864 if (!nr_pages) {
865 nr_pages = global_page_state(NR_FILE_DIRTY) +
866 global_page_state(NR_UNSTABLE_NFS);
867 }
972 868
869 rcu_read_lock();
973 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 870 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
974 if (!bdi_has_dirty_io(bdi)) 871 if (!bdi_has_dirty_io(bdi))
975 continue; 872 continue;
976 873 __bdi_start_writeback(bdi, nr_pages, false, false);
977 bdi_alloc_queue_work(bdi, &args);
978 } 874 }
979
980 rcu_read_unlock(); 875 rcu_read_unlock();
981} 876}
982 877
983/*
984 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
985 * the whole world.
986 */
987void wakeup_flusher_threads(long nr_pages)
988{
989 if (nr_pages == 0)
990 nr_pages = global_page_state(NR_FILE_DIRTY) +
991 global_page_state(NR_UNSTABLE_NFS);
992 bdi_writeback_all(NULL, nr_pages);
993}
994
995static noinline void block_dump___mark_inode_dirty(struct inode *inode) 878static noinline void block_dump___mark_inode_dirty(struct inode *inode)
996{ 879{
997 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 880 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1044,6 +927,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1044void __mark_inode_dirty(struct inode *inode, int flags) 927void __mark_inode_dirty(struct inode *inode, int flags)
1045{ 928{
1046 struct super_block *sb = inode->i_sb; 929 struct super_block *sb = inode->i_sb;
930 struct backing_dev_info *bdi = NULL;
931 bool wakeup_bdi = false;
1047 932
1048 /* 933 /*
1049 * Don't do this for I_DIRTY_PAGES - that doesn't actually 934 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1089,7 +974,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1089 if (hlist_unhashed(&inode->i_hash)) 974 if (hlist_unhashed(&inode->i_hash))
1090 goto out; 975 goto out;
1091 } 976 }
1092 if (inode->i_state & (I_FREEING|I_CLEAR)) 977 if (inode->i_state & I_FREEING)
1093 goto out; 978 goto out;
1094 979
1095 /* 980 /*
@@ -1097,22 +982,31 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1097 * reposition it (that would break b_dirty time-ordering). 982 * reposition it (that would break b_dirty time-ordering).
1098 */ 983 */
1099 if (!was_dirty) { 984 if (!was_dirty) {
1100 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 985 bdi = inode_to_bdi(inode);
1101 struct backing_dev_info *bdi = wb->bdi; 986
1102 987 if (bdi_cap_writeback_dirty(bdi)) {
1103 if (bdi_cap_writeback_dirty(bdi) && 988 WARN(!test_bit(BDI_registered, &bdi->state),
1104 !test_bit(BDI_registered, &bdi->state)) { 989 "bdi-%s not registered\n", bdi->name);
1105 WARN_ON(1); 990
1106 printk(KERN_ERR "bdi-%s not registered\n", 991 /*
1107 bdi->name); 992 * If this is the first dirty inode for this
993 * bdi, we have to wake-up the corresponding
994 * bdi thread to make sure background
995 * write-back happens later.
996 */
997 if (!wb_has_dirty_io(&bdi->wb))
998 wakeup_bdi = true;
1108 } 999 }
1109 1000
1110 inode->dirtied_when = jiffies; 1001 inode->dirtied_when = jiffies;
1111 list_move(&inode->i_list, &wb->b_dirty); 1002 list_move(&inode->i_list, &bdi->wb.b_dirty);
1112 } 1003 }
1113 } 1004 }
1114out: 1005out:
1115 spin_unlock(&inode_lock); 1006 spin_unlock(&inode_lock);
1007
1008 if (wakeup_bdi)
1009 bdi_wakeup_thread_delayed(bdi);
1116} 1010}
1117EXPORT_SYMBOL(__mark_inode_dirty); 1011EXPORT_SYMBOL(__mark_inode_dirty);
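The wakeup_bdi path above fires only for the first dirty inode on an otherwise clean bdi, and the wake-up is issued after inode_lock has been dropped. bdi_wakeup_thread_delayed() lives in mm/backing-dev.c in the same series and defers the wake-up through wb.wakeup_timer (the timer that bdi_writeback_thread() deletes on each iteration), giving dirtiers a chance to batch up. A hedged sketch of what that helper amounts to:

	/* roughly: arm the per-bdi timer instead of waking the thread right away */
	mod_timer(&bdi->wb.wakeup_timer,
		  jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));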
1118 1012
@@ -1155,7 +1049,7 @@ static void wait_sb_inodes(struct super_block *sb)
1155 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1049 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1156 struct address_space *mapping; 1050 struct address_space *mapping;
1157 1051
1158 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 1052 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
1159 continue; 1053 continue;
1160 mapping = inode->i_mapping; 1054 mapping = inode->i_mapping;
1161 if (mapping->nrpages == 0) 1055 if (mapping->nrpages == 0)
@@ -1196,12 +1090,20 @@ void writeback_inodes_sb(struct super_block *sb)
1196{ 1090{
1197 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1091 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1198 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1092 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1199 long nr_to_write; 1093 DECLARE_COMPLETION_ONSTACK(done);
1094 struct wb_writeback_work work = {
1095 .sb = sb,
1096 .sync_mode = WB_SYNC_NONE,
1097 .done = &done,
1098 };
1099
1100 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1200 1101
1201 nr_to_write = nr_dirty + nr_unstable + 1102 work.nr_pages = nr_dirty + nr_unstable +
1202 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1103 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1203 1104
1204 bdi_start_writeback(sb->s_bdi, sb, nr_to_write); 1105 bdi_queue_work(sb->s_bdi, &work);
1106 wait_for_completion(&done);
1205} 1107}
1206EXPORT_SYMBOL(writeback_inodes_sb); 1108EXPORT_SYMBOL(writeback_inodes_sb);
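writeback_inodes_sb() now insists (via the WARN_ON above) that the caller already holds s_umount for reading, since the queued work dereferences the superblock until the completion fires. The expected caller pattern is exactly what the next hunk adds to writeback_inodes_sb_if_idle():

	down_read(&sb->s_umount);
	writeback_inodes_sb(sb);
	up_read(&sb->s_umount);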
1207 1109
@@ -1215,7 +1117,9 @@ EXPORT_SYMBOL(writeback_inodes_sb);
1215int writeback_inodes_sb_if_idle(struct super_block *sb) 1117int writeback_inodes_sb_if_idle(struct super_block *sb)
1216{ 1118{
1217 if (!writeback_in_progress(sb->s_bdi)) { 1119 if (!writeback_in_progress(sb->s_bdi)) {
1120 down_read(&sb->s_umount);
1218 writeback_inodes_sb(sb); 1121 writeback_inodes_sb(sb);
1122 up_read(&sb->s_umount);
1219 return 1; 1123 return 1;
1220 } else 1124 } else
1221 return 0; 1125 return 0;
@@ -1231,7 +1135,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1231 */ 1135 */
1232void sync_inodes_sb(struct super_block *sb) 1136void sync_inodes_sb(struct super_block *sb)
1233{ 1137{
1234 bdi_sync_writeback(sb->s_bdi, sb); 1138 DECLARE_COMPLETION_ONSTACK(done);
1139 struct wb_writeback_work work = {
1140 .sb = sb,
1141 .sync_mode = WB_SYNC_ALL,
1142 .nr_pages = LONG_MAX,
1143 .range_cyclic = 0,
1144 .done = &done,
1145 };
1146
1147 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1148
1149 bdi_queue_work(sb->s_bdi, &work);
1150 wait_for_completion(&done);
1151
1235 wait_sb_inodes(sb); 1152 wait_sb_inodes(sb);
1236} 1153}
1237EXPORT_SYMBOL(sync_inodes_sb); 1154EXPORT_SYMBOL(sync_inodes_sb);
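Taken together, sync_inodes_sb() now queues a blocking WB_SYNC_ALL work item, waits for the flusher thread to complete it, and only then waits for per-page writeback in wait_sb_inodes(). A hedged sketch of a typical caller, in the spirit of the blocking half of sync(2) (the exact fs/sync.c code is not part of this diff):

	down_read(&sb->s_umount);
	if (sb->s_root)
		sync_inodes_sb(sb);	/* queue + wait_for_completion() + wait_sb_inodes() */
	up_read(&sb->s_umount);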