Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--	fs/fs-writeback.c	341
1 file changed, 203 insertions(+), 138 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d5be1693ac93..aed881a76b22 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,15 +26,9 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
+#include <linux/tracepoint.h>
 #include "internal.h"
 
-#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
-
-/*
- * We don't actually have pdflush, but this one is exported through /proc...
- */
-int nr_pdflush_threads;
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -50,6 +44,19 @@ struct wb_writeback_work {
 	struct completion *done;	/* set if the caller waits */
 };
 
+/*
+ * Include the creation of the trace points after defining the
+ * wb_writeback_work structure so that the definition remains local to this
+ * file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
+/*
+ * We don't actually have pdflush, but this one is exported through /proc...
+ */
+int nr_pdflush_threads;
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
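The CREATE_TRACE_POINTS block added above follows the usual tracepoint convention: exactly one compilation unit defines CREATE_TRACE_POINTS immediately before including the trace header (so the event bodies are emitted there), while every other user just includes the header and calls the generated trace_*() helpers. A minimal sketch of that convention, using a hypothetical header and event name rather than the real trace/events/writeback.h events:

	/* fs/foo.c - the single file that instantiates the events */
	#define CREATE_TRACE_POINTS
	#include <trace/events/foo.h>	/* hypothetical TRACE_EVENT() definitions */

	void foo_queue_work(struct foo_work *work)	/* foo_work is illustrative only */
	{
		trace_foo_queue(work);	/* helper generated by the header above */
	}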
@@ -59,28 +66,42 @@ struct wb_writeback_work {
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return !list_empty(&bdi->work_list);
+	return test_bit(BDI_writeback_running, &bdi->state);
+}
+
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (strcmp(sb->s_type->name, "bdev") == 0)
+		return inode->i_mapping->backing_dev_info;
+
+	return sb->s_bdi;
+}
+
+static inline struct inode *wb_inode(struct list_head *head)
+{
+	return list_entry(head, struct inode, i_wb_list);
 }
 
 static void bdi_queue_work(struct backing_dev_info *bdi,
 			   struct wb_writeback_work *work)
 {
-	spin_lock(&bdi->wb_lock);
-	list_add_tail(&work->list, &bdi->work_list);
-	spin_unlock(&bdi->wb_lock);
+	trace_writeback_queue(bdi, work);
 
-	/*
-	 * If the default thread isn't there, make sure we add it. When
-	 * it gets created and wakes up, we'll run this work.
-	 */
-	if (unlikely(list_empty_careful(&bdi->wb_list)))
+	spin_lock_bh(&bdi->wb_lock);
+	list_add_tail(&work->list, &bdi->work_list);
+	if (bdi->wb.task) {
+		wake_up_process(bdi->wb.task);
+	} else {
+		/*
+		 * The bdi thread isn't there, wake up the forker thread which
+		 * will create and run it.
+		 */
+		trace_writeback_nothread(bdi, work);
 		wake_up_process(default_backing_dev_info.wb.task);
-	else {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		if (wb->task)
-			wake_up_process(wb->task);
 	}
+	spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
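wb_inode(), introduced above, is the standard list_entry()/container_of() idiom: given a pointer to the list_head embedded in an inode (i_wb_list), recover the enclosing inode. A self-contained userspace sketch of the same idiom, using made-up names (my_item, link) rather than the kernel types:

	#include <stddef.h>

	struct list_head { struct list_head *next, *prev; };

	struct my_item {
		int		 value;
		struct list_head link;		/* embedded list linkage */
	};

	/* walk back from the embedded member to the enclosing structure */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	static inline struct my_item *item_of(struct list_head *head)
	{
		return container_of(head, struct my_item, link);
	}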
@@ -95,8 +116,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		if (bdi->wb.task)
+		if (bdi->wb.task) {
+			trace_writeback_nowork(bdi);
 			wake_up_process(bdi->wb.task);
+		}
 		return;
 	}
 
@@ -154,11 +177,11 @@ static void redirty_tail(struct inode *inode)
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
-		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+		tail = wb_inode(wb->b_dirty.next);
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &wb->b_dirty);
+	list_move(&inode->i_wb_list, &wb->b_dirty);
 }
 
 /*
@@ -168,7 +191,7 @@ static void requeue_io(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
-	list_move(&inode->i_list, &wb->b_more_io);
+	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -209,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 
 	while (!list_empty(delaying_queue)) {
-		inode = list_entry(delaying_queue->prev, struct inode, i_list);
+		inode = wb_inode(delaying_queue->prev);
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
 		if (sb && sb != inode->i_sb)
 			do_sb_sort = 1;
 		sb = inode->i_sb;
-		list_move(&inode->i_list, &tmp);
+		list_move(&inode->i_wb_list, &tmp);
 	}
 
 	/* just one sb in list, splice to dispatch_queue and we're done */
@@ -227,22 +250,29 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 
 	/* Move inodes from one superblock together */
 	while (!list_empty(&tmp)) {
-		inode = list_entry(tmp.prev, struct inode, i_list);
-		sb = inode->i_sb;
+		sb = wb_inode(tmp.prev)->i_sb;
 		list_for_each_prev_safe(pos, node, &tmp) {
-			inode = list_entry(pos, struct inode, i_list);
+			inode = wb_inode(pos);
 			if (inode->i_sb == sb)
-				list_move(&inode->i_list, dispatch_queue);
+				list_move(&inode->i_wb_list, dispatch_queue);
 		}
 	}
 }
 
 /*
  * Queue all expired dirty inodes for io, eldest first.
+ * Before
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    gf         edc     BA
+ * After
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    g          fBAedc
+ *                                           |
+ *                                           +--> dequeue for IO
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	list_splice_init(&wb->b_more_io, wb->b_io.prev);
+	list_splice_init(&wb->b_more_io, &wb->b_io);
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
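The one-line change in queue_io() above is about where b_more_io gets spliced: list_splice_init(list, head) inserts the entries right after head, i.e. at the front of the list, whereas the old list_splice_init(list, head->prev) inserted them just before head, i.e. at the back. An illustrative comparison of the two call forms (positions only, not a claim about which inodes are written first):

	list_splice_init(&wb->b_more_io, &wb->b_io);		/* new: splice at the front of b_io */
	list_splice_init(&wb->b_more_io, wb->b_io.prev);	/* old: splice at the back of b_io */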
@@ -352,73 +382,43 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_SYNC;
-	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
-		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
-			/*
-			 * More pages get dirtied by a fast dirtier.
-			 */
-			goto select_queue;
-		} else if (inode->i_state & I_DIRTY) {
-			/*
-			 * At least XFS will redirty the inode during the
-			 * writeback (delalloc) and on io completion (isize).
-			 */
-			redirty_tail(inode);
-		} else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+	if (!(inode->i_state & I_FREEING)) {
+		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/*
 			 * We didn't write back all the pages. nfs_writepages()
-			 * sometimes bales out without doing anything. Redirty
-			 * the inode; Move it from b_io onto b_more_io/b_dirty.
+			 * sometimes bales out without doing anything.
 			 */
-			/*
-			 * akpm: if the caller was the kupdate function we put
-			 * this inode at the head of b_dirty so it gets first
-			 * consideration. Otherwise, move it to the tail, for
-			 * the reasons described there. I'm not really sure
-			 * how much sense this makes. Presumably I had a good
-			 * reasons for doing it this way, and I'd rather not
-			 * muck with it at present.
-			 */
-			if (wbc->for_kupdate) {
+			inode->i_state |= I_DIRTY_PAGES;
+			if (wbc->nr_to_write <= 0) {
 				/*
-				 * For the kupdate function we move the inode
-				 * to b_more_io so it will get more writeout as
-				 * soon as the queue becomes uncongested.
+				 * slice used up: queue for next turn
 				 */
-				inode->i_state |= I_DIRTY_PAGES;
-select_queue:
-				if (wbc->nr_to_write <= 0) {
-					/*
-					 * slice used up: queue for next turn
-					 */
-					requeue_io(inode);
-				} else {
-					/*
-					 * somehow blocked: retry later
-					 */
-					redirty_tail(inode);
-				}
+				requeue_io(inode);
 			} else {
 				/*
-				 * Otherwise fully redirty the inode so that
-				 * other inodes on this superblock will get some
-				 * writeout. Otherwise heavy writing to one
-				 * file would indefinitely suspend writeout of
-				 * all the other files.
+				 * Writeback blocked by something other than
+				 * congestion. Delay the inode for some time to
+				 * avoid spinning on the CPU (100% iowait)
+				 * retrying writeback of the dirty page/inode
+				 * that cannot be performed immediately.
 				 */
-				inode->i_state |= I_DIRTY_PAGES;
 				redirty_tail(inode);
 			}
-		} else if (atomic_read(&inode->i_count)) {
+		} else if (inode->i_state & I_DIRTY) {
 			/*
-			 * The inode is clean, inuse
+			 * Filesystems can dirty the inode during writeback
+			 * operations, such as delayed allocation during
+			 * submission or metadata updates after data IO
+			 * completion.
 			 */
-			list_move(&inode->i_list, &inode_in_use);
+			redirty_tail(inode);
 		} else {
 			/*
-			 * The inode is clean, unused
+			 * The inode is clean. At this point we either have
+			 * a reference to the inode or it's on its way out.
+			 * No need to add it back to the LRU.
 			 */
-			list_move(&inode->i_list, &inode_unused);
+			list_del_init(&inode->i_wb_list);
 		}
 	}
 	inode_sync_complete(inode);
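For readers tracing the rewritten tail of writeback_single_inode(), the new logic reduces to a short decision tree. The fragment below is an illustrative paraphrase of the code in the hunk above, not an additional change:

	/* I_SYNC cleared, inode_lock held, and I_FREEING not set: */
	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
		inode->i_state |= I_DIRTY_PAGES;
		if (wbc->nr_to_write <= 0)
			requeue_io(inode);		/* slice used up: next turn */
		else
			redirty_tail(inode);		/* blocked: retry after a delay */
	} else if (inode->i_state & I_DIRTY) {
		redirty_tail(inode);			/* redirtied during writeback */
	} else {
		list_del_init(&inode->i_wb_list);	/* clean: drop from the writeback lists */
	}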
@@ -466,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 {
 	while (!list_empty(&wb->b_io)) {
 		long pages_skipped;
-		struct inode *inode = list_entry(wb->b_io.prev,
-						 struct inode, i_list);
+		struct inode *inode = wb_inode(wb->b_io.prev);
 
 		if (inode->i_sb != sb) {
 			if (only_this_sb) {
@@ -488,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			return 0;
 		}
 
-		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+		/*
+		 * Don't bother with new inodes or inodes being freed, first
+		 * kind does not need periodic writeout yet, and for the latter
+		 * kind writeout is handled by the freer.
+		 */
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
+
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
@@ -499,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		if (inode_dirtied_after(inode, wbc->wb_start))
 			return 1;
 
-		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
@@ -530,14 +534,14 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 {
 	int ret = 0;
 
-	wbc->wb_start = jiffies; /* livelock avoidance */
+	if (!wbc->wb_start)
+		wbc->wb_start = jiffies; /* livelock avoidance */
 	spin_lock(&inode_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 
 	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = list_entry(wb->b_io.prev,
-						 struct inode, i_list);
+		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct super_block *sb = inode->i_sb;
 
 		if (!pin_sb_for_writeback(sb)) {
@@ -559,7 +563,6 @@ static void __writeback_inodes_sb(struct super_block *sb,
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	wbc->wb_start = jiffies; /* livelock avoidance */
 	spin_lock(&inode_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
@@ -580,10 +583,10 @@ static inline bool over_bground_thresh(void)
 {
 	unsigned long background_thresh, dirty_thresh;
 
-	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+	global_dirty_limits(&background_thresh, &dirty_thresh);
 
 	return (global_page_state(NR_FILE_DIRTY) +
-		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+		global_page_state(NR_UNSTABLE_NFS) > background_thresh);
 }
 
 /*
@@ -625,6 +628,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.range_end = LLONG_MAX;
 	}
 
+	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
@@ -642,10 +646,14 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
+
+		trace_wbc_writeback_start(&wbc, wb->bdi);
 		if (work->sb)
 			__writeback_inodes_sb(work->sb, wb, &wbc);
 		else
 			writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc, wb->bdi);
+
 		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
@@ -671,8 +679,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 */
 		spin_lock(&inode_lock);
 		if (!list_empty(&wb->b_more_io)) {
-			inode = list_entry(wb->b_more_io.prev,
-					   struct inode, i_list);
+			inode = wb_inode(wb->b_more_io.prev);
+			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
@@ -685,17 +693,17 @@
  * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
 static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+get_next_work_item(struct backing_dev_info *bdi)
 {
 	struct wb_writeback_work *work = NULL;
 
-	spin_lock(&bdi->wb_lock);
+	spin_lock_bh(&bdi->wb_lock);
 	if (!list_empty(&bdi->work_list)) {
 		work = list_entry(bdi->work_list.next,
 				  struct wb_writeback_work, list);
 		list_del_init(&work->list);
 	}
-	spin_unlock(&bdi->wb_lock);
+	spin_unlock_bh(&bdi->wb_lock);
 	return work;
 }
 
@@ -716,9 +724,13 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 		return 0;
 
 	wb->last_old_flush = jiffies;
+	/*
+	 * Add in the number of potentially dirty inodes, because each inode
+	 * write can dirty pagecache in the underlying blockdev.
+	 */
 	nr_pages = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+			get_nr_dirty_inodes();
 
 	if (nr_pages) {
 		struct wb_writeback_work work = {
@@ -743,7 +755,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 	struct wb_writeback_work *work;
 	long wrote = 0;
 
-	while ((work = get_next_work_item(bdi, wb)) != NULL) {
+	set_bit(BDI_writeback_running, &wb->bdi->state);
+	while ((work = get_next_work_item(bdi)) != NULL) {
 		/*
 		 * Override sync mode, in case we must wait for completion
 		 * because this thread is exiting now.
@@ -751,6 +764,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->sync_mode = WB_SYNC_ALL;
 
+		trace_writeback_exec(bdi, work);
+
 		wrote += wb_writeback(wb, work);
 
 		/*
@@ -767,6 +782,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
+	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
 }
@@ -775,47 +791,66 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * wakes up periodically and does kupdated style flushing.
  */
-int bdi_writeback_task(struct bdi_writeback *wb)
+int bdi_writeback_thread(void *data)
 {
-	unsigned long last_active = jiffies;
-	unsigned long wait_jiffies = -1UL;
+	struct bdi_writeback *wb = data;
+	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
+	current->flags |= PF_SWAPWRITE;
+	set_freezable();
+	wb->last_active = jiffies;
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
+
+	trace_writeback_thread_start(bdi);
+
 	while (!kthread_should_stop()) {
+		/*
+		 * Remove own delayed wake-up timer, since we are already awake
+		 * and we'll take care of the periodic write-back.
+		 */
+		del_timer(&wb->wakeup_timer);
+
 		pages_written = wb_do_writeback(wb, 0);
 
+		trace_writeback_pages_written(pages_written);
+
 		if (pages_written)
-			last_active = jiffies;
-		else if (wait_jiffies != -1UL) {
-			unsigned long max_idle;
+			wb->last_active = jiffies;
 
-			/*
-			 * Longest period of inactivity that we tolerate. If we
-			 * see dirty data again later, the task will get
-			 * recreated automatically.
-			 */
-			max_idle = max(5UL * 60 * HZ, wait_jiffies);
-			if (time_after(jiffies, max_idle + last_active))
-				break;
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			continue;
 		}
 
-		if (dirty_writeback_interval) {
-			wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-			schedule_timeout_interruptible(wait_jiffies);
-		} else {
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (list_empty_careful(&wb->bdi->work_list) &&
-			    !kthread_should_stop())
-				schedule();
-			__set_current_state(TASK_RUNNING);
+		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		else {
+			/*
+			 * We have nothing to do, so can go sleep without any
+			 * timeout and save power. When a work is queued or
+			 * something is made dirty - we will be woken up.
+			 */
+			schedule();
 		}
 
 		try_to_freeze();
 	}
 
+	/* Flush any work that raced with us exiting */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	trace_writeback_thread_stop(bdi);
 	return 0;
 }
 
+
 /*
  * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
  * the whole world.
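bdi_writeback_thread() above uses the standard kthread sleep/wakeup idiom: set TASK_INTERRUPTIBLE first, re-check the wakeup condition, and only then call schedule(), so a wake_up_process() that arrives between the check and the sleep is not lost. A generic sketch of the pattern, with hypothetical helpers (my_thread, do_pending_work, work_pending_for_me) rather than further writeback code:

	static int my_thread(void *data)
	{
		set_freezable();

		while (!kthread_should_stop()) {
			do_pending_work();

			set_current_state(TASK_INTERRUPTIBLE);	/* announce intent to sleep */
			if (work_pending_for_me() || kthread_should_stop()) {
				__set_current_state(TASK_RUNNING); /* a wakeup raced in: skip the sleep */
				continue;
			}
			schedule();				/* sleep until woken */
			try_to_freeze();
		}
		return 0;
	}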
@@ -890,6 +925,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = NULL;
+	bool wakeup_bdi = false;
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -932,10 +969,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	 * dirty list. Add blockdev inodes as well.
 	 */
 	if (!S_ISBLK(inode->i_mode)) {
-		if (hlist_unhashed(&inode->i_hash))
+		if (inode_unhashed(inode))
 			goto out;
 	}
-	if (inode->i_state & (I_FREEING|I_CLEAR))
+	if (inode->i_state & I_FREEING)
 		goto out;
 
 	/*
@@ -943,22 +980,31 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
-			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-			struct backing_dev_info *bdi = wb->bdi;
-
-			if (bdi_cap_writeback_dirty(bdi) &&
-			    !test_bit(BDI_registered, &bdi->state)) {
-				WARN_ON(1);
-				printk(KERN_ERR "bdi-%s not registered\n",
-								bdi->name);
+			bdi = inode_to_bdi(inode);
+
+			if (bdi_cap_writeback_dirty(bdi)) {
+				WARN(!test_bit(BDI_registered, &bdi->state),
+				     "bdi-%s not registered\n", bdi->name);
+
+				/*
+				 * If this is the first dirty inode for this
+				 * bdi, we have to wake-up the corresponding
+				 * bdi thread to make sure background
+				 * write-back happens later.
+				 */
+				if (!wb_has_dirty_io(&bdi->wb))
+					wakeup_bdi = true;
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &wb->b_dirty);
+			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
 		}
 	}
 out:
 	spin_unlock(&inode_lock);
+
+	if (wakeup_bdi)
+		bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
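One detail in the hunk above is the ordering around inode_lock: whether the flusher needs waking (wakeup_bdi) is decided while the lock is held, but bdi_wakeup_thread_delayed() is only called after the lock has been dropped. A compressed illustration of that shape (not additional kernel code):

	bool wakeup_bdi = false;

	spin_lock(&inode_lock);
	if (!wb_has_dirty_io(&bdi->wb))		/* first dirty inode for this bdi? */
		wakeup_bdi = true;
	/* ... queue the inode on bdi->wb.b_dirty ... */
	spin_unlock(&inode_lock);

	if (wakeup_bdi)				/* kick (or arm the timer for) the bdi thread */
		bdi_wakeup_thread_delayed(bdi);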
@@ -1001,7 +1047,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 		mapping = inode->i_mapping;
 		if (mapping->nrpages == 0)
@@ -1051,8 +1097,7 @@ void writeback_inodes_sb(struct super_block *sb)
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	work.nr_pages = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes();
 
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
@@ -1159,3 +1204,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
+
+/**
+ * sync_inode_metadata - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust its dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+	struct writeback_control wbc = {
+		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.nr_to_write = 0, /* metadata-only */
+	};
+
+	return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
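As a usage illustration of the new helper (a hypothetical caller, not part of this patch): a filesystem fsync path that has already written the data pages can push just the inode itself with sync_inode_metadata():

	/* hypothetical fsync tail: data pages already written, flush the inode itself */
	static int example_fsync_tail(struct inode *inode, int datasync)
	{
		if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
			return 0;			/* nothing a datasync caller cares about */

		return sync_inode_metadata(inode, 1);	/* 1 == wait for the write to complete */
	}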