Diffstat (limited to 'fs/fs-writeback.c')
 -rw-r--r--	fs/fs-writeback.c	373
 1 file changed, 208 insertions, 165 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b8c507ca42f7..1599aa985fe2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,7 +35,9 @@
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
+	unsigned long *older_than_this;
 	enum writeback_sync_modes sync_mode;
+	unsigned int tagged_writepages:1;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
@@ -180,12 +182,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
  */
 void inode_wb_list_del(struct inode *inode)
 {
-	spin_lock(&inode_wb_list_lock);
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+	spin_lock(&bdi->wb.list_lock);
 	list_del_init(&inode->i_wb_list);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&bdi->wb.list_lock);
 }
 
-
 /*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
@@ -195,11 +198,9 @@ void inode_wb_list_del(struct inode *inode)
  * the case then the inode must have been redirtied while it was being written
  * out and we don't reset its dirtied_when.
  */
-static void redirty_tail(struct inode *inode)
+static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 {
-	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
-	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&wb->list_lock);
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
@@ -213,11 +214,9 @@ static void redirty_tail(struct inode *inode)
 /*
  * requeue inode for re-scanning after bdi->b_io list is exhausted.
  */
-static void requeue_io(struct inode *inode)
+static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 {
-	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
-	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&wb->list_lock);
 	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
@@ -225,7 +224,7 @@ static void inode_sync_complete(struct inode *inode)
 {
 	/*
 	 * Prevent speculative execution through
-	 * spin_unlock(&inode_wb_list_lock);
+	 * spin_unlock(&wb->list_lock);
 	 */
 
 	smp_mb();
@@ -250,15 +249,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 /*
  * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
  */
-static void move_expired_inodes(struct list_head *delaying_queue,
+static int move_expired_inodes(struct list_head *delaying_queue,
 			       struct list_head *dispatch_queue,
 			       unsigned long *older_than_this)
 {
 	LIST_HEAD(tmp);
 	struct list_head *pos, *node;
 	struct super_block *sb = NULL;
 	struct inode *inode;
 	int do_sb_sort = 0;
+	int moved = 0;
 
 	while (!list_empty(delaying_queue)) {
 		inode = wb_inode(delaying_queue->prev);
@@ -269,12 +269,13 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 			do_sb_sort = 1;
 		sb = inode->i_sb;
 		list_move(&inode->i_wb_list, &tmp);
+		moved++;
 	}
 
 	/* just one sb in list, splice to dispatch_queue and we're done */
 	if (!do_sb_sort) {
 		list_splice(&tmp, dispatch_queue);
-		return;
+		goto out;
 	}
 
 	/* Move inodes from one superblock together */
@@ -286,6 +287,8 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 				list_move(&inode->i_wb_list, dispatch_queue);
 		}
 	}
+out:
+	return moved;
 }
 
 /*
@@ -301,9 +304,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	assert_spin_locked(&inode_wb_list_lock);
+	int moved;
+	assert_spin_locked(&wb->list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
-	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+	trace_writeback_queue_io(wb, older_than_this, moved);
 }
 
 static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -316,7 +321,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
 /*
  * Wait for writeback on an inode to complete.
  */
-static void inode_wait_for_writeback(struct inode *inode)
+static void inode_wait_for_writeback(struct inode *inode,
+				     struct bdi_writeback *wb)
 {
 	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
 	wait_queue_head_t *wqh;
@@ -324,15 +330,15 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_wb_list_lock);
+		spin_unlock(&wb->list_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-		spin_lock(&inode_wb_list_lock);
+		spin_lock(&wb->list_lock);
 		spin_lock(&inode->i_lock);
 	}
 }
 
 /*
- * Write out an inode's dirty pages. Called under inode_wb_list_lock and
+ * Write out an inode's dirty pages. Called under wb->list_lock and
  * inode->i_lock. Either the caller has an active reference on the inode or
  * the inode has I_WILL_FREE set.
  *
@@ -343,13 +349,15 @@ static void inode_wait_for_writeback(struct inode *inode)
  * livelocks, etc.
  */
 static int
-writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+		       struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
+	long nr_to_write = wbc->nr_to_write;
 	unsigned dirty;
 	int ret;
 
-	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&wb->list_lock);
 	assert_spin_locked(&inode->i_lock);
 
 	if (!atomic_read(&inode->i_count))
@@ -367,14 +375,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		 * completed a full scan of b_io.
 		 */
 		if (wbc->sync_mode != WB_SYNC_ALL) {
-			requeue_io(inode);
+			requeue_io(inode, wb);
+			trace_writeback_single_inode_requeue(inode, wbc,
+							     nr_to_write);
 			return 0;
 		}
 
 		/*
 		 * It's a data-integrity sync. We must wait.
 		 */
-		inode_wait_for_writeback(inode);
+		inode_wait_for_writeback(inode, wb);
 	}
 
 	BUG_ON(inode->i_state & I_SYNC);
@@ -383,7 +393,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	inode->i_state |= I_SYNC;
 	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 
 	ret = do_writepages(mapping, wbc);
 
@@ -414,10 +424,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			ret = err;
 	}
 
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
+		/*
+		 * Sync livelock prevention. Each inode is tagged and synced in
+		 * one shot. If still dirty, it will be redirty_tail()'ed below.
+		 * Update the dirty time to prevent enqueue and sync it again.
+		 */
+		if ((inode->i_state & I_DIRTY) &&
+		    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+			inode->dirtied_when = jiffies;
+
 		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/*
 			 * We didn't write back all the pages. nfs_writepages()
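The block added above is the sync livelock guard: when a data-integrity or tagged sync finds the inode still dirty after one pass, its dirtied_when is bumped to "now" so the same pass's older_than_this cutoff will not queue it again. Below is a minimal userspace model of that interaction, assuming the simplified expiry test move_expired_inodes() applies; the struct and helper names are illustrative stand-ins, not kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for an inode: only the dirty timestamp matters here. */
struct model_inode {
	unsigned long dirtied_when;
};

/* Models the move_expired_inodes() cutoff: an inode is queued for this
 * pass only if it was dirtied at or before *older_than_this. */
static bool model_expired(const struct model_inode *inode,
			  unsigned long older_than_this)
{
	return inode->dirtied_when <= older_than_this;
}

int main(void)
{
	unsigned long jiffies = 1000;		/* pretend tick counter */
	unsigned long oldest_jif = jiffies;	/* cutoff taken at sync start */
	struct model_inode inode = { .dirtied_when = 900 };

	printf("first pass:  expired=%d\n", model_expired(&inode, oldest_jif));

	/* Inode written once but still dirty; the added code re-stamps it for
	 * WB_SYNC_ALL / tagged_writepages before it is redirty_tail()'ed. */
	jiffies += 5;
	inode.dirtied_when = jiffies;

	printf("second pass: expired=%d\n", model_expired(&inode, oldest_jif));
	return 0;
}

With the re-stamp the second line prints expired=0, which is why one tagged sync pass cannot be livelocked by an inode that keeps getting redirtied.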
@@ -428,7 +447,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 				/*
 				 * slice used up: queue for next turn
 				 */
-				requeue_io(inode);
+				requeue_io(inode, wb);
 			} else {
 				/*
 				 * Writeback blocked by something other than
@@ -437,7 +456,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 				 * retrying writeback of the dirty page/inode
 				 * that cannot be performed immediately.
 				 */
-				redirty_tail(inode);
+				redirty_tail(inode, wb);
 			}
 		} else if (inode->i_state & I_DIRTY) {
 			/*
@@ -446,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * submission or metadata updates after data IO
 			 * completion.
 			 */
-			redirty_tail(inode);
+			redirty_tail(inode, wb);
 		} else {
 			/*
 			 * The inode is clean. At this point we either have
@@ -457,9 +476,41 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		}
 	}
 	inode_sync_complete(inode);
+	trace_writeback_single_inode(inode, wbc, nr_to_write);
 	return ret;
 }
 
+static long writeback_chunk_size(struct backing_dev_info *bdi,
+				 struct wb_writeback_work *work)
+{
+	long pages;
+
+	/*
+	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+	 * here avoids calling into writeback_inodes_wb() more than once.
+	 *
+	 * The intended call sequence for WB_SYNC_ALL writeback is:
+	 *
+	 *      wb_writeback()
+	 *          writeback_sb_inodes()       <== called only once
+	 *              write_cache_pages()     <== called once for each inode
+	 *                   (quickly) tag currently dirty pages
+	 *                   (maybe slowly) sync all tagged pages
+	 */
+	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
+		pages = LONG_MAX;
+	else {
+		pages = min(bdi->avg_write_bandwidth / 2,
+			    global_dirty_limit / DIRTY_SCOPE);
+		pages = min(pages, work->nr_pages);
+		pages = round_down(pages + MIN_WRITEBACK_PAGES,
+				   MIN_WRITEBACK_PAGES);
+	}
+
+	return pages;
+}
+
 /*
  * Write a portion of b_io inodes which belong to @sb.
 *
@@ -467,24 +518,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * inodes. Otherwise write only ones which go sequentially
  * in reverse order.
  *
- * Return 1, if the caller writeback routine should be
- * interrupted. Otherwise return 0.
+ * Return the number of pages and/or inodes written.
  */
-static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
-		struct writeback_control *wbc, bool only_this_sb)
+static long writeback_sb_inodes(struct super_block *sb,
+				struct bdi_writeback *wb,
+				struct wb_writeback_work *work)
 {
+	struct writeback_control wbc = {
+		.sync_mode		= work->sync_mode,
+		.tagged_writepages	= work->tagged_writepages,
+		.for_kupdate		= work->for_kupdate,
+		.for_background		= work->for_background,
+		.range_cyclic		= work->range_cyclic,
+		.range_start		= 0,
+		.range_end		= LLONG_MAX,
+	};
+	unsigned long start_time = jiffies;
+	long write_chunk;
+	long wrote = 0;  /* count both pages and inodes */
+
 	while (!list_empty(&wb->b_io)) {
-		long pages_skipped;
 		struct inode *inode = wb_inode(wb->b_io.prev);
 
 		if (inode->i_sb != sb) {
-			if (only_this_sb) {
+			if (work->sb) {
 				/*
 				 * We only want to write back data for this
 				 * superblock, move all inodes not belonging
 				 * to it back onto the dirty list.
 				 */
-				redirty_tail(inode);
+				redirty_tail(inode, wb);
 				continue;
 			}
 
@@ -493,7 +556,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			 * Bounce back to the caller to unpin this and
 			 * pin the next superblock.
 			 */
-			return 0;
+			break;
 		}
 
 		/*
@@ -504,95 +567,91 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
 			spin_unlock(&inode->i_lock);
-			requeue_io(inode);
+			redirty_tail(inode, wb);
 			continue;
 		}
-
-		/*
-		 * Was this inode dirtied after sync_sb_inodes was called?
-		 * This keeps sync from extra jobs and livelock.
-		 */
-		if (inode_dirtied_after(inode, wbc->wb_start)) {
-			spin_unlock(&inode->i_lock);
-			return 1;
-		}
-
 		__iget(inode);
+		write_chunk = writeback_chunk_size(wb->bdi, work);
+		wbc.nr_to_write = write_chunk;
+		wbc.pages_skipped = 0;
 
-		pages_skipped = wbc->pages_skipped;
-		writeback_single_inode(inode, wbc);
-		if (wbc->pages_skipped != pages_skipped) {
+		writeback_single_inode(inode, wb, &wbc);
+
+		work->nr_pages -= write_chunk - wbc.nr_to_write;
+		wrote += write_chunk - wbc.nr_to_write;
+		if (!(inode->i_state & I_DIRTY))
+			wrote++;
+		if (wbc.pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
 			 * buffers. Skip this inode for now.
 			 */
-			redirty_tail(inode);
+			redirty_tail(inode, wb);
 		}
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_wb_list_lock);
+		spin_unlock(&wb->list_lock);
 		iput(inode);
 		cond_resched();
-		spin_lock(&inode_wb_list_lock);
-		if (wbc->nr_to_write <= 0) {
-			wbc->more_io = 1;
-			return 1;
+		spin_lock(&wb->list_lock);
+		/*
+		 * bail out to wb_writeback() often enough to check
+		 * background threshold and other termination conditions.
+		 */
+		if (wrote) {
+			if (time_is_before_jiffies(start_time + HZ / 10UL))
+				break;
+			if (work->nr_pages <= 0)
+				break;
 		}
-		if (!list_empty(&wb->b_more_io))
-			wbc->more_io = 1;
 	}
-	/* b_io is empty */
-	return 1;
+	return wrote;
 }
 
-void writeback_inodes_wb(struct bdi_writeback *wb,
-		struct writeback_control *wbc)
+static long __writeback_inodes_wb(struct bdi_writeback *wb,
+				  struct wb_writeback_work *work)
 {
-	int ret = 0;
-
-	if (!wbc->wb_start)
-		wbc->wb_start = jiffies; /* livelock avoidance */
-	spin_lock(&inode_wb_list_lock);
-	if (!wbc->for_kupdate || list_empty(&wb->b_io))
-		queue_io(wb, wbc->older_than_this);
+	unsigned long start_time = jiffies;
+	long wrote = 0;
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct super_block *sb = inode->i_sb;
 
 		if (!grab_super_passive(sb)) {
-			requeue_io(inode);
+			requeue_io(inode, wb);
 			continue;
 		}
-		ret = writeback_sb_inodes(sb, wb, wbc, false);
+		wrote += writeback_sb_inodes(sb, wb, work);
 		drop_super(sb);
 
-		if (ret)
-			break;
+		/* refer to the same tests at the end of writeback_sb_inodes */
+		if (wrote) {
+			if (time_is_before_jiffies(start_time + HZ / 10UL))
+				break;
+			if (work->nr_pages <= 0)
+				break;
+		}
 	}
-	spin_unlock(&inode_wb_list_lock);
 	/* Leave any unwritten inodes on b_io */
+	return wrote;
 }
 
-static void __writeback_inodes_sb(struct super_block *sb,
-		struct bdi_writeback *wb, struct writeback_control *wbc)
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
 {
-	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+	struct wb_writeback_work work = {
+		.nr_pages	= nr_pages,
+		.sync_mode	= WB_SYNC_NONE,
+		.range_cyclic	= 1,
+	};
 
-	spin_lock(&inode_wb_list_lock);
-	if (!wbc->for_kupdate || list_empty(&wb->b_io))
-		queue_io(wb, wbc->older_than_this);
-	writeback_sb_inodes(sb, wb, wbc, true);
-	spin_unlock(&inode_wb_list_lock);
-}
+	spin_lock(&wb->list_lock);
+	if (list_empty(&wb->b_io))
+		queue_io(wb, NULL);
+	__writeback_inodes_wb(wb, &work);
+	spin_unlock(&wb->list_lock);
 
-/*
- * The maximum number of pages to writeout in a single bdi flush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES     1024
+	return nr_pages - work.nr_pages;
+}
 
 static inline bool over_bground_thresh(void)
 {
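Both rewritten loops above now leave it to wb_writeback() to re-evaluate the background threshold and other stop conditions: once some pages or inodes have been cleaned, they break out either after roughly HZ/10 (100 ms at HZ=100) or once the work item's page budget is used up. A minimal userspace model of that bail-out test follows; jiffies, HZ and time_is_before_jiffies() are stand-ins that mimic the kernel's wraparound-safe time comparison.

#include <stdbool.h>
#include <stdio.h>

#define HZ 100UL			/* assumed tick rate, so HZ/10 is ~100 ms */

static unsigned long jiffies;		/* stand-in for the kernel tick counter */

/* Wraparound-safe "deadline has passed", like time_is_before_jiffies(). */
static bool model_time_is_before_jiffies(unsigned long t)
{
	return (long)(jiffies - t) > 0;
}

/* The test both writeback_sb_inodes() and __writeback_inodes_wb() now run. */
static bool model_should_bail(long wrote, long nr_pages_left,
			      unsigned long start_time)
{
	if (!wrote)
		return false;	/* never bail before making any progress */
	if (model_time_is_before_jiffies(start_time + HZ / 10))
		return true;	/* batch ran >100 ms: let wb_writeback() re-check */
	if (nr_pages_left <= 0)
		return true;	/* page budget for this work item is gone */
	return false;
}

int main(void)
{
	unsigned long start = jiffies = 1000;

	jiffies += 5;	/* 50 ms into the batch, budget remaining */
	printf("bail=%d\n", model_should_bail(8192, 4096, start));	/* bail=0 */
	jiffies += 10;	/* 150 ms into the batch */
	printf("bail=%d\n", model_should_bail(8192, 4096, start));	/* bail=1 */
	return 0;
}

The "wrote != 0" guard matters: a loop that has not cleaned anything yet keeps scanning b_io instead of returning an empty progress count to wb_writeback().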
@@ -605,6 +664,16 @@ static inline bool over_bground_thresh(void)
 }
 
 /*
+ * Called under wb->list_lock. If there are multiple wb per bdi,
+ * only the flusher working on the first wb should do it.
+ */
+static void wb_update_bandwidth(struct bdi_writeback *wb,
+				unsigned long start_time)
+{
+	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
+}
+
+/*
  * Explicit flushing or periodic writeback of "old" data.
  *
  * Define "old": the first time one of an inode's pages is dirtied, we mark the
@@ -622,47 +691,16 @@ static inline bool over_bground_thresh(void)
 static long wb_writeback(struct bdi_writeback *wb,
 			 struct wb_writeback_work *work)
 {
-	struct writeback_control wbc = {
-		.sync_mode		= work->sync_mode,
-		.older_than_this	= NULL,
-		.for_kupdate		= work->for_kupdate,
-		.for_background		= work->for_background,
-		.range_cyclic		= work->range_cyclic,
-	};
+	unsigned long wb_start = jiffies;
+	long nr_pages = work->nr_pages;
 	unsigned long oldest_jif;
-	long wrote = 0;
-	long write_chunk;
 	struct inode *inode;
+	long progress;
 
-	if (wbc.for_kupdate) {
-		wbc.older_than_this = &oldest_jif;
-		oldest_jif = jiffies -
-				msecs_to_jiffies(dirty_expire_interval * 10);
-	}
-	if (!wbc.range_cyclic) {
-		wbc.range_start = 0;
-		wbc.range_end = LLONG_MAX;
-	}
+	oldest_jif = jiffies;
+	work->older_than_this = &oldest_jif;
 
-	/*
-	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
-	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
-	 * here avoids calling into writeback_inodes_wb() more than once.
-	 *
-	 * The intended call sequence for WB_SYNC_ALL writeback is:
-	 *
-	 * wb_writeback()
-	 *     __writeback_inodes_sb()     <== called only once
-	 *         write_cache_pages()     <== called once for each inode
-	 *              (quickly) tag currently dirty pages
-	 *              (maybe slowly) sync all tagged pages
-	 */
-	if (wbc.sync_mode == WB_SYNC_NONE)
-		write_chunk = MAX_WRITEBACK_PAGES;
-	else
-		write_chunk = LONG_MAX;
-
-	wbc.wb_start = jiffies; /* livelock avoidance */
+	spin_lock(&wb->list_lock);
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
@@ -687,52 +725,54 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (work->for_background && !over_bground_thresh())
 			break;
 
-		wbc.more_io = 0;
-		wbc.nr_to_write = write_chunk;
-		wbc.pages_skipped = 0;
+		if (work->for_kupdate) {
+			oldest_jif = jiffies -
+				msecs_to_jiffies(dirty_expire_interval * 10);
+			work->older_than_this = &oldest_jif;
+		}
 
-		trace_wbc_writeback_start(&wbc, wb->bdi);
+		trace_writeback_start(wb->bdi, work);
+		if (list_empty(&wb->b_io))
+			queue_io(wb, work->older_than_this);
 		if (work->sb)
-			__writeback_inodes_sb(work->sb, wb, &wbc);
+			progress = writeback_sb_inodes(work->sb, wb, work);
 		else
-			writeback_inodes_wb(wb, &wbc);
-		trace_wbc_writeback_written(&wbc, wb->bdi);
+			progress = __writeback_inodes_wb(wb, work);
+		trace_writeback_written(wb->bdi, work);
 
-		work->nr_pages -= write_chunk - wbc.nr_to_write;
-		wrote += write_chunk - wbc.nr_to_write;
+		wb_update_bandwidth(wb, wb_start);
 
 		/*
-		 * If we consumed everything, see if we have more
+		 * Did we write something? Try for more
+		 *
+		 * Dirty inodes are moved to b_io for writeback in batches.
+		 * The completion of the current batch does not necessarily
+		 * mean the overall work is done. So we keep looping as long
+		 * as made some progress on cleaning pages or inodes.
 		 */
-		if (wbc.nr_to_write <= 0)
+		if (progress)
 			continue;
 		/*
-		 * Didn't write everything and we don't have more IO, bail
+		 * No more inodes for IO, bail
 		 */
-		if (!wbc.more_io)
+		if (list_empty(&wb->b_more_io))
 			break;
 		/*
-		 * Did we write something? Try for more
-		 */
-		if (wbc.nr_to_write < write_chunk)
-			continue;
-		/*
 		 * Nothing written. Wait for some inode to
 		 * become available for writeback. Otherwise
 		 * we'll just busyloop.
 		 */
-		spin_lock(&inode_wb_list_lock);
 		if (!list_empty(&wb->b_more_io)) {
+			trace_writeback_wait(wb->bdi, work);
 			inode = wb_inode(wb->b_more_io.prev);
-			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			spin_lock(&inode->i_lock);
-			inode_wait_for_writeback(inode);
+			inode_wait_for_writeback(inode, wb);
 			spin_unlock(&inode->i_lock);
 		}
-		spin_unlock(&inode_wb_list_lock);
 	}
+	spin_unlock(&wb->list_lock);
 
-	return wrote;
+	return nr_pages - work->nr_pages;
 }
 
 /*
@@ -1063,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			spin_unlock(&inode->i_lock);
-			spin_lock(&inode_wb_list_lock);
+			spin_lock(&bdi->wb.list_lock);
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
-			spin_unlock(&inode_wb_list_lock);
+			spin_unlock(&bdi->wb.list_lock);
 
 			if (wakeup_bdi)
 				bdi_wakeup_thread_delayed(bdi);
@@ -1162,10 +1202,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb			= sb,
 		.sync_mode		= WB_SYNC_NONE,
-		.done			= &done,
-		.nr_pages		= nr,
+		.tagged_writepages	= 1,
+		.done			= &done,
+		.nr_pages		= nr,
 	};
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1267,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb);
  */
 int write_inode_now(struct inode *inode, int sync)
 {
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 	int ret;
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
@@ -1279,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync)
 		wbc.nr_to_write = 0;
 
 	might_sleep();
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
-	ret = writeback_single_inode(inode, &wbc);
+	ret = writeback_single_inode(inode, wb, &wbc);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 	if (sync)
 		inode_sync_wait(inode);
 	return ret;
@@ -1303,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now);
  */
 int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 	int ret;
 
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
-	ret = writeback_single_inode(inode, wbc);
+	ret = writeback_single_inode(inode, wb, wbc);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
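After this change writeback_inodes_wb() takes a plain page budget and reports how much of it was consumed (nr_pages - work.nr_pages) instead of filling in a caller-supplied writeback_control. The sketch below shows the resulting calling convention; it is illustrative only and not a call site from this patch.

/*
 * Illustrative caller of the new interface:
 *   long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages);
 * The helper builds its own wb_writeback_work and takes wb->list_lock itself,
 * so a caller only supplies a budget and inspects the amount written.
 */
static void example_flush_some_pages(struct bdi_writeback *wb)
{
	long budget = 1536;	/* pages we would like cleaned */
	long written;

	written = writeback_inodes_wb(wb, budget);
	if (written < budget) {
		/* b_io ran dry or the queued inodes had little dirty data;
		 * a real caller would fall back to waiting or retrying. */
	}
}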