Diffstat (limited to 'fs/fs-writeback.c')
 fs/fs-writeback.c | 336
 1 file changed, 205 insertions(+), 131 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 539f36cf3e4a..8d2fb8c88cf3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -231,11 +231,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 
 static void inode_sync_complete(struct inode *inode)
 {
-	/*
-	 * Prevent speculative execution through
-	 * spin_unlock(&wb->list_lock);
-	 */
-
+	inode->i_state &= ~I_SYNC;
+	/* Waiters must see I_SYNC cleared before being woken up */
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_SYNC);
 }
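
The smp_mb() pairs with the re-check of i_state on the waiter side: the store clearing I_SYNC must be visible before wake_up_bit() inspects the wait queue, so a waiter that the wakeup misses still sees the flag cleared when it tests again. A rough userspace analogue of this clear-then-wake discipline (illustration only; every name below is made up, and in this analogue the condvar mutex would already provide the ordering that the lock-free kernel path needs smp_mb() for):

	#include <stdatomic.h>
	#include <pthread.h>

	static atomic_int syncing = 1;
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

	static void sync_complete(void)		/* plays inode_sync_complete() */
	{
		atomic_store(&syncing, 0);	/* clear the flag first */
		/* full fence, like smp_mb(): the store above is ordered
		 * before the wakeup below */
		atomic_thread_fence(memory_order_seq_cst);
		pthread_mutex_lock(&lock);
		pthread_cond_broadcast(&cond);	/* plays wake_up_bit() */
		pthread_mutex_unlock(&lock);
	}

	static void wait_for_sync(void)		/* plays __wait_on_bit() */
	{
		pthread_mutex_lock(&lock);
		while (atomic_load(&syncing))
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);
	}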
@@ -329,10 +326,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
 }
 
 /*
- * Wait for writeback on an inode to complete.
+ * Wait for writeback on an inode to complete. Called with i_lock held.
+ * Caller must make sure inode cannot go away when we drop i_lock.
  */
-static void inode_wait_for_writeback(struct inode *inode,
-				     struct bdi_writeback *wb)
+static void __inode_wait_for_writeback(struct inode *inode)
+	__releases(inode->i_lock)
+	__acquires(inode->i_lock)
 {
 	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
 	wait_queue_head_t *wqh;
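
The __releases()/__acquires() markers are sparse lock-context annotations: they tell static checkers that the function is entered and left with i_lock held but drops it internally. A hedged sketch of the same pattern on a hypothetical structure (struct foo and wait_for_done() are invented names, not kernel API):

	static void foo_wait_done(struct foo *f)
		__releases(f->lock)
		__acquires(f->lock)
	{
		while (!f->done) {
			spin_unlock(&f->lock);
			wait_for_done(f);	/* may sleep; lock dropped */
			spin_lock(&f->lock);
		}
	}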
@@ -340,70 +339,119 @@ static void inode_wait_for_writeback(struct inode *inode,
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&wb->list_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-		spin_lock(&wb->list_lock);
 		spin_lock(&inode->i_lock);
 	}
 }
 
 /*
- * Write out an inode's dirty pages.  Called under wb->list_lock and
- * inode->i_lock.  Either the caller has an active reference on the inode or
- * the inode has I_WILL_FREE set.
- *
- * If `wait' is set, wait on the writeout.
- *
- * The whole writeout design is quite complex and fragile.  We want to avoid
- * starvation of particular inodes when others are being redirtied, prevent
- * livelocks, etc.
+ * Wait for writeback on an inode to complete. Caller must have inode pinned.
  */
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
-		       struct writeback_control *wbc)
+void inode_wait_for_writeback(struct inode *inode)
 {
-	struct address_space *mapping = inode->i_mapping;
-	long nr_to_write = wbc->nr_to_write;
-	unsigned dirty;
-	int ret;
+	spin_lock(&inode->i_lock);
+	__inode_wait_for_writeback(inode);
+	spin_unlock(&inode->i_lock);
+}
 
-	assert_spin_locked(&wb->list_lock);
-	assert_spin_locked(&inode->i_lock);
+/*
+ * Sleep until I_SYNC is cleared. This function must be called with i_lock
+ * held and drops it. It is aimed for callers not holding any inode reference
+ * so once i_lock is dropped, inode can go away.
+ */
+static void inode_sleep_on_writeback(struct inode *inode)
+	__releases(inode->i_lock)
+{
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
+	int sleep;
 
-	if (!atomic_read(&inode->i_count))
-		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
-	else
-		WARN_ON(inode->i_state & I_WILL_FREE);
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	sleep = inode->i_state & I_SYNC;
+	spin_unlock(&inode->i_lock);
+	if (sleep)
+		schedule();
+	finish_wait(wqh, &wait);
+}
 
-	if (inode->i_state & I_SYNC) {
+/*
+ * Find proper writeback list for the inode depending on its current state and
+ * possibly also change of its state while we were doing writeback. Here we
+ * handle things such as livelock prevention or fairness of writeback among
+ * inodes. This function can be called only by flusher thread - noone else
+ * processes all inodes in writeback lists and requeueing inodes behind flusher
+ * thread's back can have unexpected consequences.
+ */
+static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
+			  struct writeback_control *wbc)
+{
+	if (inode->i_state & I_FREEING)
+		return;
+
+	/*
+	 * Sync livelock prevention. Each inode is tagged and synced in one
+	 * shot. If still dirty, it will be redirty_tail()'ed below. Update
+	 * the dirty time to prevent enqueue and sync it again.
+	 */
+	if ((inode->i_state & I_DIRTY) &&
+	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+		inode->dirtied_when = jiffies;
+
+	if (wbc->pages_skipped) {
 		/*
-		 * If this inode is locked for writeback and we are not doing
-		 * writeback-for-data-integrity, move it to b_more_io so that
-		 * writeback can proceed with the other inodes on s_io.
-		 *
-		 * We'll have another go at writing back this inode when we
-		 * completed a full scan of b_io.
+		 * writeback is not making progress due to locked
+		 * buffers. Skip this inode for now.
 		 */
-		if (wbc->sync_mode != WB_SYNC_ALL) {
+		redirty_tail(inode, wb);
+		return;
+	}
+
+	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+		/*
+		 * We didn't write back all the pages. nfs_writepages()
+		 * sometimes bales out without doing anything.
+		 */
+		if (wbc->nr_to_write <= 0) {
+			/* Slice used up. Queue for next turn. */
 			requeue_io(inode, wb);
-			trace_writeback_single_inode_requeue(inode, wbc,
-							     nr_to_write);
-			return 0;
+		} else {
+			/*
+			 * Writeback blocked by something other than
+			 * congestion. Delay the inode for some time to
+			 * avoid spinning on the CPU (100% iowait)
+			 * retrying writeback of the dirty page/inode
+			 * that cannot be performed immediately.
+			 */
+			redirty_tail(inode, wb);
 		}
-
+	} else if (inode->i_state & I_DIRTY) {
 		/*
-		 * It's a data-integrity sync.  We must wait.
+		 * Filesystems can dirty the inode during writeback operations,
+		 * such as delayed allocation during submission or metadata
+		 * updates after data IO completion.
 		 */
-		inode_wait_for_writeback(inode, wb);
+		redirty_tail(inode, wb);
+	} else {
+		/* The inode is clean. Remove from writeback lists. */
+		list_del_init(&inode->i_wb_list);
 	}
+}
 
-	BUG_ON(inode->i_state & I_SYNC);
+/*
+ * Write out an inode and its dirty pages. Do not update the writeback list
+ * linkage. That is left to the caller. The caller is also responsible for
+ * setting I_SYNC flag and calling inode_sync_complete() to clear it.
+ */
+static int
+__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+			 struct writeback_control *wbc)
+{
+	struct address_space *mapping = inode->i_mapping;
+	long nr_to_write = wbc->nr_to_write;
+	unsigned dirty;
+	int ret;
 
-	/* Set I_SYNC, reset I_DIRTY_PAGES */
-	inode->i_state |= I_SYNC;
-	inode->i_state &= ~I_DIRTY_PAGES;
-	spin_unlock(&inode->i_lock);
-	spin_unlock(&wb->list_lock);
+	WARN_ON(!(inode->i_state & I_SYNC));
 
 	ret = do_writepages(mapping, wbc);
 
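
The new requeue_inode() gathers all post-writeback disposition decisions in one place, and the checks are order-dependent. A self-contained model of the policy as a pure function (plain C, not kernel code; the dirtied_when livelock-prevention update is deliberately omitted) makes the precedence easy to see:

	#include <stdio.h>
	#include <stdbool.h>

	enum action { LEAVE_ALONE, REDIRTY_TAIL, REQUEUE_IO, REMOVE_FROM_LISTS };

	static enum action requeue_policy(bool freeing, bool pages_skipped,
					  bool dirty_pages_left, bool budget_left,
					  bool inode_dirty)
	{
		if (freeing)
			return LEAVE_ALONE;	/* eviction owns the inode */
		if (pages_skipped)
			return REDIRTY_TAIL;	/* locked buffers, try later */
		if (dirty_pages_left)		/* PAGECACHE_TAG_DIRTY set */
			return budget_left ? REDIRTY_TAIL /* blocked, back off */
					   : REQUEUE_IO;  /* slice used up */
		if (inode_dirty)
			return REDIRTY_TAIL;	/* redirtied during writeback */
		return REMOVE_FROM_LISTS;	/* clean */
	}

	int main(void)
	{
		/* dirty pages left with budget exhausted -> REQUEUE_IO (2) */
		printf("%d\n", requeue_policy(false, false, true, false, true));
		return 0;
	}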
@@ -424,6 +472,9 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * write_inode()
 	 */
 	spin_lock(&inode->i_lock);
+	/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
+	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+		inode->i_state &= ~I_DIRTY_PAGES;
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
 	spin_unlock(&inode->i_lock);
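
The guarded clear closes a window in which pages dirtied while do_writepages() runs could be forgotten. An illustrative interleaving (simplified, not taken from the patch):

	flusher thread				another CPU
	--------------				-----------
	do_writepages()	 /* writes the pages it found */
						set_page_dirty()
						  -> tags page PAGECACHE_TAG_DIRTY
						  -> sets I_DIRTY_PAGES in i_state
	spin_lock(&inode->i_lock);
	i_state &= ~I_DIRTY_PAGES;  /* an unconditional clear here would
				       lose track of the new page */

With the mapping_tagged() test, I_DIRTY_PAGES is only cleared when no dirty-tagged pages remain, so the inode cannot be mistaken for clean.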
@@ -433,60 +484,67 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	if (ret == 0)
 		ret = err;
 	}
+	trace_writeback_single_inode(inode, wbc, nr_to_write);
+	return ret;
+}
+
+/*
+ * Write out an inode's dirty pages. Either the caller has an active reference
+ * on the inode or the inode has I_WILL_FREE set.
+ *
+ * This function is designed to be called for writing back one inode which
+ * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
+ * and does more profound writeback list handling in writeback_sb_inodes().
+ */
+static int
+writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+		       struct writeback_control *wbc)
+{
+	int ret = 0;
 
-	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
-	inode->i_state &= ~I_SYNC;
-	if (!(inode->i_state & I_FREEING)) {
+	if (!atomic_read(&inode->i_count))
+		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
+	else
+		WARN_ON(inode->i_state & I_WILL_FREE);
+
+	if (inode->i_state & I_SYNC) {
+		if (wbc->sync_mode != WB_SYNC_ALL)
+			goto out;
 		/*
-		 * Sync livelock prevention. Each inode is tagged and synced in
-		 * one shot. If still dirty, it will be redirty_tail()'ed below.
-		 * Update the dirty time to prevent enqueue and sync it again.
+		 * It's a data-integrity sync. We must wait. Since callers hold
+		 * inode reference or inode has I_WILL_FREE set, it cannot go
+		 * away under us.
 		 */
-		if ((inode->i_state & I_DIRTY) &&
-		    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
-			inode->dirtied_when = jiffies;
-
-		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-			/*
-			 * We didn't write back all the pages.  nfs_writepages()
-			 * sometimes bales out without doing anything.
-			 */
-			inode->i_state |= I_DIRTY_PAGES;
-			if (wbc->nr_to_write <= 0) {
-				/*
-				 * slice used up: queue for next turn
-				 */
-				requeue_io(inode, wb);
-			} else {
-				/*
-				 * Writeback blocked by something other than
-				 * congestion. Delay the inode for some time to
-				 * avoid spinning on the CPU (100% iowait)
-				 * retrying writeback of the dirty page/inode
-				 * that cannot be performed immediately.
-				 */
-				redirty_tail(inode, wb);
-			}
-		} else if (inode->i_state & I_DIRTY) {
-			/*
-			 * Filesystems can dirty the inode during writeback
-			 * operations, such as delayed allocation during
-			 * submission or metadata updates after data IO
-			 * completion.
-			 */
-			redirty_tail(inode, wb);
-		} else {
-			/*
-			 * The inode is clean.  At this point we either have
-			 * a reference to the inode or it's on it's way out.
-			 * No need to add it back to the LRU.
-			 */
-			list_del_init(&inode->i_wb_list);
-		}
+		__inode_wait_for_writeback(inode);
 	}
+	WARN_ON(inode->i_state & I_SYNC);
+	/*
+	 * Skip inode if it is clean. We don't want to mess with writeback
+	 * lists in this function since flusher thread may be doing for example
+	 * sync in parallel and if we move the inode, it could get skipped. So
+	 * here we make sure inode is on some writeback list and leave it there
+	 * unless we have completely cleaned the inode.
+	 */
+	if (!(inode->i_state & I_DIRTY))
+		goto out;
+	inode->i_state |= I_SYNC;
+	spin_unlock(&inode->i_lock);
+
+	ret = __writeback_single_inode(inode, wb, wbc);
+
+	spin_lock(&wb->list_lock);
+	spin_lock(&inode->i_lock);
+	/*
+	 * If inode is clean, remove it from writeback lists. Otherwise don't
+	 * touch it. See comment above for explanation.
+	 */
+	if (!(inode->i_state & I_DIRTY))
+		list_del_init(&inode->i_wb_list);
+	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
-	trace_writeback_single_inode(inode, wbc, nr_to_write);
+out:
+	spin_unlock(&inode->i_lock);
 	return ret;
 }
 
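
After the split, writeback_single_inode() is the variant for one-off callers that pin the inode themselves; the exported wrappers further down (write_inode_now(), sync_inode()) keep that contract. A hedged usage sketch, with a hypothetical calling function (flush_inode_sync() is invented; igrab(), write_inode_now() and iput() are the real kernel APIs):

	/* Flush one inode for data integrity. The reference taken here is
	 * what makes it safe for writeback_single_inode() to sleep in
	 * __inode_wait_for_writeback(). */
	static int flush_inode_sync(struct inode *inode)
	{
		int err;

		if (!igrab(inode))
			return -ESTALE;		/* inode already being evicted */
		err = write_inode_now(inode, 1);	/* sync != 0 => WB_SYNC_ALL */
		iput(inode);
		return err;
	}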
@@ -580,29 +638,58 @@ static long writeback_sb_inodes(struct super_block *sb,
 			redirty_tail(inode, wb);
 			continue;
 		}
-		__iget(inode);
+		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
+			/*
+			 * If this inode is locked for writeback and we are not
+			 * doing writeback-for-data-integrity, move it to
+			 * b_more_io so that writeback can proceed with the
+			 * other inodes on s_io.
+			 *
+			 * We'll have another go at writing back this inode
+			 * when we completed a full scan of b_io.
+			 */
+			spin_unlock(&inode->i_lock);
+			requeue_io(inode, wb);
+			trace_writeback_sb_inodes_requeue(inode);
+			continue;
+		}
+		spin_unlock(&wb->list_lock);
+
+		/*
+		 * We already requeued the inode if it had I_SYNC set and we
+		 * are doing WB_SYNC_NONE writeback. So this catches only the
+		 * WB_SYNC_ALL case.
+		 */
+		if (inode->i_state & I_SYNC) {
+			/* Wait for I_SYNC. This function drops i_lock... */
+			inode_sleep_on_writeback(inode);
+			/* Inode may be gone, start again */
+			spin_lock(&wb->list_lock);
+			continue;
+		}
+		inode->i_state |= I_SYNC;
+		spin_unlock(&inode->i_lock);
+
 		write_chunk = writeback_chunk_size(wb->bdi, work);
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
-		writeback_single_inode(inode, wb, &wbc);
+		/*
+		 * We use I_SYNC to pin the inode in memory. While it is set
+		 * evict_inode() will wait so the inode cannot be freed.
+		 */
+		__writeback_single_inode(inode, wb, &wbc);
 
 		work->nr_pages -= write_chunk - wbc.nr_to_write;
 		wrote += write_chunk - wbc.nr_to_write;
+		spin_lock(&wb->list_lock);
+		spin_lock(&inode->i_lock);
 		if (!(inode->i_state & I_DIRTY))
 			wrote++;
-		if (wbc.pages_skipped) {
-			/*
-			 * writeback is not making progress due to locked
-			 * buffers.  Skip this inode for now.
-			 */
-			redirty_tail(inode, wb);
-		}
+		requeue_inode(inode, wb, &wbc);
+		inode_sync_complete(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&wb->list_lock);
-		iput(inode);
-		cond_resched();
-		spin_lock(&wb->list_lock);
+		cond_resched_lock(&wb->list_lock);
 		/*
 		 * bail out to wb_writeback() often enough to check
 		 * background threshold and other termination conditions.
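
With __iget()/iput() gone from the loop, I_SYNC itself is what keeps the inode alive: as the in-line comment says, eviction waits for the flag to clear before the inode can be freed. A simplified sketch of the interlock (condensed from the code above, not an exact trace):

	flusher thread				eviction (evict_inode() path)
	--------------				-----------------------------
	spin_lock(&inode->i_lock);
	inode->i_state |= I_SYNC;   /* pin */
	spin_unlock(&inode->i_lock);
						inode_wait_for_writeback(inode);
	__writeback_single_inode(...);		/* ...blocks: I_SYNC still set */
	spin_lock(&inode->i_lock);
	requeue_inode(...);
	inode_sync_complete(inode); /* unpin */
	spin_unlock(&inode->i_lock);		/* wakes up; free may proceed */

The design choice here is that the flusher never calls iput(), which could itself trigger inode deletion and long filesystem operations from the flusher thread.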
@@ -796,8 +882,10 @@ static long wb_writeback(struct bdi_writeback *wb,
 			trace_writeback_wait(wb->bdi, work);
 			inode = wb_inode(wb->b_more_io.prev);
 			spin_lock(&inode->i_lock);
-			inode_wait_for_writeback(inode, wb);
-			spin_unlock(&inode->i_lock);
+			spin_unlock(&wb->list_lock);
+			/* This function drops i_lock... */
+			inode_sleep_on_writeback(inode);
+			spin_lock(&wb->list_lock);
 		}
 	}
 	spin_unlock(&wb->list_lock);
@@ -1331,7 +1419,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
 int write_inode_now(struct inode *inode, int sync)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-	int ret;
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
 		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -1343,12 +1430,7 @@ int write_inode_now(struct inode *inode, int sync)
 		wbc.nr_to_write = 0;
 
 	might_sleep();
-	spin_lock(&wb->list_lock);
-	spin_lock(&inode->i_lock);
-	ret = writeback_single_inode(inode, wb, &wbc);
-	spin_unlock(&inode->i_lock);
-	spin_unlock(&wb->list_lock);
-	return ret;
+	return writeback_single_inode(inode, wb, &wbc);
 }
 EXPORT_SYMBOL(write_inode_now);
 
@@ -1365,15 +1447,7 @@ EXPORT_SYMBOL(write_inode_now);
  */
 int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-	int ret;
-
-	spin_lock(&wb->list_lock);
-	spin_lock(&inode->i_lock);
-	ret = writeback_single_inode(inode, wb, wbc);
-	spin_unlock(&inode->i_lock);
-	spin_unlock(&wb->list_lock);
-	return ret;
+	return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
 }
 EXPORT_SYMBOL(sync_inode);
 
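
Both wrappers now take locks internally, so callers only supply the inode and a control struct. For reference, a hedged sketch of calling the slimmed-down sync_inode() directly, mirroring the writeback_control that write_inode_now() builds internally (the call site is hypothetical; the caller must hold an inode reference):

	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_ALL,
		.nr_to_write	= LONG_MAX,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};
	int err = sync_inode(inode, &wbc);	/* waits on and writes everything */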