Diffstat (limited to 'fs/fs-writeback.c')
 -rw-r--r--   fs/fs-writeback.c   336
 1 file changed, 205 insertions(+), 131 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 539f36cf3e4a..8d2fb8c88cf3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -231,11 +231,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 
 static void inode_sync_complete(struct inode *inode)
 {
-	/*
-	 * Prevent speculative execution through
-	 * spin_unlock(&wb->list_lock);
-	 */
-
+	inode->i_state &= ~I_SYNC;
+	/* Waiters must see I_SYNC cleared before being woken up */
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_SYNC);
 }
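
The reordered lines above implement a standard flag-and-waitqueue handshake: the waker clears I_SYNC first, issues smp_mb() so the cleared flag is visible, and only then calls wake_up_bit(); the sleeper (in __inode_wait_for_writeback() below) re-checks the flag after every wakeup. The following userspace sketch models that handshake with a pthread mutex/condvar standing in for i_lock, smp_mb() and the bit waitqueue. It is purely illustrative; the names state_sync, sync_complete() and wait_for_sync() are invented for the example and are not kernel API.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  waitq = PTHREAD_COND_INITIALIZER;
static int state_sync;			/* stands in for inode->i_state & I_SYNC */

static void sync_complete(void)		/* cf. inode_sync_complete() */
{
	pthread_mutex_lock(&lock);
	state_sync = 0;			/* clear the flag first ... */
	pthread_cond_broadcast(&waitq);	/* ... then wake the waiters */
	pthread_mutex_unlock(&lock);
}

static void wait_for_sync(void)		/* cf. __inode_wait_for_writeback() */
{
	pthread_mutex_lock(&lock);
	while (state_sync)		/* re-check the flag after each wakeup */
		pthread_cond_wait(&waitq, &lock);
	pthread_mutex_unlock(&lock);
}

static void *flusher(void *arg)
{
	sleep(1);			/* pretend to write the inode's pages */
	sync_complete();
	return NULL;
}

int main(void)
{
	pthread_t t;

	state_sync = 1;			/* "inode under writeback" */
	pthread_create(&t, NULL, flusher, NULL);
	wait_for_sync();
	pthread_join(t, NULL);
	puts("writeback finished");
	return 0;
}

Here the mutex provides the ordering that smp_mb() provides in the kernel version, and the while loop plays the role of re-checking I_SYNC after each wakeup.
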
@@ -329,10 +326,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
 }
 
 /*
- * Wait for writeback on an inode to complete.
+ * Wait for writeback on an inode to complete. Called with i_lock held.
+ * Caller must make sure inode cannot go away when we drop i_lock.
  */
-static void inode_wait_for_writeback(struct inode *inode,
-				     struct bdi_writeback *wb)
+static void __inode_wait_for_writeback(struct inode *inode)
+	__releases(inode->i_lock)
+	__acquires(inode->i_lock)
 {
 	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
 	wait_queue_head_t *wqh;
@@ -340,70 +339,119 @@ static void inode_wait_for_writeback(struct inode *inode,
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&wb->list_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-		spin_lock(&wb->list_lock);
 		spin_lock(&inode->i_lock);
 	}
 }
 
 /*
- * Write out an inode's dirty pages. Called under wb->list_lock and
- * inode->i_lock. Either the caller has an active reference on the inode or
- * the inode has I_WILL_FREE set.
- *
- * If `wait' is set, wait on the writeout.
- *
- * The whole writeout design is quite complex and fragile. We want to avoid
- * starvation of particular inodes when others are being redirtied, prevent
- * livelocks, etc.
+ * Wait for writeback on an inode to complete. Caller must have inode pinned.
  */
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
-		       struct writeback_control *wbc)
+void inode_wait_for_writeback(struct inode *inode)
 {
-	struct address_space *mapping = inode->i_mapping;
-	long nr_to_write = wbc->nr_to_write;
-	unsigned dirty;
-	int ret;
+	spin_lock(&inode->i_lock);
+	__inode_wait_for_writeback(inode);
+	spin_unlock(&inode->i_lock);
+}
 
-	assert_spin_locked(&wb->list_lock);
-	assert_spin_locked(&inode->i_lock);
+/*
+ * Sleep until I_SYNC is cleared. This function must be called with i_lock
+ * held and drops it. It is aimed for callers not holding any inode reference
+ * so once i_lock is dropped, inode can go away.
+ */
+static void inode_sleep_on_writeback(struct inode *inode)
+	__releases(inode->i_lock)
+{
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
+	int sleep;
 
-	if (!atomic_read(&inode->i_count))
-		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
-	else
-		WARN_ON(inode->i_state & I_WILL_FREE);
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	sleep = inode->i_state & I_SYNC;
+	spin_unlock(&inode->i_lock);
+	if (sleep)
+		schedule();
+	finish_wait(wqh, &wait);
+}
 
-	if (inode->i_state & I_SYNC) {
+/*
+ * Find proper writeback list for the inode depending on its current state and
+ * possibly also change of its state while we were doing writeback. Here we
+ * handle things such as livelock prevention or fairness of writeback among
+ * inodes. This function can be called only by flusher thread - noone else
+ * processes all inodes in writeback lists and requeueing inodes behind flusher
+ * thread's back can have unexpected consequences.
+ */
+static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
+			  struct writeback_control *wbc)
+{
+	if (inode->i_state & I_FREEING)
+		return;
+
+	/*
+	 * Sync livelock prevention. Each inode is tagged and synced in one
+	 * shot. If still dirty, it will be redirty_tail()'ed below. Update
+	 * the dirty time to prevent enqueue and sync it again.
+	 */
+	if ((inode->i_state & I_DIRTY) &&
+	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+		inode->dirtied_when = jiffies;
+
+	if (wbc->pages_skipped) {
 		/*
-		 * If this inode is locked for writeback and we are not doing
-		 * writeback-for-data-integrity, move it to b_more_io so that
-		 * writeback can proceed with the other inodes on s_io.
-		 *
-		 * We'll have another go at writing back this inode when we
-		 * completed a full scan of b_io.
+		 * writeback is not making progress due to locked
+		 * buffers. Skip this inode for now.
 		 */
-		if (wbc->sync_mode != WB_SYNC_ALL) {
+		redirty_tail(inode, wb);
+		return;
+	}
+
+	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+		/*
+		 * We didn't write back all the pages. nfs_writepages()
+		 * sometimes bales out without doing anything.
+		 */
+		if (wbc->nr_to_write <= 0) {
+			/* Slice used up. Queue for next turn. */
 			requeue_io(inode, wb);
-			trace_writeback_single_inode_requeue(inode, wbc,
-							     nr_to_write);
-			return 0;
+		} else {
+			/*
+			 * Writeback blocked by something other than
+			 * congestion. Delay the inode for some time to
+			 * avoid spinning on the CPU (100% iowait)
+			 * retrying writeback of the dirty page/inode
+			 * that cannot be performed immediately.
+			 */
+			redirty_tail(inode, wb);
 		}
-
+	} else if (inode->i_state & I_DIRTY) {
 		/*
-		 * It's a data-integrity sync. We must wait.
+		 * Filesystems can dirty the inode during writeback operations,
+		 * such as delayed allocation during submission or metadata
+		 * updates after data IO completion.
 		 */
-		inode_wait_for_writeback(inode, wb);
+		redirty_tail(inode, wb);
+	} else {
+		/* The inode is clean. Remove from writeback lists. */
+		list_del_init(&inode->i_wb_list);
 	}
+}
 
-	BUG_ON(inode->i_state & I_SYNC);
+/*
+ * Write out an inode and its dirty pages. Do not update the writeback list
+ * linkage. That is left to the caller. The caller is also responsible for
+ * setting I_SYNC flag and calling inode_sync_complete() to clear it.
+ */
+static int
+__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+			 struct writeback_control *wbc)
+{
+	struct address_space *mapping = inode->i_mapping;
+	long nr_to_write = wbc->nr_to_write;
+	unsigned dirty;
+	int ret;
 
-	/* Set I_SYNC, reset I_DIRTY_PAGES */
-	inode->i_state |= I_SYNC;
-	inode->i_state &= ~I_DIRTY_PAGES;
-	spin_unlock(&inode->i_lock);
-	spin_unlock(&wb->list_lock);
+	WARN_ON(!(inode->i_state & I_SYNC));
 
 	ret = do_writepages(mapping, wbc);
 
@@ -424,6 +472,9 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * write_inode()
 	 */
 	spin_lock(&inode->i_lock);
+	/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
+	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+		inode->i_state &= ~I_DIRTY_PAGES;
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
 	spin_unlock(&inode->i_lock);
@@ -433,60 +484,67 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 		if (ret == 0)
 			ret = err;
 	}
+	trace_writeback_single_inode(inode, wbc, nr_to_write);
+	return ret;
+}
+
+/*
+ * Write out an inode's dirty pages. Either the caller has an active reference
+ * on the inode or the inode has I_WILL_FREE set.
+ *
+ * This function is designed to be called for writing back one inode which
+ * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
+ * and does more profound writeback list handling in writeback_sb_inodes().
+ */
+static int
+writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+		       struct writeback_control *wbc)
+{
+	int ret = 0;
 
-	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
-	inode->i_state &= ~I_SYNC;
-	if (!(inode->i_state & I_FREEING)) {
+	if (!atomic_read(&inode->i_count))
+		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
+	else
+		WARN_ON(inode->i_state & I_WILL_FREE);
+
+	if (inode->i_state & I_SYNC) {
+		if (wbc->sync_mode != WB_SYNC_ALL)
+			goto out;
 		/*
-		 * Sync livelock prevention. Each inode is tagged and synced in
-		 * one shot. If still dirty, it will be redirty_tail()'ed below.
-		 * Update the dirty time to prevent enqueue and sync it again.
+		 * It's a data-integrity sync. We must wait. Since callers hold
+		 * inode reference or inode has I_WILL_FREE set, it cannot go
+		 * away under us.
 		 */
-		if ((inode->i_state & I_DIRTY) &&
-		    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
-			inode->dirtied_when = jiffies;
-
-		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-			/*
-			 * We didn't write back all the pages. nfs_writepages()
-			 * sometimes bales out without doing anything.
-			 */
-			inode->i_state |= I_DIRTY_PAGES;
-			if (wbc->nr_to_write <= 0) {
-				/*
-				 * slice used up: queue for next turn
-				 */
-				requeue_io(inode, wb);
-			} else {
-				/*
-				 * Writeback blocked by something other than
-				 * congestion. Delay the inode for some time to
-				 * avoid spinning on the CPU (100% iowait)
-				 * retrying writeback of the dirty page/inode
-				 * that cannot be performed immediately.
-				 */
-				redirty_tail(inode, wb);
-			}
-		} else if (inode->i_state & I_DIRTY) {
-			/*
-			 * Filesystems can dirty the inode during writeback
-			 * operations, such as delayed allocation during
-			 * submission or metadata updates after data IO
-			 * completion.
-			 */
-			redirty_tail(inode, wb);
-		} else {
-			/*
-			 * The inode is clean. At this point we either have
-			 * a reference to the inode or it's on it's way out.
-			 * No need to add it back to the LRU.
-			 */
-			list_del_init(&inode->i_wb_list);
-		}
+		__inode_wait_for_writeback(inode);
 	}
+	WARN_ON(inode->i_state & I_SYNC);
+	/*
+	 * Skip inode if it is clean. We don't want to mess with writeback
+	 * lists in this function since flusher thread may be doing for example
+	 * sync in parallel and if we move the inode, it could get skipped. So
+	 * here we make sure inode is on some writeback list and leave it there
+	 * unless we have completely cleaned the inode.
+	 */
+	if (!(inode->i_state & I_DIRTY))
+		goto out;
+	inode->i_state |= I_SYNC;
+	spin_unlock(&inode->i_lock);
+
+	ret = __writeback_single_inode(inode, wb, wbc);
+
+	spin_lock(&wb->list_lock);
+	spin_lock(&inode->i_lock);
+	/*
+	 * If inode is clean, remove it from writeback lists. Otherwise don't
+	 * touch it. See comment above for explanation.
+	 */
+	if (!(inode->i_state & I_DIRTY))
+		list_del_init(&inode->i_wb_list);
+	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
-	trace_writeback_single_inode(inode, wbc, nr_to_write);
+out:
+	spin_unlock(&inode->i_lock);
 	return ret;
 }
 
@@ -580,29 +638,57 @@ static long writeback_sb_inodes(struct super_block *sb,
 			redirty_tail(inode, wb);
 			continue;
 		}
-		__iget(inode);
+		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
+			/*
+			 * If this inode is locked for writeback and we are not
+			 * doing writeback-for-data-integrity, move it to
+			 * b_more_io so that writeback can proceed with the
+			 * other inodes on s_io.
+			 *
+			 * We'll have another go at writing back this inode
+			 * when we completed a full scan of b_io.
+			 */
+			spin_unlock(&inode->i_lock);
+			requeue_io(inode, wb);
+			trace_writeback_sb_inodes_requeue(inode);
+			continue;
+		}
+		spin_unlock(&wb->list_lock);
+
+		/*
+		 * We already requeued the inode if it had I_SYNC set and we
+		 * are doing WB_SYNC_NONE writeback. So this catches only the
+		 * WB_SYNC_ALL case.
+		 */
+		if (inode->i_state & I_SYNC) {
+			/* Wait for I_SYNC. This function drops i_lock... */
+			inode_sleep_on_writeback(inode);
+			/* Inode may be gone, start again */
+			continue;
+		}
+		inode->i_state |= I_SYNC;
+		spin_unlock(&inode->i_lock);
+
 		write_chunk = writeback_chunk_size(wb->bdi, work);
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
-		writeback_single_inode(inode, wb, &wbc);
+		/*
+		 * We use I_SYNC to pin the inode in memory. While it is set
+		 * evict_inode() will wait so the inode cannot be freed.
+		 */
+		__writeback_single_inode(inode, wb, &wbc);
 
 		work->nr_pages -= write_chunk - wbc.nr_to_write;
 		wrote += write_chunk - wbc.nr_to_write;
+		spin_lock(&wb->list_lock);
+		spin_lock(&inode->i_lock);
 		if (!(inode->i_state & I_DIRTY))
 			wrote++;
-		if (wbc.pages_skipped) {
-			/*
-			 * writeback is not making progress due to locked
-			 * buffers. Skip this inode for now.
-			 */
-			redirty_tail(inode, wb);
-		}
+		requeue_inode(inode, wb, &wbc);
+		inode_sync_complete(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&wb->list_lock);
-		iput(inode);
-		cond_resched();
-		spin_lock(&wb->list_lock);
+		cond_resched_lock(&wb->list_lock);
 		/*
 		 * bail out to wb_writeback() often enough to check
 		 * background threshold and other termination conditions.
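
The "We use I_SYNC to pin the inode in memory" comment above holds only because the eviction path waits for I_SYNC before the inode is torn down. That counterpart lives in fs/inode.c and is not part of this diff; the sketch below is a rough, simplified outline of how evict() uses the new inode_wait_for_writeback(), not a verbatim copy of that file.

/* Simplified outline of the eviction-side counterpart (fs/inode.c). Once
 * I_FREEING is set the flusher will not start new writeback on the inode,
 * so waiting for a running I_SYNC holder is enough to make freeing safe. */
static void evict(struct inode *inode)
{
	/* inode already has I_FREEING set and is off the sb inode list */
	inode_wait_for_writeback(inode);	/* wait for flusher to drop I_SYNC */
	/* ... call ->evict_inode(), truncate pages and free the inode ... */
}
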
@@ -796,8 +882,10 @@ static long wb_writeback(struct bdi_writeback *wb,
 			trace_writeback_wait(wb->bdi, work);
 			inode = wb_inode(wb->b_more_io.prev);
 			spin_lock(&inode->i_lock);
-			inode_wait_for_writeback(inode, wb);
-			spin_unlock(&inode->i_lock);
+			spin_unlock(&wb->list_lock);
+			/* This function drops i_lock... */
+			inode_sleep_on_writeback(inode);
+			spin_lock(&wb->list_lock);
 		}
 	}
 	spin_unlock(&wb->list_lock);
@@ -1331,7 +1419,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
 int write_inode_now(struct inode *inode, int sync)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-	int ret;
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
 		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -1343,12 +1430,7 @@ int write_inode_now(struct inode *inode, int sync)
 		wbc.nr_to_write = 0;
 
 	might_sleep();
-	spin_lock(&wb->list_lock);
-	spin_lock(&inode->i_lock);
-	ret = writeback_single_inode(inode, wb, &wbc);
-	spin_unlock(&inode->i_lock);
-	spin_unlock(&wb->list_lock);
-	return ret;
+	return writeback_single_inode(inode, wb, &wbc);
 }
 EXPORT_SYMBOL(write_inode_now);
 
@@ -1365,15 +1447,7 @@ EXPORT_SYMBOL(write_inode_now);
  */
 int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-	int ret;
-
-	spin_lock(&wb->list_lock);
-	spin_lock(&inode->i_lock);
-	ret = writeback_single_inode(inode, wb, wbc);
-	spin_unlock(&inode->i_lock);
-	spin_unlock(&wb->list_lock);
-	return ret;
+	return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
 }
 EXPORT_SYMBOL(sync_inode);
 
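
After this refactoring, single-inode writeback callers no longer take wb->list_lock or i_lock themselves: they fill in a writeback_control and call sync_inode() or write_inode_now(), and writeback_single_inode() handles I_SYNC and the locking internally. A usage sketch, modeled on the existing sync_inode_metadata() helper; the example_ name below is invented for illustration.

/* Push one inode's metadata to disk, waiting for completion if 'wait' is set.
 * nr_to_write == 0 means "no data pages, metadata only"; all locking and
 * I_SYNC handling happens inside writeback_single_inode(). */
static int example_sync_inode_metadata(struct inode *inode, int wait)
{
	struct writeback_control wbc = {
		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
		.nr_to_write = 0,	/* metadata-only */
	};

	return sync_inode(inode, &wbc);
}
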