Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
 fs/xfs/linux-2.6/xfs_sync.c | 265
 1 file changed, 146 insertions(+), 119 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index e22f0057d21f..e4f9c1b0836c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -39,6 +40,8 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+struct workqueue_struct *xfs_syncd_wq;  /* sync workqueue */
+
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
  * radix tree lookups to a minimum. The batch size is a trade off between
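Note that the hunk above only declares the new xfs_syncd_wq pointer; the workqueue itself is allocated elsewhere in the series, outside this file's diff, so the flags it is created with are not visible here. For orientation only, a dedicated workqueue of this kind is normally created and destroyed with the generic API shown in this small sketch; the demo_* names and the flag choice are invented for illustration, not taken from XFS:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;        /* stands in for xfs_syncd_wq */

static int demo_wq_init(void)
{
        /*
         * WQ_MEM_RECLAIM gives the queue a rescuer thread so queued work can
         * still make progress under memory pressure; a max_active of 0 means
         * "use the default concurrency".
         */
        demo_wq = alloc_workqueue("demo_syncd", WQ_MEM_RECLAIM, 0);
        if (!demo_wq)
                return -ENOMEM;
        return 0;
}

static void demo_wq_exit(void)
{
        destroy_workqueue(demo_wq);     /* drains and frees the queue */
}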
@@ -401,7 +404,7 @@ xfs_quiesce_fs(
 /*
  * Second stage of a quiesce. The data is already synced, now we have to take
  * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceding.
+ * wait for any remaining transactions to drain out before proceeding.
  */
 void
 xfs_quiesce_attr(
@@ -425,69 +428,18 @@ xfs_quiesce_attr(
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp, 1);
 	if (error)
-		xfs_fs_cmn_err(CE_WARN, mp,
-			"xfs_attr_quiesce: failed to log sb changes. "
+		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
 			"Frozen image may not be consistent.");
 	xfs_log_unmount_write(mp);
 	xfs_unmountfs_writesb(mp);
 }
 
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
-	struct xfs_mount *mp,
-	void		*data,
-	void		(*syncer)(struct xfs_mount *, void *),
-	struct completion *completion)
-{
-	struct xfs_sync_work *work;
-
-	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
-	INIT_LIST_HEAD(&work->w_list);
-	work->w_syncer = syncer;
-	work->w_data = data;
-	work->w_mount = mp;
-	work->w_completion = completion;
-	spin_lock(&mp->m_sync_lock);
-	list_add_tail(&work->w_list, &mp->m_sync_list);
-	spin_unlock(&mp->m_sync_lock);
-	wake_up_process(mp->m_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inodes_work(
-	struct xfs_mount *mp,
-	void		*arg)
-{
-	struct inode	*inode = arg;
-	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-	iput(inode);
-}
-
-void
-xfs_flush_inodes(
-	xfs_inode_t	*ip)
+static void
+xfs_syncd_queue_sync(
+	struct xfs_mount	*mp)
 {
-	struct inode	*inode = VFS_I(ip);
-	DECLARE_COMPLETION_ONSTACK(completion);
-
-	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
-	wait_for_completion(&completion);
-	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
+	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
+				msecs_to_jiffies(xfs_syncd_centisecs * 10));
 }
 
 /*
@@ -497,9 +449,10 @@ xfs_flush_inodes(
  */
 STATIC void
 xfs_sync_worker(
-	struct xfs_mount *mp,
-	void		*unused)
+	struct work_struct *work)
 {
+	struct xfs_mount	*mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_sync_work);
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
@@ -509,73 +462,106 @@ xfs_sync_worker(
 			error = xfs_fs_log_dummy(mp);
 		else
 			xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, 0);
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
+
+		/* start pushing all the metadata that is currently dirty */
+		xfs_ail_push_all(mp->m_ail);
 	}
-	mp->m_sync_seq++;
-	wake_up(&mp->m_wait_single_sync_task);
+
+	/* queue us up again */
+	xfs_syncd_queue_sync(mp);
 }
 
-STATIC int
-xfssyncd(
-	void			*arg)
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_syncd_queue_reclaim(
+	struct xfs_mount	*mp)
 {
-	struct xfs_mount	*mp = arg;
-	long			timeleft;
-	xfs_sync_work_t		*work, *n;
-	LIST_HEAD		(tmp);
-
-	set_freezable();
-	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
-	for (;;) {
-		if (list_empty(&mp->m_sync_list))
-			timeleft = schedule_timeout_interruptible(timeleft);
-		/* swsusp */
-		try_to_freeze();
-		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
-			break;
 
-		spin_lock(&mp->m_sync_lock);
-		/*
-		 * We can get woken by laptop mode, to do a sync -
-		 * that's the (only!) case where the list would be
-		 * empty with time remaining.
-		 */
-		if (!timeleft || list_empty(&mp->m_sync_list)) {
-			if (!timeleft)
-				timeleft = xfs_syncd_centisecs *
-						msecs_to_jiffies(10);
-			INIT_LIST_HEAD(&mp->m_sync_work.w_list);
-			list_add_tail(&mp->m_sync_work.w_list,
-					&mp->m_sync_list);
-		}
-		list_splice_init(&mp->m_sync_list, &tmp);
-		spin_unlock(&mp->m_sync_lock);
+	/*
+	 * We can have inodes enter reclaim after we've shut down the syncd
+	 * workqueue during unmount, so don't allow reclaim work to be queued
+	 * during unmount.
+	 */
+	if (!(mp->m_super->s_flags & MS_ACTIVE))
+		return;
 
-		list_for_each_entry_safe(work, n, &tmp, w_list) {
-			(*work->w_syncer)(mp, work->w_data);
-			list_del(&work->w_list);
-			if (work == &mp->m_sync_work)
-				continue;
-			if (work->w_completion)
-				complete(work->w_completion);
-			kmem_free(work);
-		}
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
+	rcu_read_unlock();
+}
 
-	return 0;
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+STATIC void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_syncd_queue_reclaim(mp);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations. At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room.
+ *
+ * Queue a new data flush if there isn't one already in progress and
+ * wait for completion of the flush. This means that we only ever have one
+ * inode flush in progress no matter how many ENOSPC events are occurring and
+ * so will prevent the system from bogging down due to every concurrent
+ * ENOSPC event scanning all the active inodes in the system for writeback.
+ */
+void
+xfs_flush_inodes(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	queue_work(xfs_syncd_wq, &mp->m_flush_work);
+	flush_work_sync(&mp->m_flush_work);
+}
+
+STATIC void
+xfs_flush_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(work,
+					struct xfs_mount, m_flush_work);
+
+	xfs_sync_data(mp, SYNC_TRYLOCK);
+	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 }
 
 int
 xfs_syncd_init(
 	struct xfs_mount	*mp)
 {
-	mp->m_sync_work.w_syncer = xfs_sync_worker;
-	mp->m_sync_work.w_mount = mp;
-	mp->m_sync_work.w_completion = NULL;
-	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
-	if (IS_ERR(mp->m_sync_task))
-		return -PTR_ERR(mp->m_sync_task);
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+	xfs_syncd_queue_sync(mp);
+	xfs_syncd_queue_reclaim(mp);
+
 	return 0;
 }
 
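For reference on the periods used above: assuming the stock sysctl default of xfs_syncd_centisecs = 3000 (30 seconds), msecs_to_jiffies(xfs_syncd_centisecs * 10) arms m_sync_work every 3000 * 10 ms = 30 s, while msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10) arms m_reclaim_work every 3000 / 6 * 10 ms = 5000 ms, which is the "every 5s based on the xfs syncd work default of 30s" that the new comment describes.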
@@ -583,7 +569,9 @@ void
 xfs_syncd_stop(
 	struct xfs_mount	*mp)
 {
-	kthread_stop(mp->m_sync_task);
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
 }
 
 void
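Taken together, xfs_syncd_init(), xfs_sync_worker()/xfs_syncd_queue_sync() and xfs_syncd_stop() now follow the standard self-rearming delayed-work pattern. The sketch below is a minimal stand-alone illustration of that pattern using only the generic workqueue API; the demo_* names, the use of system_wq and the fixed 30-second period are placeholders for this sketch, not XFS code:

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct demo_mount {
        struct delayed_work     sync_work;      /* plays the role of m_sync_work */
};

static void demo_queue_sync(struct demo_mount *dm)
{
        /* (re)arm the periodic work; system_wq stands in for xfs_syncd_wq */
        queue_delayed_work(system_wq, &dm->sync_work,
                           msecs_to_jiffies(30 * 1000));
}

static void demo_sync_worker(struct work_struct *work)
{
        struct demo_mount *dm = container_of(to_delayed_work(work),
                                             struct demo_mount, sync_work);

        /* ... periodic sync work for dm goes here ... */

        /* queue us up again, as xfs_sync_worker() now does */
        demo_queue_sync(dm);
}

static void demo_start(struct demo_mount *dm)
{
        INIT_DELAYED_WORK(&dm->sync_work, demo_sync_worker);
        demo_queue_sync(dm);
}

static void demo_stop(struct demo_mount *dm)
{
        /* cancel a pending run and wait for a running one to finish */
        cancel_delayed_work_sync(&dm->sync_work);
}

One design note visible in the diff itself: because the reclaim work can also be queued from the inode tagging path, xfs_syncd_queue_reclaim() additionally checks MS_ACTIVE so that nothing gets re-queued once xfs_syncd_stop() has torn the work items down during unmount.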
@@ -602,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
 			XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 			XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_syncd_queue_reclaim(ip->i_mount);
+
 		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 							-1, _RET_IP_);
 	}
@@ -762,8 +754,10 @@ xfs_reclaim_inode(
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
-	int	error = 0;
+	int	error;
 
+restart:
+	error = 0;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (!xfs_iflock_nowait(ip)) {
 		if (!(sync_mode & SYNC_WAIT))
@@ -789,9 +783,31 @@ xfs_reclaim_inode(
 	if (xfs_inode_clean(ip))
 		goto reclaim;
 
-	/* Now we have an inode that needs flushing */
-	error = xfs_iflush(ip, sync_mode);
+	/*
+	 * Now we have an inode that needs flushing.
+	 *
+	 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
+	 * reclaim as we can deadlock with inode cluster removal.
+	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
+	 * ip->i_lock, and we are doing the exact opposite here. As a result,
+	 * doing a blocking xfs_itobp() to get the cluster buffer will result
+	 * in an ABBA deadlock with xfs_ifree_cluster().
+	 *
+	 * As xfs_ifree_cluser() must gather all inodes that are active in the
+	 * cache to mark them stale, if we hit this case we don't actually want
+	 * to do IO here - we want the inode marked stale so we can simply
+	 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
+	 * just unlock the inode, back off and try again. Hopefully the next
+	 * pass through will see the stale flag set on the inode.
+	 */
+	error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
 	if (sync_mode & SYNC_WAIT) {
+		if (error == EAGAIN) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			/* backoff longer than in xfs_ifree_cluster */
+			delay(2);
+			goto restart;
+		}
 		xfs_iflock(ip);
 		goto reclaim;
 	}
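The restart/backoff added here is the usual escape from an ABBA ordering problem when one path cannot change its lock order: acquire the second lock with a trylock and, on failure, drop what you hold, back off briefly and start over. A generic sketch of that shape, using plain mutexes and invented names rather than the XFS inode and buffer locks:

#include <linux/delay.h>
#include <linux/mutex.h>

/* Path one (cf. xfs_ifree_cluster): takes A then B and may block on both. */
static void path_one(struct mutex *a, struct mutex *b)
{
        mutex_lock(a);
        mutex_lock(b);
        /* ... work under both locks ... */
        mutex_unlock(b);
        mutex_unlock(a);
}

/*
 * Path two (cf. xfs_reclaim_inode): needs the same two locks but takes B
 * first.  Blocking on A while holding B could deadlock against path_one(),
 * so A is only trylocked; on failure everything is dropped, we sleep
 * briefly and restart, mirroring the EAGAIN + delay(2) + goto restart above.
 */
static void path_two(struct mutex *a, struct mutex *b)
{
restart:
        mutex_lock(b);
        if (!mutex_trylock(a)) {
                mutex_unlock(b);
                msleep(2);      /* arbitrary short backoff for the sketch */
                goto restart;
        }
        /* ... work under both locks ... */
        mutex_unlock(a);
        mutex_unlock(b);
}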
@@ -806,7 +822,7 @@ xfs_reclaim_inode(
 	 * pass on the error.
 	 */
 	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+		xfs_warn(ip->i_mount,
 			"inode 0x%llx background reclaim flush failed with %d",
 			(long long)ip->i_ino, error);
 	}
@@ -994,7 +1010,13 @@ xfs_reclaim_inodes(
 }
 
 /*
- * Shrinker infrastructure.
+ * Inode cache shrinker.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doiing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
  */
 static int
 xfs_reclaim_inode_shrink(
@@ -1009,10 +1031,15 @@ xfs_reclaim_inode_shrink(
 
 	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
+		/* kick background reclaimer and push the AIL */
+		xfs_syncd_queue_reclaim(mp);
+		xfs_ail_push_all(mp->m_ail);
+
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
+		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
+					&nr_to_scan);
 		/* terminate if we don't exhaust the scan */
 		if (nr_to_scan > 0)
 			return -1;