aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/linux-2.6/xfs_sync.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c230
1 files changed, 117 insertions, 113 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 594cd822d84d..e4f9c1b0836c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -39,6 +40,8 @@
39#include <linux/kthread.h> 40#include <linux/kthread.h>
40#include <linux/freezer.h> 41#include <linux/freezer.h>
41 42
43struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
44
42/* 45/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and 46 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between 47 * radix tree lookups to a minimum. The batch size is a trade off between
@@ -401,7 +404,7 @@ xfs_quiesce_fs(
401/* 404/*
402 * Second stage of a quiesce. The data is already synced, now we have to take 405 * Second stage of a quiesce. The data is already synced, now we have to take
403 * care of the metadata. New transactions are already blocked, so we need to 406 * care of the metadata. New transactions are already blocked, so we need to
404 * wait for any remaining transactions to drain out before proceding. 407 * wait for any remaining transactions to drain out before proceeding.
405 */ 408 */
406void 409void
407xfs_quiesce_attr( 410xfs_quiesce_attr(
@@ -431,62 +434,12 @@ xfs_quiesce_attr(
431 xfs_unmountfs_writesb(mp); 434 xfs_unmountfs_writesb(mp);
432} 435}
433 436
434/* 437static void
435 * Enqueue a work item to be picked up by the vfs xfssyncd thread. 438xfs_syncd_queue_sync(
436 * Doing this has two advantages: 439 struct xfs_mount *mp)
437 * - It saves on stack space, which is tight in certain situations
438 * - It can be used (with care) as a mechanism to avoid deadlocks.
439 * Flushing while allocating in a full filesystem requires both.
440 */
441STATIC void
442xfs_syncd_queue_work(
443 struct xfs_mount *mp,
444 void *data,
445 void (*syncer)(struct xfs_mount *, void *),
446 struct completion *completion)
447{
448 struct xfs_sync_work *work;
449
450 work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
451 INIT_LIST_HEAD(&work->w_list);
452 work->w_syncer = syncer;
453 work->w_data = data;
454 work->w_mount = mp;
455 work->w_completion = completion;
456 spin_lock(&mp->m_sync_lock);
457 list_add_tail(&work->w_list, &mp->m_sync_list);
458 spin_unlock(&mp->m_sync_lock);
459 wake_up_process(mp->m_sync_task);
460}
461
462/*
463 * Flush delayed allocate data, attempting to free up reserved space
464 * from existing allocations. At this point a new allocation attempt
465 * has failed with ENOSPC and we are in the process of scratching our
466 * heads, looking about for more room...
467 */
468STATIC void
469xfs_flush_inodes_work(
470 struct xfs_mount *mp,
471 void *arg)
472{
473 struct inode *inode = arg;
474 xfs_sync_data(mp, SYNC_TRYLOCK);
475 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
476 iput(inode);
477}
478
479void
480xfs_flush_inodes(
481 xfs_inode_t *ip)
482{ 440{
483 struct inode *inode = VFS_I(ip); 441 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
484 DECLARE_COMPLETION_ONSTACK(completion); 442 msecs_to_jiffies(xfs_syncd_centisecs * 10));
485
486 igrab(inode);
487 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
488 wait_for_completion(&completion);
489 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
490} 443}
491 444
492/* 445/*
@@ -496,9 +449,10 @@ xfs_flush_inodes(
496 */ 449 */
497STATIC void 450STATIC void
498xfs_sync_worker( 451xfs_sync_worker(
499 struct xfs_mount *mp, 452 struct work_struct *work)
500 void *unused)
501{ 453{
454 struct xfs_mount *mp = container_of(to_delayed_work(work),
455 struct xfs_mount, m_sync_work);
502 int error; 456 int error;
503 457
504 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 458 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
@@ -508,73 +462,106 @@ xfs_sync_worker(
508 error = xfs_fs_log_dummy(mp); 462 error = xfs_fs_log_dummy(mp);
509 else 463 else
510 xfs_log_force(mp, 0); 464 xfs_log_force(mp, 0);
511 xfs_reclaim_inodes(mp, 0);
512 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 465 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
466
467 /* start pushing all the metadata that is currently dirty */
468 xfs_ail_push_all(mp->m_ail);
513 } 469 }
514 mp->m_sync_seq++; 470
515 wake_up(&mp->m_wait_single_sync_task); 471 /* queue us up again */
472 xfs_syncd_queue_sync(mp);
516} 473}
517 474
518STATIC int 475/*
519xfssyncd( 476 * Queue a new inode reclaim pass if there are reclaimable inodes and there
520 void *arg) 477 * isn't a reclaim pass already in progress. By default it runs every 5s based
478 * on the xfs syncd work default of 30s. Perhaps this should have it's own
479 * tunable, but that can be done if this method proves to be ineffective or too
480 * aggressive.
481 */
482static void
483xfs_syncd_queue_reclaim(
484 struct xfs_mount *mp)
521{ 485{
522 struct xfs_mount *mp = arg;
523 long timeleft;
524 xfs_sync_work_t *work, *n;
525 LIST_HEAD (tmp);
526
527 set_freezable();
528 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
529 for (;;) {
530 if (list_empty(&mp->m_sync_list))
531 timeleft = schedule_timeout_interruptible(timeleft);
532 /* swsusp */
533 try_to_freeze();
534 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
535 break;
536 486
537 spin_lock(&mp->m_sync_lock); 487 /*
538 /* 488 * We can have inodes enter reclaim after we've shut down the syncd
539 * We can get woken by laptop mode, to do a sync - 489 * workqueue during unmount, so don't allow reclaim work to be queued
540 * that's the (only!) case where the list would be 490 * during unmount.
541 * empty with time remaining. 491 */
542 */ 492 if (!(mp->m_super->s_flags & MS_ACTIVE))
543 if (!timeleft || list_empty(&mp->m_sync_list)) { 493 return;
544 if (!timeleft)
545 timeleft = xfs_syncd_centisecs *
546 msecs_to_jiffies(10);
547 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
548 list_add_tail(&mp->m_sync_work.w_list,
549 &mp->m_sync_list);
550 }
551 list_splice_init(&mp->m_sync_list, &tmp);
552 spin_unlock(&mp->m_sync_lock);
553 494
554 list_for_each_entry_safe(work, n, &tmp, w_list) { 495 rcu_read_lock();
555 (*work->w_syncer)(mp, work->w_data); 496 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
556 list_del(&work->w_list); 497 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
557 if (work == &mp->m_sync_work) 498 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
558 continue;
559 if (work->w_completion)
560 complete(work->w_completion);
561 kmem_free(work);
562 }
563 } 499 }
500 rcu_read_unlock();
501}
564 502
565 return 0; 503/*
504 * This is a fast pass over the inode cache to try to get reclaim moving on as
505 * many inodes as possible in a short period of time. It kicks itself every few
506 * seconds, as well as being kicked by the inode cache shrinker when memory
507 * goes low. It scans as quickly as possible avoiding locked inodes or those
508 * already being flushed, and once done schedules a future pass.
509 */
510STATIC void
511xfs_reclaim_worker(
512 struct work_struct *work)
513{
514 struct xfs_mount *mp = container_of(to_delayed_work(work),
515 struct xfs_mount, m_reclaim_work);
516
517 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
518 xfs_syncd_queue_reclaim(mp);
519}
520
521/*
522 * Flush delayed allocate data, attempting to free up reserved space
523 * from existing allocations. At this point a new allocation attempt
524 * has failed with ENOSPC and we are in the process of scratching our
525 * heads, looking about for more room.
526 *
527 * Queue a new data flush if there isn't one already in progress and
528 * wait for completion of the flush. This means that we only ever have one
529 * inode flush in progress no matter how many ENOSPC events are occurring and
530 * so will prevent the system from bogging down due to every concurrent
531 * ENOSPC event scanning all the active inodes in the system for writeback.
532 */
533void
534xfs_flush_inodes(
535 struct xfs_inode *ip)
536{
537 struct xfs_mount *mp = ip->i_mount;
538
539 queue_work(xfs_syncd_wq, &mp->m_flush_work);
540 flush_work_sync(&mp->m_flush_work);
541}
542
543STATIC void
544xfs_flush_worker(
545 struct work_struct *work)
546{
547 struct xfs_mount *mp = container_of(work,
548 struct xfs_mount, m_flush_work);
549
550 xfs_sync_data(mp, SYNC_TRYLOCK);
551 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
566} 552}
567 553
568int 554int
569xfs_syncd_init( 555xfs_syncd_init(
570 struct xfs_mount *mp) 556 struct xfs_mount *mp)
571{ 557{
572 mp->m_sync_work.w_syncer = xfs_sync_worker; 558 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
573 mp->m_sync_work.w_mount = mp; 559 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
574 mp->m_sync_work.w_completion = NULL; 560 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
575 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); 561
576 if (IS_ERR(mp->m_sync_task)) 562 xfs_syncd_queue_sync(mp);
577 return -PTR_ERR(mp->m_sync_task); 563 xfs_syncd_queue_reclaim(mp);
564
578 return 0; 565 return 0;
579} 566}
580 567
@@ -582,7 +569,9 @@ void
582xfs_syncd_stop( 569xfs_syncd_stop(
583 struct xfs_mount *mp) 570 struct xfs_mount *mp)
584{ 571{
585 kthread_stop(mp->m_sync_task); 572 cancel_delayed_work_sync(&mp->m_sync_work);
573 cancel_delayed_work_sync(&mp->m_reclaim_work);
574 cancel_work_sync(&mp->m_flush_work);
586} 575}
587 576
588void 577void
@@ -601,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
601 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 590 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
602 XFS_ICI_RECLAIM_TAG); 591 XFS_ICI_RECLAIM_TAG);
603 spin_unlock(&ip->i_mount->m_perag_lock); 592 spin_unlock(&ip->i_mount->m_perag_lock);
593
594 /* schedule periodic background inode reclaim */
595 xfs_syncd_queue_reclaim(ip->i_mount);
596
604 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 597 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
605 -1, _RET_IP_); 598 -1, _RET_IP_);
606 } 599 }
@@ -1017,7 +1010,13 @@ xfs_reclaim_inodes(
1017} 1010}
1018 1011
1019/* 1012/*
1020 * Shrinker infrastructure. 1013 * Inode cache shrinker.
1014 *
1015 * When called we make sure that there is a background (fast) inode reclaim in
1016 * progress, while we will throttle the speed of reclaim via doiing synchronous
1017 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1018 * them to be cleaned, which we hope will not be very long due to the
1019 * background walker having already kicked the IO off on those dirty inodes.
1021 */ 1020 */
1022static int 1021static int
1023xfs_reclaim_inode_shrink( 1022xfs_reclaim_inode_shrink(
@@ -1032,10 +1031,15 @@ xfs_reclaim_inode_shrink(
1032 1031
1033 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 1032 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
1034 if (nr_to_scan) { 1033 if (nr_to_scan) {
1034 /* kick background reclaimer and push the AIL */
1035 xfs_syncd_queue_reclaim(mp);
1036 xfs_ail_push_all(mp->m_ail);
1037
1035 if (!(gfp_mask & __GFP_FS)) 1038 if (!(gfp_mask & __GFP_FS))
1036 return -1; 1039 return -1;
1037 1040
1038 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); 1041 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
1042 &nr_to_scan);
1039 /* terminate if we don't exhaust the scan */ 1043 /* terminate if we don't exhaust the scan */
1040 if (nr_to_scan > 0) 1044 if (nr_to_scan > 0)
1041 return -1; 1045 return -1;