aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Dave Chinner <david@fromorbit.com> 2010-01-10 18:51:45 -0500
committer: Alex Elder <aelder@sgi.com> 2010-01-15 14:43:55 -0500
commit: c8e20be020f234c8d492927a424a7d8bbefd5b5d (patch)
tree: ced84d52bf87d72d36a65e3ddca6b4fc4b7f819f
parent: 7284ce6c9f6153d1777df5f310c959724d1bd446 (diff)
xfs: reclaim inodes under a write lock
Make the inode tree reclaim walk exclusive to avoid races with concurrent sync walkers and lookups. This is a version of a patch posted by Christoph Hellwig that avoids all the code duplication.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c 154
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.h 2
-rw-r--r-- fs/xfs/quota/xfs_qm_syscalls.c 2
3 files changed, 71 insertions, 87 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6fed97a8cd3e..e19d25555c3f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -65,7 +65,6 @@ xfs_inode_ag_lookup(
65 * as the tree is sparse and a gang lookup walks to find 65 * as the tree is sparse and a gang lookup walks to find
66 * the number of objects requested. 66 * the number of objects requested.
67 */ 67 */
68 read_lock(&pag->pag_ici_lock);
69 if (tag == XFS_ICI_NO_TAG) { 68 if (tag == XFS_ICI_NO_TAG) {
70 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 69 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
71 (void **)&ip, *first_index, 1); 70 (void **)&ip, *first_index, 1);
@@ -74,7 +73,7 @@ xfs_inode_ag_lookup(
74 (void **)&ip, *first_index, 1, tag); 73 (void **)&ip, *first_index, 1, tag);
75 } 74 }
76 if (!nr_found) 75 if (!nr_found)
77 goto unlock; 76 return NULL;
78 77
79 /* 78 /*
80 * Update the index for the next lookup. Catch overflows 79 * Update the index for the next lookup. Catch overflows
@@ -84,13 +83,8 @@ xfs_inode_ag_lookup(
84 */ 83 */
85 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 84 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
86 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 85 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
87 goto unlock; 86 return NULL;
88
89 return ip; 87 return ip;
90
91unlock:
92 read_unlock(&pag->pag_ici_lock);
93 return NULL;
94} 88}
95 89
96STATIC int 90STATIC int
@@ -100,7 +94,8 @@ xfs_inode_ag_walk(
100 int (*execute)(struct xfs_inode *ip, 94 int (*execute)(struct xfs_inode *ip,
101 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
102 int flags, 96 int flags,
103 int tag) 97 int tag,
98 int exclusive)
104{ 99{
105 struct xfs_perag *pag = &mp->m_perag[ag]; 100 struct xfs_perag *pag = &mp->m_perag[ag];
106 uint32_t first_index; 101 uint32_t first_index;
@@ -114,10 +109,20 @@ restart:
114 int error = 0; 109 int error = 0;
115 xfs_inode_t *ip; 110 xfs_inode_t *ip;
116 111
112 if (exclusive)
113 write_lock(&pag->pag_ici_lock);
114 else
115 read_lock(&pag->pag_ici_lock);
117 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 116 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
118 if (!ip) 117 if (!ip) {
118 if (exclusive)
119 write_unlock(&pag->pag_ici_lock);
120 else
121 read_unlock(&pag->pag_ici_lock);
119 break; 122 break;
123 }
120 124
125 /* execute releases pag->pag_ici_lock */
121 error = execute(ip, pag, flags); 126 error = execute(ip, pag, flags);
122 if (error == EAGAIN) { 127 if (error == EAGAIN) {
123 skipped++; 128 skipped++;
@@ -125,9 +130,8 @@ restart:
125 } 130 }
126 if (error) 131 if (error)
127 last_error = error; 132 last_error = error;
128 /* 133
129 * bail out if the filesystem is corrupted. 134 /* bail out if the filesystem is corrupted. */
130 */
131 if (error == EFSCORRUPTED) 135 if (error == EFSCORRUPTED)
132 break; 136 break;
133 137
@@ -148,7 +152,8 @@ xfs_inode_ag_iterator(
148 int (*execute)(struct xfs_inode *ip, 152 int (*execute)(struct xfs_inode *ip,
149 struct xfs_perag *pag, int flags), 153 struct xfs_perag *pag, int flags),
150 int flags, 154 int flags,
151 int tag) 155 int tag,
156 int exclusive)
152{ 157{
153 int error = 0; 158 int error = 0;
154 int last_error = 0; 159 int last_error = 0;
@@ -157,7 +162,8 @@ xfs_inode_ag_iterator(
157 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 162 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
158 if (!mp->m_perag[ag].pag_ici_init) 163 if (!mp->m_perag[ag].pag_ici_init)
159 continue; 164 continue;
160 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); 165 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
166 exclusive);
161 if (error) { 167 if (error) {
162 last_error = error; 168 last_error = error;
163 if (error == EFSCORRUPTED) 169 if (error == EFSCORRUPTED)
@@ -181,11 +187,7 @@ xfs_sync_inode_valid(
181 return EFSCORRUPTED; 187 return EFSCORRUPTED;
182 } 188 }
183 189
184 /* 190 /* If we can't get a reference on the inode, it must be in reclaim. */
185 * If we can't get a reference on the inode, it must be in reclaim.
186 * Leave it for the reclaim code to flush. Also avoid inodes that
187 * haven't been fully initialised.
188 */
189 if (!igrab(inode)) { 191 if (!igrab(inode)) {
190 read_unlock(&pag->pag_ici_lock); 192 read_unlock(&pag->pag_ici_lock);
191 return ENOENT; 193 return ENOENT;
@@ -282,7 +284,7 @@ xfs_sync_data(
282 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 284 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
283 285
284 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 286 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
285 XFS_ICI_NO_TAG); 287 XFS_ICI_NO_TAG, 0);
286 if (error) 288 if (error)
287 return XFS_ERROR(error); 289 return XFS_ERROR(error);
288 290
@@ -304,7 +306,7 @@ xfs_sync_attr(
304 ASSERT((flags & ~SYNC_WAIT) == 0); 306 ASSERT((flags & ~SYNC_WAIT) == 0);
305 307
306 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 308 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
307 XFS_ICI_NO_TAG); 309 XFS_ICI_NO_TAG, 0);
308} 310}
309 311
310STATIC int 312STATIC int
@@ -664,60 +666,6 @@ xfs_syncd_stop(
664 kthread_stop(mp->m_sync_task); 666 kthread_stop(mp->m_sync_task);
665} 667}
666 668
667STATIC int
668xfs_reclaim_inode(
669 xfs_inode_t *ip,
670 int sync_mode)
671{
672 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
673
674 /* The hash lock here protects a thread in xfs_iget_core from
675 * racing with us on linking the inode back with a vnode.
676 * Once we have the XFS_IRECLAIM flag set it will not touch
677 * us.
678 */
679 write_lock(&pag->pag_ici_lock);
680 spin_lock(&ip->i_flags_lock);
681 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
682 !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
683 spin_unlock(&ip->i_flags_lock);
684 write_unlock(&pag->pag_ici_lock);
685 return -EAGAIN;
686 }
687 __xfs_iflags_set(ip, XFS_IRECLAIM);
688 spin_unlock(&ip->i_flags_lock);
689 write_unlock(&pag->pag_ici_lock);
690 xfs_put_perag(ip->i_mount, pag);
691
692 /*
693 * If the inode is still dirty, then flush it out. If the inode
694 * is not in the AIL, then it will be OK to flush it delwri as
695 * long as xfs_iflush() does not keep any references to the inode.
696 * We leave that decision up to xfs_iflush() since it has the
697 * knowledge of whether it's OK to simply do a delwri flush of
698 * the inode or whether we need to wait until the inode is
699 * pulled from the AIL.
700 * We get the flush lock regardless, though, just to make sure
701 * we don't free it while it is being flushed.
702 */
703 xfs_ilock(ip, XFS_ILOCK_EXCL);
704 xfs_iflock(ip);
705
706 /*
707 * In the case of a forced shutdown we rely on xfs_iflush() to
708 * wait for the inode to be unpinned before returning an error.
709 */
710 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
711 /* synchronize with xfs_iflush_done */
712 xfs_iflock(ip);
713 xfs_ifunlock(ip);
714 }
715
716 xfs_iunlock(ip, XFS_ILOCK_EXCL);
717 xfs_ireclaim(ip);
718 return 0;
719}
720
721void 669void
722__xfs_inode_set_reclaim_tag( 670__xfs_inode_set_reclaim_tag(
723 struct xfs_perag *pag, 671 struct xfs_perag *pag,
@@ -760,19 +708,55 @@ __xfs_inode_clear_reclaim_tag(
760} 708}
761 709
762STATIC int 710STATIC int
763xfs_reclaim_inode_now( 711xfs_reclaim_inode(
764 struct xfs_inode *ip, 712 struct xfs_inode *ip,
765 struct xfs_perag *pag, 713 struct xfs_perag *pag,
766 int flags) 714 int sync_mode)
767{ 715{
768 /* ignore if already under reclaim */ 716 /*
769 if (xfs_iflags_test(ip, XFS_IRECLAIM)) { 717 * The radix tree lock here protects a thread in xfs_iget from racing
770 read_unlock(&pag->pag_ici_lock); 718 * with us starting reclaim on the inode. Once we have the
719 * XFS_IRECLAIM flag set it will not touch us.
720 */
721 spin_lock(&ip->i_flags_lock);
722 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
723 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
724 /* ignore as it is already under reclaim */
725 spin_unlock(&ip->i_flags_lock);
726 write_unlock(&pag->pag_ici_lock);
771 return 0; 727 return 0;
772 } 728 }
773 read_unlock(&pag->pag_ici_lock); 729 __xfs_iflags_set(ip, XFS_IRECLAIM);
730 spin_unlock(&ip->i_flags_lock);
731 write_unlock(&pag->pag_ici_lock);
732
733 /*
734 * If the inode is still dirty, then flush it out. If the inode
735 * is not in the AIL, then it will be OK to flush it delwri as
736 * long as xfs_iflush() does not keep any references to the inode.
737 * We leave that decision up to xfs_iflush() since it has the
738 * knowledge of whether it's OK to simply do a delwri flush of
739 * the inode or whether we need to wait until the inode is
740 * pulled from the AIL.
741 * We get the flush lock regardless, though, just to make sure
742 * we don't free it while it is being flushed.
743 */
744 xfs_ilock(ip, XFS_ILOCK_EXCL);
745 xfs_iflock(ip);
774 746
775 return xfs_reclaim_inode(ip, flags); 747 /*
748 * In the case of a forced shutdown we rely on xfs_iflush() to
749 * wait for the inode to be unpinned before returning an error.
750 */
751 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
752 /* synchronize with xfs_iflush_done */
753 xfs_iflock(ip);
754 xfs_ifunlock(ip);
755 }
756
757 xfs_iunlock(ip, XFS_ILOCK_EXCL);
758 xfs_ireclaim(ip);
759 return 0;
776} 760}
777 761
778int 762int
@@ -780,6 +764,6 @@ xfs_reclaim_inodes(
780 xfs_mount_t *mp, 764 xfs_mount_t *mp,
781 int mode) 765 int mode)
782{ 766{
783 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, 767 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
784 XFS_ICI_RECLAIM_TAG); 768 XFS_ICI_RECLAIM_TAG, 1);
785} 769}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index a500b4d91835..ea932b43335d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 54int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
55int xfs_inode_ag_iterator(struct xfs_mount *mp, 55int xfs_inode_ag_iterator(struct xfs_mount *mp,
56 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 56 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
57 int flags, int tag); 57 int flags, int tag, int write_lock);
58 58
59#endif 59#endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 71af76fe8a23..873e07e29074 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
891 uint flags) 891 uint flags)
892{ 892{
893 ASSERT(mp->m_quotainfo); 893 ASSERT(mp->m_quotainfo);
894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); 894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
895} 895}
896 896
897/*------------------------------------------------------------------------*/ 897/*------------------------------------------------------------------------*/