diff options
author | Dave Chinner <david@fromorbit.com> | 2010-01-10 18:51:45 -0500 |
---|---|---|
committer | Alex Elder <aelder@sgi.com> | 2010-01-15 14:43:55 -0500 |
commit | c8e20be020f234c8d492927a424a7d8bbefd5b5d (patch) | |
tree | ced84d52bf87d72d36a65e3ddca6b4fc4b7f819f | |
parent | 7284ce6c9f6153d1777df5f310c959724d1bd446 (diff) |
xfs: reclaim inodes under a write lock
Make the inode tree reclaim walk exclusive to avoid races with
concurrent sync walkers and lookups. This is a version of a patch
posted by Christoph Hellwig that avoids all the code duplication.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 154 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.h | 2 | ||||
-rw-r--r-- | fs/xfs/quota/xfs_qm_syscalls.c | 2 |
3 files changed, 71 insertions, 87 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6fed97a8cd3e..e19d25555c3f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -65,7 +65,6 @@ xfs_inode_ag_lookup( | |||
65 | * as the tree is sparse and a gang lookup walks to find | 65 | * as the tree is sparse and a gang lookup walks to find |
66 | * the number of objects requested. | 66 | * the number of objects requested. |
67 | */ | 67 | */ |
68 | read_lock(&pag->pag_ici_lock); | ||
69 | if (tag == XFS_ICI_NO_TAG) { | 68 | if (tag == XFS_ICI_NO_TAG) { |
70 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, | 69 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, |
71 | (void **)&ip, *first_index, 1); | 70 | (void **)&ip, *first_index, 1); |
@@ -74,7 +73,7 @@ xfs_inode_ag_lookup( | |||
74 | (void **)&ip, *first_index, 1, tag); | 73 | (void **)&ip, *first_index, 1, tag); |
75 | } | 74 | } |
76 | if (!nr_found) | 75 | if (!nr_found) |
77 | goto unlock; | 76 | return NULL; |
78 | 77 | ||
79 | /* | 78 | /* |
80 | * Update the index for the next lookup. Catch overflows | 79 | * Update the index for the next lookup. Catch overflows |
@@ -84,13 +83,8 @@ xfs_inode_ag_lookup( | |||
84 | */ | 83 | */ |
85 | *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); | 84 | *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); |
86 | if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) | 85 | if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) |
87 | goto unlock; | 86 | return NULL; |
88 | |||
89 | return ip; | 87 | return ip; |
90 | |||
91 | unlock: | ||
92 | read_unlock(&pag->pag_ici_lock); | ||
93 | return NULL; | ||
94 | } | 88 | } |
95 | 89 | ||
96 | STATIC int | 90 | STATIC int |
@@ -100,7 +94,8 @@ xfs_inode_ag_walk( | |||
100 | int (*execute)(struct xfs_inode *ip, | 94 | int (*execute)(struct xfs_inode *ip, |
101 | struct xfs_perag *pag, int flags), | 95 | struct xfs_perag *pag, int flags), |
102 | int flags, | 96 | int flags, |
103 | int tag) | 97 | int tag, |
98 | int exclusive) | ||
104 | { | 99 | { |
105 | struct xfs_perag *pag = &mp->m_perag[ag]; | 100 | struct xfs_perag *pag = &mp->m_perag[ag]; |
106 | uint32_t first_index; | 101 | uint32_t first_index; |
@@ -114,10 +109,20 @@ restart: | |||
114 | int error = 0; | 109 | int error = 0; |
115 | xfs_inode_t *ip; | 110 | xfs_inode_t *ip; |
116 | 111 | ||
112 | if (exclusive) | ||
113 | write_lock(&pag->pag_ici_lock); | ||
114 | else | ||
115 | read_lock(&pag->pag_ici_lock); | ||
117 | ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); | 116 | ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); |
118 | if (!ip) | 117 | if (!ip) { |
118 | if (exclusive) | ||
119 | write_unlock(&pag->pag_ici_lock); | ||
120 | else | ||
121 | read_unlock(&pag->pag_ici_lock); | ||
119 | break; | 122 | break; |
123 | } | ||
120 | 124 | ||
125 | /* execute releases pag->pag_ici_lock */ | ||
121 | error = execute(ip, pag, flags); | 126 | error = execute(ip, pag, flags); |
122 | if (error == EAGAIN) { | 127 | if (error == EAGAIN) { |
123 | skipped++; | 128 | skipped++; |
@@ -125,9 +130,8 @@ restart: | |||
125 | } | 130 | } |
126 | if (error) | 131 | if (error) |
127 | last_error = error; | 132 | last_error = error; |
128 | /* | 133 | |
129 | * bail out if the filesystem is corrupted. | 134 | /* bail out if the filesystem is corrupted. */ |
130 | */ | ||
131 | if (error == EFSCORRUPTED) | 135 | if (error == EFSCORRUPTED) |
132 | break; | 136 | break; |
133 | 137 | ||
@@ -148,7 +152,8 @@ xfs_inode_ag_iterator( | |||
148 | int (*execute)(struct xfs_inode *ip, | 152 | int (*execute)(struct xfs_inode *ip, |
149 | struct xfs_perag *pag, int flags), | 153 | struct xfs_perag *pag, int flags), |
150 | int flags, | 154 | int flags, |
151 | int tag) | 155 | int tag, |
156 | int exclusive) | ||
152 | { | 157 | { |
153 | int error = 0; | 158 | int error = 0; |
154 | int last_error = 0; | 159 | int last_error = 0; |
@@ -157,7 +162,8 @@ xfs_inode_ag_iterator( | |||
157 | for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { | 162 | for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { |
158 | if (!mp->m_perag[ag].pag_ici_init) | 163 | if (!mp->m_perag[ag].pag_ici_init) |
159 | continue; | 164 | continue; |
160 | error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); | 165 | error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, |
166 | exclusive); | ||
161 | if (error) { | 167 | if (error) { |
162 | last_error = error; | 168 | last_error = error; |
163 | if (error == EFSCORRUPTED) | 169 | if (error == EFSCORRUPTED) |
@@ -181,11 +187,7 @@ xfs_sync_inode_valid( | |||
181 | return EFSCORRUPTED; | 187 | return EFSCORRUPTED; |
182 | } | 188 | } |
183 | 189 | ||
184 | /* | 190 | /* If we can't get a reference on the inode, it must be in reclaim. */ |
185 | * If we can't get a reference on the inode, it must be in reclaim. | ||
186 | * Leave it for the reclaim code to flush. Also avoid inodes that | ||
187 | * haven't been fully initialised. | ||
188 | */ | ||
189 | if (!igrab(inode)) { | 191 | if (!igrab(inode)) { |
190 | read_unlock(&pag->pag_ici_lock); | 192 | read_unlock(&pag->pag_ici_lock); |
191 | return ENOENT; | 193 | return ENOENT; |
@@ -282,7 +284,7 @@ xfs_sync_data( | |||
282 | ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); | 284 | ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); |
283 | 285 | ||
284 | error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, | 286 | error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, |
285 | XFS_ICI_NO_TAG); | 287 | XFS_ICI_NO_TAG, 0); |
286 | if (error) | 288 | if (error) |
287 | return XFS_ERROR(error); | 289 | return XFS_ERROR(error); |
288 | 290 | ||
@@ -304,7 +306,7 @@ xfs_sync_attr( | |||
304 | ASSERT((flags & ~SYNC_WAIT) == 0); | 306 | ASSERT((flags & ~SYNC_WAIT) == 0); |
305 | 307 | ||
306 | return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, | 308 | return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, |
307 | XFS_ICI_NO_TAG); | 309 | XFS_ICI_NO_TAG, 0); |
308 | } | 310 | } |
309 | 311 | ||
310 | STATIC int | 312 | STATIC int |
@@ -664,60 +666,6 @@ xfs_syncd_stop( | |||
664 | kthread_stop(mp->m_sync_task); | 666 | kthread_stop(mp->m_sync_task); |
665 | } | 667 | } |
666 | 668 | ||
667 | STATIC int | ||
668 | xfs_reclaim_inode( | ||
669 | xfs_inode_t *ip, | ||
670 | int sync_mode) | ||
671 | { | ||
672 | xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); | ||
673 | |||
674 | /* The hash lock here protects a thread in xfs_iget_core from | ||
675 | * racing with us on linking the inode back with a vnode. | ||
676 | * Once we have the XFS_IRECLAIM flag set it will not touch | ||
677 | * us. | ||
678 | */ | ||
679 | write_lock(&pag->pag_ici_lock); | ||
680 | spin_lock(&ip->i_flags_lock); | ||
681 | if (__xfs_iflags_test(ip, XFS_IRECLAIM) || | ||
682 | !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { | ||
683 | spin_unlock(&ip->i_flags_lock); | ||
684 | write_unlock(&pag->pag_ici_lock); | ||
685 | return -EAGAIN; | ||
686 | } | ||
687 | __xfs_iflags_set(ip, XFS_IRECLAIM); | ||
688 | spin_unlock(&ip->i_flags_lock); | ||
689 | write_unlock(&pag->pag_ici_lock); | ||
690 | xfs_put_perag(ip->i_mount, pag); | ||
691 | |||
692 | /* | ||
693 | * If the inode is still dirty, then flush it out. If the inode | ||
694 | * is not in the AIL, then it will be OK to flush it delwri as | ||
695 | * long as xfs_iflush() does not keep any references to the inode. | ||
696 | * We leave that decision up to xfs_iflush() since it has the | ||
697 | * knowledge of whether it's OK to simply do a delwri flush of | ||
698 | * the inode or whether we need to wait until the inode is | ||
699 | * pulled from the AIL. | ||
700 | * We get the flush lock regardless, though, just to make sure | ||
701 | * we don't free it while it is being flushed. | ||
702 | */ | ||
703 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
704 | xfs_iflock(ip); | ||
705 | |||
706 | /* | ||
707 | * In the case of a forced shutdown we rely on xfs_iflush() to | ||
708 | * wait for the inode to be unpinned before returning an error. | ||
709 | */ | ||
710 | if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { | ||
711 | /* synchronize with xfs_iflush_done */ | ||
712 | xfs_iflock(ip); | ||
713 | xfs_ifunlock(ip); | ||
714 | } | ||
715 | |||
716 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
717 | xfs_ireclaim(ip); | ||
718 | return 0; | ||
719 | } | ||
720 | |||
721 | void | 669 | void |
722 | __xfs_inode_set_reclaim_tag( | 670 | __xfs_inode_set_reclaim_tag( |
723 | struct xfs_perag *pag, | 671 | struct xfs_perag *pag, |
@@ -760,19 +708,55 @@ __xfs_inode_clear_reclaim_tag( | |||
760 | } | 708 | } |
761 | 709 | ||
762 | STATIC int | 710 | STATIC int |
763 | xfs_reclaim_inode_now( | 711 | xfs_reclaim_inode( |
764 | struct xfs_inode *ip, | 712 | struct xfs_inode *ip, |
765 | struct xfs_perag *pag, | 713 | struct xfs_perag *pag, |
766 | int flags) | 714 | int sync_mode) |
767 | { | 715 | { |
768 | /* ignore if already under reclaim */ | 716 | /* |
769 | if (xfs_iflags_test(ip, XFS_IRECLAIM)) { | 717 | * The radix tree lock here protects a thread in xfs_iget from racing |
770 | read_unlock(&pag->pag_ici_lock); | 718 | * with us starting reclaim on the inode. Once we have the |
719 | * XFS_IRECLAIM flag set it will not touch us. | ||
720 | */ | ||
721 | spin_lock(&ip->i_flags_lock); | ||
722 | ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); | ||
723 | if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { | ||
724 | /* ignore as it is already under reclaim */ | ||
725 | spin_unlock(&ip->i_flags_lock); | ||
726 | write_unlock(&pag->pag_ici_lock); | ||
771 | return 0; | 727 | return 0; |
772 | } | 728 | } |
773 | read_unlock(&pag->pag_ici_lock); | 729 | __xfs_iflags_set(ip, XFS_IRECLAIM); |
730 | spin_unlock(&ip->i_flags_lock); | ||
731 | write_unlock(&pag->pag_ici_lock); | ||
732 | |||
733 | /* | ||
734 | * If the inode is still dirty, then flush it out. If the inode | ||
735 | * is not in the AIL, then it will be OK to flush it delwri as | ||
736 | * long as xfs_iflush() does not keep any references to the inode. | ||
737 | * We leave that decision up to xfs_iflush() since it has the | ||
738 | * knowledge of whether it's OK to simply do a delwri flush of | ||
739 | * the inode or whether we need to wait until the inode is | ||
740 | * pulled from the AIL. | ||
741 | * We get the flush lock regardless, though, just to make sure | ||
742 | * we don't free it while it is being flushed. | ||
743 | */ | ||
744 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
745 | xfs_iflock(ip); | ||
774 | 746 | ||
775 | return xfs_reclaim_inode(ip, flags); | 747 | /* |
748 | * In the case of a forced shutdown we rely on xfs_iflush() to | ||
749 | * wait for the inode to be unpinned before returning an error. | ||
750 | */ | ||
751 | if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { | ||
752 | /* synchronize with xfs_iflush_done */ | ||
753 | xfs_iflock(ip); | ||
754 | xfs_ifunlock(ip); | ||
755 | } | ||
756 | |||
757 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
758 | xfs_ireclaim(ip); | ||
759 | return 0; | ||
776 | } | 760 | } |
777 | 761 | ||
778 | int | 762 | int |
@@ -780,6 +764,6 @@ xfs_reclaim_inodes( | |||
780 | xfs_mount_t *mp, | 764 | xfs_mount_t *mp, |
781 | int mode) | 765 | int mode) |
782 | { | 766 | { |
783 | return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, | 767 | return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, |
784 | XFS_ICI_RECLAIM_TAG); | 768 | XFS_ICI_RECLAIM_TAG, 1); |
785 | } | 769 | } |
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index a500b4d91835..ea932b43335d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, | |||
54 | int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); | 54 | int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); |
55 | int xfs_inode_ag_iterator(struct xfs_mount *mp, | 55 | int xfs_inode_ag_iterator(struct xfs_mount *mp, |
56 | int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), | 56 | int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), |
57 | int flags, int tag); | 57 | int flags, int tag, int write_lock); |
58 | 58 | ||
59 | #endif | 59 | #endif |
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 71af76fe8a23..873e07e29074 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes( | |||
891 | uint flags) | 891 | uint flags) |
892 | { | 892 | { |
893 | ASSERT(mp->m_quotainfo); | 893 | ASSERT(mp->m_quotainfo); |
894 | xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); | 894 | xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); |
895 | } | 895 | } |
896 | 896 | ||
897 | /*------------------------------------------------------------------------*/ | 897 | /*------------------------------------------------------------------------*/ |