author		David Chinner <david@fromorbit.com>	2008-10-30 02:07:29 -0400
committer	Lachlan McIlroy <lachlan@sgi.com>	2008-10-30 02:07:29 -0400
commit		683a897080a053733778b36398186cb1b22c377f (patch)
tree		be084696bf3925a0bbeb37edf25e05168891732b /fs
parent		2f8a3ce1c20f20e6494cdb77fed76bc474ca3ca5 (diff)
[XFS] Use the inode tree for finding dirty inodes
Update xfs_sync_inodes to walk the inode radix tree cache to find dirty
inodes. This removes a huge bunch of nasty, messy code for traversing the
mount inode list safely and removes another user of the mount inode list.
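
The core of the new walk is a cursor-based pass over the per-AG inode radix tree: ask radix_tree_gang_lookup() for a single object starting at the cursor, then advance the cursor past the inode it returned. As a rough standalone illustration of that pattern (a minimal sketch, not the patch itself; the per-inode work is elided and the helper name walk_ag_inodes is invented for illustration):

	static void
	walk_ag_inodes(
		xfs_mount_t	*mp,
		xfs_perag_t	*pag)
	{
		xfs_inode_t	*ip;
		int		nr_found;
		int		first_index = 0;

		do {
			read_lock(&pag->pag_ici_lock);
			/* ask for one object; the lookup skips over sparse gaps */
			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)&ip, first_index, 1);
			if (!nr_found) {
				/* no inodes at or beyond the cursor - all done */
				read_unlock(&pag->pag_ici_lock);
				break;
			}
			/* advance the cursor past the inode just returned */
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			read_unlock(&pag->pag_ici_lock);

			/* ... per-inode work goes here, as in xfs_sync_inodes_ag ... */
		} while (nr_found);
	}

Because the cursor only moves forward, the loop terminates even if inodes are inserted or removed concurrently, and looking up a single object at a time lets the walker drop the tree lock before doing any blocking work on the inode it found.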
Version 3
o rediff against new linux-2.6/xfs_sync.c code

Version 2
o add comment explaining use of gang lookups for a single inode
o use IRELE, not VN_RELE
o move check for ag initialisation to caller.
SGI-PV: 988139
SGI-Modid: xfs-linux-melb:xfs-kern:32290a
Signed-off-by: David Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Diffstat (limited to 'fs')
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	361
1 file changed, 101 insertions(+), 260 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index cd82ba523dc4..53d85ecb1d50 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -121,356 +121,197 @@ xfs_sync(
 }
 
 /*
- * xfs sync routine for internal use
- *
- * This routine supports all of the flags defined for the generic vfs_sync
- * interface as explained above under xfs_sync.
- *
+ * Sync all the inodes in the given AG according to the
+ * direction given by the flags.
  */
-int
-xfs_sync_inodes(
+STATIC int
+xfs_sync_inodes_ag(
 	xfs_mount_t	*mp,
+	int		ag,
 	int		flags,
 	int		*bypassed)
 {
 	xfs_inode_t	*ip = NULL;
 	struct inode	*vp = NULL;
-	int		error;
-	int		last_error;
-	uint64_t	fflag;
-	uint		lock_flags;
-	uint		base_lock_flags;
-	boolean_t	mount_locked;
-	boolean_t	vnode_refed;
-	int		preempt;
-	xfs_iptr_t	*ipointer;
-#ifdef DEBUG
-	boolean_t	ipointer_in = B_FALSE;
-
-#define IPOINTER_SET	ipointer_in = B_TRUE
-#define IPOINTER_CLR	ipointer_in = B_FALSE
-#else
-#define IPOINTER_SET
-#define IPOINTER_CLR
-#endif
-
-
-/* Insert a marker record into the inode list after inode ip. The list
- * must be locked when this is called. After the call the list will no
- * longer be locked.
- */
-#define IPOINTER_INSERT(ip, mp)	{ \
-		ASSERT(ipointer_in == B_FALSE); \
-		ipointer->ip_mnext = ip->i_mnext; \
-		ipointer->ip_mprev = ip; \
-		ip->i_mnext = (xfs_inode_t *)ipointer; \
-		ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
-		preempt = 0; \
-		XFS_MOUNT_IUNLOCK(mp); \
-		mount_locked = B_FALSE; \
-		IPOINTER_SET; \
-	}
-
-/* Remove the marker from the inode list. If the marker was the only item
- * in the list then there are no remaining inodes and we should zero out
- * the whole list. If we are the current head of the list then move the head
- * past us.
- */
-#define IPOINTER_REMOVE(ip, mp)	{ \
-		ASSERT(ipointer_in == B_TRUE); \
-		if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
-			ip = ipointer->ip_mnext; \
-			ip->i_mprev = ipointer->ip_mprev; \
-			ipointer->ip_mprev->i_mnext = ip; \
-			if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
-				mp->m_inodes = ip; \
-			} \
-		} else { \
-			ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
-			mp->m_inodes = NULL; \
-			ip = NULL; \
-		} \
-		IPOINTER_CLR; \
-	}
-
-#define XFS_PREEMPT_MASK	0x7f
-
-	ASSERT(!(flags & SYNC_BDFLUSH));
-
-	if (bypassed)
-		*bypassed = 0;
-	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return 0;
-	error = 0;
-	last_error = 0;
-	preempt = 0;
-
-	/* Allocate a reference marker */
-	ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
+	xfs_perag_t	*pag = &mp->m_perag[ag];
+	boolean_t	vnode_refed = B_FALSE;
+	int		nr_found;
+	int		first_index = 0;
+	int		error = 0;
+	int		last_error = 0;
+	int		fflag = XFS_B_ASYNC;
+	int		lock_flags = XFS_ILOCK_SHARED;
 
-	fflag = XFS_B_ASYNC;		/* default is don't wait */
 	if (flags & SYNC_DELWRI)
 		fflag = XFS_B_DELWRI;
 	if (flags & SYNC_WAIT)
 		fflag = 0;		/* synchronous overrides all */
 
-	base_lock_flags = XFS_ILOCK_SHARED;
 	if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
 		/*
 		 * We need the I/O lock if we're going to call any of
 		 * the flush/inval routines.
 		 */
-		base_lock_flags |= XFS_IOLOCK_SHARED;
+		lock_flags |= XFS_IOLOCK_SHARED;
 	}
 
-	XFS_MOUNT_ILOCK(mp);
-
-	ip = mp->m_inodes;
-
-	mount_locked = B_TRUE;
-	vnode_refed = B_FALSE;
-
-	IPOINTER_CLR;
-
 	do {
-		ASSERT(ipointer_in == B_FALSE);
-		ASSERT(vnode_refed == B_FALSE);
-
-		lock_flags = base_lock_flags;
-
 		/*
-		 * There were no inodes in the list, just break out
-		 * of the loop.
+		 * use a gang lookup to find the next inode in the tree
+		 * as the tree is sparse and a gang lookup walks to find
+		 * the number of objects requested.
 		 */
-		if (ip == NULL) {
-			break;
-		}
+		read_lock(&pag->pag_ici_lock);
+		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void**)&ip, first_index, 1);
 
-		/*
-		 * We found another sync thread marker - skip it
-		 */
-		if (ip->i_mount == NULL) {
-			ip = ip->i_mnext;
-			continue;
+		if (!nr_found) {
+			read_unlock(&pag->pag_ici_lock);
+			break;
 		}
 
-		vp = VFS_I(ip);
+		/* update the index for the next lookup */
+		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 
 		/*
-		 * If the vnode is gone then this is being torn down,
-		 * call reclaim if it is flushed, else let regular flush
-		 * code deal with it later in the loop.
+		 * skip inodes in reclaim. Let xfs_syncsub do that for
+		 * us so we don't need to worry.
 		 */
-
-		if (vp == NULL) {
-			/* Skip ones already in reclaim */
-			if (ip->i_flags & XFS_IRECLAIM) {
-				ip = ip->i_mnext;
-				continue;
-			}
-			if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
-				ip = ip->i_mnext;
-			} else if ((xfs_ipincount(ip) == 0) &&
-				    xfs_iflock_nowait(ip)) {
-				IPOINTER_INSERT(ip, mp);
-
-				xfs_finish_reclaim(ip, 1,
-						XFS_IFLUSH_DELWRI_ELSE_ASYNC);
-
-				XFS_MOUNT_ILOCK(mp);
-				mount_locked = B_TRUE;
-				IPOINTER_REMOVE(ip, mp);
-			} else {
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				ip = ip->i_mnext;
-			}
+		vp = VFS_I(ip);
+		if (!vp) {
+			read_unlock(&pag->pag_ici_lock);
 			continue;
 		}
 
+		/* bad inodes are dealt with elsewhere */
 		if (VN_BAD(vp)) {
-			ip = ip->i_mnext;
+			read_unlock(&pag->pag_ici_lock);
 			continue;
 		}
 
+		/* nothing to sync during shutdown */
 		if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
-			XFS_MOUNT_IUNLOCK(mp);
-			kmem_free(ipointer);
+			read_unlock(&pag->pag_ici_lock);
 			return 0;
 		}
 
 		/*
-		 * Try to lock without sleeping. We're out of order with
-		 * the inode list lock here, so if we fail we need to drop
-		 * the mount lock and try again. If we're called from
-		 * bdflush() here, then don't bother.
-		 *
-		 * The inode lock here actually coordinates with the
-		 * almost spurious inode lock in xfs_ireclaim() to prevent
-		 * the vnode we handle here without a reference from
-		 * being freed while we reference it. If we lock the inode
-		 * while it's on the mount list here, then the spurious inode
-		 * lock in xfs_ireclaim() after the inode is pulled from
-		 * the mount list will sleep until we release it here.
-		 * This keeps the vnode from being freed while we reference
-		 * it.
+		 * The inode lock here actually coordinates with the almost
+		 * spurious inode lock in xfs_ireclaim() to prevent the vnode
+		 * we handle here without a reference from being freed while we
+		 * reference it. If we lock the inode while it's on the mount
+		 * list here, then the spurious inode lock in xfs_ireclaim()
+		 * after the inode is pulled from the mount list will sleep
+		 * until we release it here. This keeps the vnode from being
+		 * freed while we reference it.
 		 */
 		if (xfs_ilock_nowait(ip, lock_flags) == 0) {
-			if (vp == NULL) {
-				ip = ip->i_mnext;
-				continue;
-			}
-
 			vp = vn_grab(vp);
-			if (vp == NULL) {
-				ip = ip->i_mnext;
+			read_unlock(&pag->pag_ici_lock);
+			if (!vp)
 				continue;
-			}
-
-			IPOINTER_INSERT(ip, mp);
 			xfs_ilock(ip, lock_flags);
 
 			ASSERT(vp == VFS_I(ip));
 			ASSERT(ip->i_mount == mp);
 
 			vnode_refed = B_TRUE;
+		} else {
+			/* safe to unlock here as we have a reference */
+			read_unlock(&pag->pag_ici_lock);
 		}
-
-		/* From here on in the loop we may have a marker record
-		 * in the inode list.
-		 */
-
 		/*
 		 * If we have to flush data or wait for I/O completion
 		 * we need to drop the ilock that we currently hold.
 		 * If we need to drop the lock, insert a marker if we
 		 * have not already done so.
 		 */
-		if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
-		    ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
-			if (mount_locked) {
-				IPOINTER_INSERT(ip, mp);
-			}
+		if (flags & SYNC_CLOSE) {
 			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-			if (flags & SYNC_CLOSE) {
-				/* Shutdown case. Flush and invalidate. */
-				if (XFS_FORCED_SHUTDOWN(mp))
-					xfs_tosspages(ip, 0, -1,
-							FI_REMAPF);
-				else
-					error = xfs_flushinval_pages(ip,
-							0, -1, FI_REMAPF);
-			} else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
-				error = xfs_flush_pages(ip, 0,
-							-1, fflag, FI_NONE);
-			}
-
-			/*
-			 * When freezing, we need to wait ensure all I/O (including direct
-			 * I/O) is complete to ensure no further data modification can take
-			 * place after this point
-			 */
+			if (XFS_FORCED_SHUTDOWN(mp))
+				xfs_tosspages(ip, 0, -1, FI_REMAPF);
+			else
+				error = xfs_flushinval_pages(ip, 0, -1,
+						FI_REMAPF);
+			/* wait for I/O on freeze */
 			if (flags & SYNC_IOWAIT)
 				vn_iowait(ip);
 
 			xfs_ilock(ip, XFS_ILOCK_SHARED);
 		}
 
-		if ((flags & SYNC_ATTR) &&
-		    (ip->i_update_core ||
-		     (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) {
-			if (mount_locked)
-				IPOINTER_INSERT(ip, mp);
+		if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
+			if (flags & SYNC_IOWAIT)
+				vn_iowait(ip);
+			xfs_ilock(ip, XFS_ILOCK_SHARED);
+		}
 
+		if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
 			if (flags & SYNC_WAIT) {
 				xfs_iflock(ip);
-				error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
-
-			/*
-			 * If we can't acquire the flush lock, then the inode
-			 * is already being flushed so don't bother waiting.
-			 *
-			 * If we can lock it then do a delwri flush so we can
-			 * combine multiple inode flushes in each disk write.
-			 */
+				if (!xfs_inode_clean(ip))
+					error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+				else
+					xfs_ifunlock(ip);
 			} else if (xfs_iflock_nowait(ip)) {
-				error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
+				if (!xfs_inode_clean(ip))
+					error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
+				else
+					xfs_ifunlock(ip);
 			} else if (bypassed) {
 				(*bypassed)++;
 			}
 		}
 
-		if (lock_flags != 0) {
+		if (lock_flags)
 			xfs_iunlock(ip, lock_flags);
-		}
 
 		if (vnode_refed) {
-			/*
-			 * If we had to take a reference on the vnode
-			 * above, then wait until after we've unlocked
-			 * the inode to release the reference. This is
-			 * because we can be already holding the inode
-			 * lock when IRELE() calls xfs_inactive().
-			 *
-			 * Make sure to drop the mount lock before calling
-			 * IRELE() so that we don't trip over ourselves if
-			 * we have to go for the mount lock again in the
-			 * inactive code.
-			 */
-			if (mount_locked) {
-				IPOINTER_INSERT(ip, mp);
-			}
-
 			IRELE(ip);
-
 			vnode_refed = B_FALSE;
 		}
 
-		if (error) {
+		if (error)
 			last_error = error;
-		}
-
 		/*
 		 * bail out if the filesystem is corrupted.
 		 */
-		if (error == EFSCORRUPTED) {
-			if (!mount_locked) {
-				XFS_MOUNT_ILOCK(mp);
-				IPOINTER_REMOVE(ip, mp);
-			}
-			XFS_MOUNT_IUNLOCK(mp);
-			ASSERT(ipointer_in == B_FALSE);
-			kmem_free(ipointer);
+		if (error == EFSCORRUPTED)
 			return XFS_ERROR(error);
-		}
-
-		/* Let other threads have a chance at the mount lock
-		 * if we have looped many times without dropping the
-		 * lock.
-		 */
-		if ((++preempt & XFS_PREEMPT_MASK) == 0) {
-			if (mount_locked) {
-				IPOINTER_INSERT(ip, mp);
-			}
-		}
-
-		if (mount_locked == B_FALSE) {
-			XFS_MOUNT_ILOCK(mp);
-			mount_locked = B_TRUE;
-			IPOINTER_REMOVE(ip, mp);
-			continue;
-		}
 
-		ASSERT(ipointer_in == B_FALSE);
-		ip = ip->i_mnext;
+	} while (nr_found);
 
-	} while (ip != mp->m_inodes);
+	return last_error;
+}
 
-	XFS_MOUNT_IUNLOCK(mp);
+int
+xfs_sync_inodes(
+	xfs_mount_t	*mp,
+	int		flags,
+	int		*bypassed)
+{
+	int		error;
+	int		last_error;
+	int		i;
 
-	ASSERT(ipointer_in == B_FALSE);
+	if (bypassed)
+		*bypassed = 0;
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return 0;
+	error = 0;
+	last_error = 0;
 
-	kmem_free(ipointer);
+	for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+		if (!mp->m_perag[i].pag_ici_init)
+			continue;
+		error = xfs_sync_inodes_ag(mp, i, flags, bypassed);
+		if (error)
+			last_error = error;
+		if (error == EFSCORRUPTED)
+			break;
+	}
	return XFS_ERROR(last_error);
 }
 