aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_inode.c
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@lst.de>2011-07-08 08:34:34 -0400
committerChristoph Hellwig <hch@lst.de>2011-07-08 08:34:34 -0400
commit8f04c47aa9712874af2c8816c2ca2a332cba80e4 (patch)
tree56f76e7d1443759ed68c6720e7f242950e220f8c /fs/xfs/xfs_inode.c
parent857b9778d86ccba7d7b42c9d8aeecde794ec8a6b (diff)
xfs: split xfs_itruncate_finish
Split the guts of xfs_itruncate_finish that loop over the existing extents and calls xfs_bunmapi on them into a new helper, xfs_itruncate_externs. Make xfs_attr_inactive call it directly instead of xfs_itruncate_finish, which allows to simplify the latter a lot, by only letting it deal with the data fork. As a result xfs_itruncate_finish is renamed to xfs_itruncate_data to make its use case more obvious. Also remove the sync parameter from xfs_itruncate_data, which has been unessecary since the introduction of the busy extent list in 2002, and completely dead code since 2003 when the XFS_BMAPI_ASYNC parameter was made a no-op. I can't actually see why the xfs_attr_inactive needs to set the transaction sync, but let's keep this patch simple and without changes in behaviour. Also avoid passing a useless argument to xfs_isize_check, and make it private to xfs_inode.c. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Alex Elder <aelder@sgi.com> Reviewed-by: Dave Chinner <dchinner@redhat.com>
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r--fs/xfs/xfs_inode.c357
1 file changed, 128 insertions, 229 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 82a282ab63dc..aa143b870afb 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -52,7 +52,7 @@ kmem_zone_t *xfs_ifork_zone;
52kmem_zone_t *xfs_inode_zone; 52kmem_zone_t *xfs_inode_zone;
53 53
54/* 54/*
55 * Used in xfs_itruncate(). This is the maximum number of extents 55 * Used in xfs_itruncate_extents(). This is the maximum number of extents
56 * freed from a file in a single transaction. 56 * freed from a file in a single transaction.
57 */ 57 */
58#define XFS_ITRUNC_MAX_EXTENTS 2 58#define XFS_ITRUNC_MAX_EXTENTS 2
@@ -1179,15 +1179,15 @@ xfs_ialloc(
1179 * at least do it for regular files. 1179 * at least do it for regular files.
1180 */ 1180 */
1181#ifdef DEBUG 1181#ifdef DEBUG
1182void 1182STATIC void
1183xfs_isize_check( 1183xfs_isize_check(
1184 xfs_mount_t *mp, 1184 struct xfs_inode *ip,
1185 xfs_inode_t *ip, 1185 xfs_fsize_t isize)
1186 xfs_fsize_t isize)
1187{ 1186{
1188 xfs_fileoff_t map_first; 1187 struct xfs_mount *mp = ip->i_mount;
1189 int nimaps; 1188 xfs_fileoff_t map_first;
1190 xfs_bmbt_irec_t imaps[2]; 1189 int nimaps;
1190 xfs_bmbt_irec_t imaps[2];
1191 1191
1192 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) 1192 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
1193 return; 1193 return;
@@ -1214,11 +1214,14 @@ xfs_isize_check(
1214 ASSERT(nimaps == 1); 1214 ASSERT(nimaps == 1);
1215 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); 1215 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1216} 1216}
1217#else /* DEBUG */
1218#define xfs_isize_check(ip, isize)
1217#endif /* DEBUG */ 1219#endif /* DEBUG */
1218 1220
1219/* 1221/*
1220 * Free up the underlying blocks past new_size. The new size must be 1222 * Free up the underlying blocks past new_size. The new size must be smaller
1221 * smaller than the current size. 1223 * than the current size. This routine can be used both for the attribute and
1224 * data fork, and does not modify the inode size, which is left to the caller.
1222 * 1225 *
1223 * The transaction passed to this routine must have made a permanent log 1226 * The transaction passed to this routine must have made a permanent log
1224 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the 1227 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
@@ -1230,31 +1233,6 @@ xfs_isize_check(
1230 * will be "held" within the returned transaction. This routine does NOT 1233 * will be "held" within the returned transaction. This routine does NOT
1231 * require any disk space to be reserved for it within the transaction. 1234 * require any disk space to be reserved for it within the transaction.
1232 * 1235 *
1233 * The fork parameter must be either XFS_ATTR_FORK or XFS_DATA_FORK, and it
1234 * indicates the fork which is to be truncated. For the attribute fork we only
1235 * support truncation to size 0.
1236 *
1237 * We use the sync parameter to indicate whether or not the first transaction
1238 * we perform might have to be synchronous. For the attr fork, it needs to be
1239 * so if the unlink of the inode is not yet known to be permanent in the log.
1240 * This keeps us from freeing and reusing the blocks of the attribute fork
1241 * before the unlink of the inode becomes permanent.
1242 *
1243 * For the data fork, we normally have to run synchronously if we're being
1244 * called out of the inactive path or we're being called out of the create path
1245 * where we're truncating an existing file. Either way, the truncate needs to
1246 * be sync so blocks don't reappear in the file with altered data in case of a
1247 * crash. wsync filesystems can run the first case async because anything that
1248 * shrinks the inode has to run sync so by the time we're called here from
1249 * inactive, the inode size is permanently set to 0.
1250 *
1251 * Calls from the truncate path always need to be sync unless we're in a wsync
1252 * filesystem and the file has already been unlinked.
1253 *
1254 * The caller is responsible for correctly setting the sync parameter. It gets
1255 * too hard for us to guess here which path we're being called out of just
1256 * based on inode state.
1257 *
1258 * If we get an error, we must return with the inode locked and linked into the 1236 * If we get an error, we must return with the inode locked and linked into the
1259 * current transaction. This keeps things simple for the higher level code, 1237 * current transaction. This keeps things simple for the higher level code,
1260 * because it always knows that the inode is locked and held in the transaction 1238 * because it always knows that the inode is locked and held in the transaction
@@ -1262,124 +1240,31 @@ xfs_isize_check(
1262 * dirty on error so that transactions can be easily aborted if possible. 1240 * dirty on error so that transactions can be easily aborted if possible.
1263 */ 1241 */
1264int 1242int
1265xfs_itruncate_finish( 1243xfs_itruncate_extents(
1266 xfs_trans_t **tp, 1244 struct xfs_trans **tpp,
1267 xfs_inode_t *ip, 1245 struct xfs_inode *ip,
1268 xfs_fsize_t new_size, 1246 int whichfork,
1269 int fork, 1247 xfs_fsize_t new_size)
1270 int sync)
1271{ 1248{
1272 xfs_fsblock_t first_block; 1249 struct xfs_mount *mp = ip->i_mount;
1273 xfs_fileoff_t first_unmap_block; 1250 struct xfs_trans *tp = *tpp;
1274 xfs_fileoff_t last_block; 1251 struct xfs_trans *ntp;
1275 xfs_filblks_t unmap_len=0; 1252 xfs_bmap_free_t free_list;
1276 xfs_mount_t *mp; 1253 xfs_fsblock_t first_block;
1277 xfs_trans_t *ntp; 1254 xfs_fileoff_t first_unmap_block;
1278 int done; 1255 xfs_fileoff_t last_block;
1279 int committed; 1256 xfs_filblks_t unmap_len;
1280 xfs_bmap_free_t free_list; 1257 int committed;
1281 int error; 1258 int error = 0;
1259 int done = 0;
1282 1260
1283 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 1261 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1284 ASSERT((new_size == 0) || (new_size <= ip->i_size)); 1262 ASSERT(new_size <= ip->i_size);
1285 ASSERT(*tp != NULL); 1263 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1286 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 1264 ASSERT(ip->i_transp == tp);
1287 ASSERT(ip->i_transp == *tp);
1288 ASSERT(ip->i_itemp != NULL); 1265 ASSERT(ip->i_itemp != NULL);
1289 ASSERT(ip->i_itemp->ili_lock_flags == 0); 1266 ASSERT(ip->i_itemp->ili_lock_flags == 0);
1290 1267 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1291
1292 ntp = *tp;
1293 mp = (ntp)->t_mountp;
1294 ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
1295
1296 /*
1297 * We only support truncating the entire attribute fork.
1298 */
1299 if (fork == XFS_ATTR_FORK) {
1300 new_size = 0LL;
1301 }
1302 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1303 trace_xfs_itruncate_finish_start(ip, new_size);
1304
1305 /*
1306 * The first thing we do is set the size to new_size permanently
1307 * on disk. This way we don't have to worry about anyone ever
1308 * being able to look at the data being freed even in the face
1309 * of a crash. What we're getting around here is the case where
1310 * we free a block, it is allocated to another file, it is written
1311 * to, and then we crash. If the new data gets written to the
1312 * file but the log buffers containing the free and reallocation
1313 * don't, then we'd end up with garbage in the blocks being freed.
1314 * As long as we make the new_size permanent before actually
1315 * freeing any blocks it doesn't matter if they get written to.
1316 *
1317 * The callers must signal into us whether or not the size
1318 * setting here must be synchronous. There are a few cases
1319 * where it doesn't have to be synchronous. Those cases
1320 * occur if the file is unlinked and we know the unlink is
1321 * permanent or if the blocks being truncated are guaranteed
1322 * to be beyond the inode eof (regardless of the link count)
1323 * and the eof value is permanent. Both of these cases occur
1324 * only on wsync-mounted filesystems. In those cases, we're
1325 * guaranteed that no user will ever see the data in the blocks
1326 * that are being truncated so the truncate can run async.
1327 * In the free beyond eof case, the file may wind up with
1328 * more blocks allocated to it than it needs if we crash
1329 * and that won't get fixed until the next time the file
1330 * is re-opened and closed but that's ok as that shouldn't
1331 * be too many blocks.
1332 *
1333 * However, we can't just make all wsync xactions run async
1334 * because there's one call out of the create path that needs
1335 * to run sync where it's truncating an existing file to size
1336 * 0 whose size is > 0.
1337 *
1338 * It's probably possible to come up with a test in this
1339 * routine that would correctly distinguish all the above
1340 * cases from the values of the function parameters and the
1341 * inode state but for sanity's sake, I've decided to let the
1342 * layers above just tell us. It's simpler to correctly figure
1343 * out in the layer above exactly under what conditions we
1344 * can run async and I think it's easier for others read and
1345 * follow the logic in case something has to be changed.
1346 * cscope is your friend -- rcc.
1347 *
1348 * The attribute fork is much simpler.
1349 *
1350 * For the attribute fork we allow the caller to tell us whether
1351 * the unlink of the inode that led to this call is yet permanent
1352 * in the on disk log. If it is not and we will be freeing extents
1353 * in this inode then we make the first transaction synchronous
1354 * to make sure that the unlink is permanent by the time we free
1355 * the blocks.
1356 */
1357 if (fork == XFS_DATA_FORK) {
1358 if (ip->i_d.di_nextents > 0) {
1359 /*
1360 * If we are not changing the file size then do
1361 * not update the on-disk file size - we may be
1362 * called from xfs_inactive_free_eofblocks(). If we
1363 * update the on-disk file size and then the system
1364 * crashes before the contents of the file are
1365 * flushed to disk then the files may be full of
1366 * holes (ie NULL files bug).
1367 */
1368 if (ip->i_size != new_size) {
1369 ip->i_d.di_size = new_size;
1370 ip->i_size = new_size;
1371 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1372 }
1373 }
1374 } else if (sync) {
1375 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
1376 if (ip->i_d.di_anextents > 0)
1377 xfs_trans_set_sync(ntp);
1378 }
1379 ASSERT(fork == XFS_DATA_FORK ||
1380 (fork == XFS_ATTR_FORK &&
1381 ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
1382 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
1383 1268
1384 /* 1269 /*
1385 * Since it is possible for space to become allocated beyond 1270 * Since it is possible for space to become allocated beyond
@@ -1390,128 +1275,142 @@ xfs_itruncate_finish(
1390 * beyond the maximum file size (ie it is the same as last_block), 1275 * beyond the maximum file size (ie it is the same as last_block),
1391 * then there is nothing to do. 1276 * then there is nothing to do.
1392 */ 1277 */
1278 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1393 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); 1279 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1394 ASSERT(first_unmap_block <= last_block); 1280 if (first_unmap_block == last_block)
1395 done = 0; 1281 return 0;
1396 if (last_block == first_unmap_block) { 1282
1397 done = 1; 1283 ASSERT(first_unmap_block < last_block);
1398 } else { 1284 unmap_len = last_block - first_unmap_block + 1;
1399 unmap_len = last_block - first_unmap_block + 1;
1400 }
1401 while (!done) { 1285 while (!done) {
1402 /*
1403 * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi()
1404 * will tell us whether it freed the entire range or
1405 * not. If this is a synchronous mount (wsync),
1406 * then we can tell bunmapi to keep all the
1407 * transactions asynchronous since the unlink
1408 * transaction that made this inode inactive has
1409 * already hit the disk. There's no danger of
1410 * the freed blocks being reused, there being a
1411 * crash, and the reused blocks suddenly reappearing
1412 * in this file with garbage in them once recovery
1413 * runs.
1414 */
1415 xfs_bmap_init(&free_list, &first_block); 1286 xfs_bmap_init(&free_list, &first_block);
1416 error = xfs_bunmapi(ntp, ip, 1287 error = xfs_bunmapi(tp, ip,
1417 first_unmap_block, unmap_len, 1288 first_unmap_block, unmap_len,
1418 xfs_bmapi_aflag(fork), 1289 xfs_bmapi_aflag(whichfork),
1419 XFS_ITRUNC_MAX_EXTENTS, 1290 XFS_ITRUNC_MAX_EXTENTS,
1420 &first_block, &free_list, 1291 &first_block, &free_list,
1421 &done); 1292 &done);
1422 if (error) { 1293 if (error)
1423 /* 1294 goto out_bmap_cancel;
1424 * If the bunmapi call encounters an error,
1425 * return to the caller where the transaction
1426 * can be properly aborted. We just need to
1427 * make sure we're not holding any resources
1428 * that we were not when we came in.
1429 */
1430 xfs_bmap_cancel(&free_list);
1431 return error;
1432 }
1433 1295
1434 /* 1296 /*
1435 * Duplicate the transaction that has the permanent 1297 * Duplicate the transaction that has the permanent
1436 * reservation and commit the old transaction. 1298 * reservation and commit the old transaction.
1437 */ 1299 */
1438 error = xfs_bmap_finish(tp, &free_list, &committed); 1300 error = xfs_bmap_finish(&tp, &free_list, &committed);
1439 ntp = *tp;
1440 if (committed) 1301 if (committed)
1441 xfs_trans_ijoin(ntp, ip); 1302 xfs_trans_ijoin(tp, ip);
1442 1303 if (error)
1443 if (error) { 1304 goto out_bmap_cancel;
1444 /*
1445 * If the bmap finish call encounters an error, return
1446 * to the caller where the transaction can be properly
1447 * aborted. We just need to make sure we're not
1448 * holding any resources that we were not when we came
1449 * in.
1450 *
1451 * Aborting from this point might lose some blocks in
1452 * the file system, but oh well.
1453 */
1454 xfs_bmap_cancel(&free_list);
1455 return error;
1456 }
1457 1305
1458 if (committed) { 1306 if (committed) {
1459 /* 1307 /*
1460 * Mark the inode dirty so it will be logged and 1308 * Mark the inode dirty so it will be logged and
1461 * moved forward in the log as part of every commit. 1309 * moved forward in the log as part of every commit.
1462 */ 1310 */
1463 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1311 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1464 } 1312 }
1465 1313
1466 ntp = xfs_trans_dup(ntp); 1314 ntp = xfs_trans_dup(tp);
1467 error = xfs_trans_commit(*tp, 0); 1315 error = xfs_trans_commit(tp, 0);
1468 *tp = ntp; 1316 tp = ntp;
1469 1317
1470 xfs_trans_ijoin(ntp, ip); 1318 xfs_trans_ijoin(tp, ip);
1471 1319
1472 if (error) 1320 if (error)
1473 return error; 1321 goto out;
1322
1474 /* 1323 /*
1475 * transaction commit worked ok so we can drop the extra ticket 1324 * Transaction commit worked ok so we can drop the extra ticket
1476 * reference that we gained in xfs_trans_dup() 1325 * reference that we gained in xfs_trans_dup()
1477 */ 1326 */
1478 xfs_log_ticket_put(ntp->t_ticket); 1327 xfs_log_ticket_put(tp->t_ticket);
1479 error = xfs_trans_reserve(ntp, 0, 1328 error = xfs_trans_reserve(tp, 0,
1480 XFS_ITRUNCATE_LOG_RES(mp), 0, 1329 XFS_ITRUNCATE_LOG_RES(mp), 0,
1481 XFS_TRANS_PERM_LOG_RES, 1330 XFS_TRANS_PERM_LOG_RES,
1482 XFS_ITRUNCATE_LOG_COUNT); 1331 XFS_ITRUNCATE_LOG_COUNT);
1483 if (error) 1332 if (error)
1484 return error; 1333 goto out;
1485 } 1334 }
1335
1336out:
1337 *tpp = tp;
1338 return error;
1339out_bmap_cancel:
1486 /* 1340 /*
1487 * Only update the size in the case of the data fork, but 1341 * If the bunmapi call encounters an error, return to the caller where
1488 * always re-log the inode so that our permanent transaction 1342 * the transaction can be properly aborted. We just need to make sure
1489 * can keep on rolling it forward in the log. 1343 * we're not holding any resources that we were not when we came in.
1490 */ 1344 */
1491 if (fork == XFS_DATA_FORK) { 1345 xfs_bmap_cancel(&free_list);
1492 xfs_isize_check(mp, ip, new_size); 1346 goto out;
1347}
1348
1349int
1350xfs_itruncate_data(
1351 struct xfs_trans **tpp,
1352 struct xfs_inode *ip,
1353 xfs_fsize_t new_size)
1354{
1355 int error;
1356
1357 trace_xfs_itruncate_data_start(ip, new_size);
1358
1359 /*
1360 * The first thing we do is set the size to new_size permanently on
1361 * disk. This way we don't have to worry about anyone ever being able
1362 * to look at the data being freed even in the face of a crash.
1363 * What we're getting around here is the case where we free a block, it
1364 * is allocated to another file, it is written to, and then we crash.
1365 * If the new data gets written to the file but the log buffers
1366 * containing the free and reallocation don't, then we'd end up with
1367 * garbage in the blocks being freed. As long as we make the new_size
1368 * permanent before actually freeing any blocks it doesn't matter if
1369 * they get written to.
1370 */
1371 if (ip->i_d.di_nextents > 0) {
1493 /* 1372 /*
1494 * If we are not changing the file size then do 1373 * If we are not changing the file size then do not update
1495 * not update the on-disk file size - we may be 1374 * the on-disk file size - we may be called from
1496 * called from xfs_inactive_free_eofblocks(). If we 1375 * xfs_inactive_free_eofblocks(). If we update the on-disk
1497 * update the on-disk file size and then the system 1376 * file size and then the system crashes before the contents
1498 * crashes before the contents of the file are 1377 * of the file are flushed to disk then the files may be
1499 * flushed to disk then the files may be full of 1378 * full of holes (ie NULL files bug).
1500 * holes (ie NULL files bug).
1501 */ 1379 */
1502 if (ip->i_size != new_size) { 1380 if (ip->i_size != new_size) {
1503 ip->i_d.di_size = new_size; 1381 ip->i_d.di_size = new_size;
1504 ip->i_size = new_size; 1382 ip->i_size = new_size;
1383 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1505 } 1384 }
1506 } 1385 }
1507 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1386
1508 ASSERT((new_size != 0) || 1387 error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
1509 (fork == XFS_ATTR_FORK) || 1388 if (error)
1510 (ip->i_delayed_blks == 0)); 1389 return error;
1511 ASSERT((new_size != 0) || 1390
1512 (fork == XFS_ATTR_FORK) || 1391 /*
1513 (ip->i_d.di_nextents == 0)); 1392 * If we are not changing the file size then do not update the on-disk
1514 trace_xfs_itruncate_finish_end(ip, new_size); 1393 * file size - we may be called from xfs_inactive_free_eofblocks().
1394 * If we update the on-disk file size and then the system crashes
1395 * before the contents of the file are flushed to disk then the files
1396 * may be full of holes (ie NULL files bug).
1397 */
1398 xfs_isize_check(ip, new_size);
1399 if (ip->i_size != new_size) {
1400 ip->i_d.di_size = new_size;
1401 ip->i_size = new_size;
1402 }
1403
1404 ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
1405 ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
1406
1407 /*
1408 * Always re-log the inode so that our permanent transaction can keep
1409 * on rolling it forward in the log.
1410 */
1411 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1412
1413 trace_xfs_itruncate_data_end(ip, new_size);
1515 return 0; 1414 return 0;
1516} 1415}
1517 1416