about | summary | refs | log | tree | commit | diff | stats
path: root/fs/xfs/xfs_inode.c
diff options
context:
space:
mode:
author Joel Becker <jlbec@evilplan.org> 2011-08-22 00:02:57 -0400
committer Joel Becker <jlbec@evilplan.org> 2011-08-22 00:02:57 -0400
commit 99b1bb61b225c3eb4d3b196d4f1d041695b19a7e (patch)
tree 06cabdc34538f3b38a39e3b802ecc1a2ab2aae00 /fs/xfs/xfs_inode.c
parent c7e25e6e0b0486492c5faaf6312b37413642c48e (diff)
parent 93862d5e1ab875664c6cc95254fc365028a48bb1 (diff)
Merge branch 'mw-3.1-jul25' of git://oss.oracle.com/git/smushran/linux-2.6 into ocfs2-fixes
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r-- fs/xfs/xfs_inode.c 537
1 files changed, 137 insertions, 400 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a098a20ca63e..3cc21ddf9f7e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -37,7 +37,6 @@
37#include "xfs_buf_item.h" 37#include "xfs_buf_item.h"
38#include "xfs_inode_item.h" 38#include "xfs_inode_item.h"
39#include "xfs_btree.h" 39#include "xfs_btree.h"
40#include "xfs_btree_trace.h"
41#include "xfs_alloc.h" 40#include "xfs_alloc.h"
42#include "xfs_ialloc.h" 41#include "xfs_ialloc.h"
43#include "xfs_bmap.h" 42#include "xfs_bmap.h"
@@ -52,7 +51,7 @@ kmem_zone_t *xfs_ifork_zone;
52kmem_zone_t *xfs_inode_zone; 51kmem_zone_t *xfs_inode_zone;
53 52
54/* 53/*
55 * Used in xfs_itruncate(). This is the maximum number of extents 54 * Used in xfs_itruncate_extents(). This is the maximum number of extents
56 * freed from a file in a single transaction. 55 * freed from a file in a single transaction.
57 */ 56 */
58#define XFS_ITRUNC_MAX_EXTENTS 2 57#define XFS_ITRUNC_MAX_EXTENTS 2
@@ -167,7 +166,7 @@ xfs_imap_to_bp(
167 166
168 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 167 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
169 (i << mp->m_sb.sb_inodelog)); 168 (i << mp->m_sb.sb_inodelog));
170 di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC && 169 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
171 XFS_DINODE_GOOD_VERSION(dip->di_version); 170 XFS_DINODE_GOOD_VERSION(dip->di_version);
172 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 171 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
173 XFS_ERRTAG_ITOBP_INOTOBP, 172 XFS_ERRTAG_ITOBP_INOTOBP,
@@ -802,7 +801,7 @@ xfs_iread(
802 * If we got something that isn't an inode it means someone 801 * If we got something that isn't an inode it means someone
803 * (nfs or dmi) has a stale handle. 802 * (nfs or dmi) has a stale handle.
804 */ 803 */
805 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { 804 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
806#ifdef DEBUG 805#ifdef DEBUG
807 xfs_alert(mp, 806 xfs_alert(mp,
808 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", 807 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
@@ -1179,15 +1178,15 @@ xfs_ialloc(
1179 * at least do it for regular files. 1178 * at least do it for regular files.
1180 */ 1179 */
1181#ifdef DEBUG 1180#ifdef DEBUG
1182void 1181STATIC void
1183xfs_isize_check( 1182xfs_isize_check(
1184 xfs_mount_t *mp, 1183 struct xfs_inode *ip,
1185 xfs_inode_t *ip, 1184 xfs_fsize_t isize)
1186 xfs_fsize_t isize)
1187{ 1185{
1188 xfs_fileoff_t map_first; 1186 struct xfs_mount *mp = ip->i_mount;
1189 int nimaps; 1187 xfs_fileoff_t map_first;
1190 xfs_bmbt_irec_t imaps[2]; 1188 int nimaps;
1189 xfs_bmbt_irec_t imaps[2];
1191 1190
1192 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) 1191 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
1193 return; 1192 return;
@@ -1214,168 +1213,14 @@ xfs_isize_check(
1214 ASSERT(nimaps == 1); 1213 ASSERT(nimaps == 1);
1215 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); 1214 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1216} 1215}
1216#else /* DEBUG */
1217#define xfs_isize_check(ip, isize)
1217#endif /* DEBUG */ 1218#endif /* DEBUG */
1218 1219
1219/* 1220/*
1220 * Calculate the last possible buffered byte in a file. This must 1221 * Free up the underlying blocks past new_size. The new size must be smaller
1221 * include data that was buffered beyond the EOF by the write code. 1222 * than the current size. This routine can be used both for the attribute and
1222 * This also needs to deal with overflowing the xfs_fsize_t type 1223 * data fork, and does not modify the inode size, which is left to the caller.
1223 * which can happen for sizes near the limit.
1224 *
1225 * We also need to take into account any blocks beyond the EOF. It
1226 * may be the case that they were buffered by a write which failed.
1227 * In that case the pages will still be in memory, but the inode size
1228 * will never have been updated.
1229 */
1230STATIC xfs_fsize_t
1231xfs_file_last_byte(
1232 xfs_inode_t *ip)
1233{
1234 xfs_mount_t *mp;
1235 xfs_fsize_t last_byte;
1236 xfs_fileoff_t last_block;
1237 xfs_fileoff_t size_last_block;
1238 int error;
1239
1240 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
1241
1242 mp = ip->i_mount;
1243 /*
1244 * Only check for blocks beyond the EOF if the extents have
1245 * been read in. This eliminates the need for the inode lock,
1246 * and it also saves us from looking when it really isn't
1247 * necessary.
1248 */
1249 if (ip->i_df.if_flags & XFS_IFEXTENTS) {
1250 xfs_ilock(ip, XFS_ILOCK_SHARED);
1251 error = xfs_bmap_last_offset(NULL, ip, &last_block,
1252 XFS_DATA_FORK);
1253 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1254 if (error) {
1255 last_block = 0;
1256 }
1257 } else {
1258 last_block = 0;
1259 }
1260 size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size);
1261 last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
1262
1263 last_byte = XFS_FSB_TO_B(mp, last_block);
1264 if (last_byte < 0) {
1265 return XFS_MAXIOFFSET(mp);
1266 }
1267 last_byte += (1 << mp->m_writeio_log);
1268 if (last_byte < 0) {
1269 return XFS_MAXIOFFSET(mp);
1270 }
1271 return last_byte;
1272}
1273
1274/*
1275 * Start the truncation of the file to new_size. The new size
1276 * must be smaller than the current size. This routine will
1277 * clear the buffer and page caches of file data in the removed
1278 * range, and xfs_itruncate_finish() will remove the underlying
1279 * disk blocks.
1280 *
1281 * The inode must have its I/O lock locked EXCLUSIVELY, and it
1282 * must NOT have the inode lock held at all. This is because we're
1283 * calling into the buffer/page cache code and we can't hold the
1284 * inode lock when we do so.
1285 *
1286 * We need to wait for any direct I/Os in flight to complete before we
1287 * proceed with the truncate. This is needed to prevent the extents
1288 * being read or written by the direct I/Os from being removed while the
1289 * I/O is in flight as there is no other method of synchronising
1290 * direct I/O with the truncate operation. Also, because we hold
1291 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
1292 * started until the truncate completes and drops the lock. Essentially,
1293 * the xfs_ioend_wait() call forms an I/O barrier that provides strict
1294 * ordering between direct I/Os and the truncate operation.
1295 *
1296 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1297 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
1298 * in the case that the caller is locking things out of order and
1299 * may not be able to call xfs_itruncate_finish() with the inode lock
1300 * held without dropping the I/O lock. If the caller must drop the
1301 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
1302 * must be called again with all the same restrictions as the initial
1303 * call.
1304 */
1305int
1306xfs_itruncate_start(
1307 xfs_inode_t *ip,
1308 uint flags,
1309 xfs_fsize_t new_size)
1310{
1311 xfs_fsize_t last_byte;
1312 xfs_off_t toss_start;
1313 xfs_mount_t *mp;
1314 int error = 0;
1315
1316 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1317 ASSERT((new_size == 0) || (new_size <= ip->i_size));
1318 ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
1319 (flags == XFS_ITRUNC_MAYBE));
1320
1321 mp = ip->i_mount;
1322
1323 /* wait for the completion of any pending DIOs */
1324 if (new_size == 0 || new_size < ip->i_size)
1325 xfs_ioend_wait(ip);
1326
1327 /*
1328 * Call toss_pages or flushinval_pages to get rid of pages
1329 * overlapping the region being removed. We have to use
1330 * the less efficient flushinval_pages in the case that the
1331 * caller may not be able to finish the truncate without
1332 * dropping the inode's I/O lock. Make sure
1333 * to catch any pages brought in by buffers overlapping
1334 * the EOF by searching out beyond the isize by our
1335 * block size. We round new_size up to a block boundary
1336 * so that we don't toss things on the same block as
1337 * new_size but before it.
1338 *
1339 * Before calling toss_page or flushinval_pages, make sure to
1340 * call remapf() over the same region if the file is mapped.
1341 * This frees up mapped file references to the pages in the
1342 * given range and for the flushinval_pages case it ensures
1343 * that we get the latest mapped changes flushed out.
1344 */
1345 toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1346 toss_start = XFS_FSB_TO_B(mp, toss_start);
1347 if (toss_start < 0) {
1348 /*
1349 * The place to start tossing is beyond our maximum
1350 * file size, so there is no way that the data extended
1351 * out there.
1352 */
1353 return 0;
1354 }
1355 last_byte = xfs_file_last_byte(ip);
1356 trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte);
1357 if (last_byte > toss_start) {
1358 if (flags & XFS_ITRUNC_DEFINITE) {
1359 xfs_tosspages(ip, toss_start,
1360 -1, FI_REMAPF_LOCKED);
1361 } else {
1362 error = xfs_flushinval_pages(ip, toss_start,
1363 -1, FI_REMAPF_LOCKED);
1364 }
1365 }
1366
1367#ifdef DEBUG
1368 if (new_size == 0) {
1369 ASSERT(VN_CACHED(VFS_I(ip)) == 0);
1370 }
1371#endif
1372 return error;
1373}
1374
1375/*
1376 * Shrink the file to the given new_size. The new size must be smaller than
1377 * the current size. This will free up the underlying blocks in the removed
1378 * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
1379 * 1224 *
1380 * The transaction passed to this routine must have made a permanent log 1225 * The transaction passed to this routine must have made a permanent log
1381 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the 1226 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
@@ -1387,31 +1232,6 @@ xfs_itruncate_start(
1387 * will be "held" within the returned transaction. This routine does NOT 1232 * will be "held" within the returned transaction. This routine does NOT
1388 * require any disk space to be reserved for it within the transaction. 1233 * require any disk space to be reserved for it within the transaction.
1389 * 1234 *
1390 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
1391 * indicates the fork which is to be truncated. For the attribute fork we only
1392 * support truncation to size 0.
1393 *
1394 * We use the sync parameter to indicate whether or not the first transaction
1395 * we perform might have to be synchronous. For the attr fork, it needs to be
1396 * so if the unlink of the inode is not yet known to be permanent in the log.
1397 * This keeps us from freeing and reusing the blocks of the attribute fork
1398 * before the unlink of the inode becomes permanent.
1399 *
1400 * For the data fork, we normally have to run synchronously if we're being
1401 * called out of the inactive path or we're being called out of the create path
1402 * where we're truncating an existing file. Either way, the truncate needs to
1403 * be sync so blocks don't reappear in the file with altered data in case of a
1404 * crash. wsync filesystems can run the first case async because anything that
1405 * shrinks the inode has to run sync so by the time we're called here from
1406 * inactive, the inode size is permanently set to 0.
1407 *
1408 * Calls from the truncate path always need to be sync unless we're in a wsync
1409 * filesystem and the file has already been unlinked.
1410 *
1411 * The caller is responsible for correctly setting the sync parameter. It gets
1412 * too hard for us to guess here which path we're being called out of just
1413 * based on inode state.
1414 *
1415 * If we get an error, we must return with the inode locked and linked into the 1235 * If we get an error, we must return with the inode locked and linked into the
1416 * current transaction. This keeps things simple for the higher level code, 1236 * current transaction. This keeps things simple for the higher level code,
1417 * because it always knows that the inode is locked and held in the transaction 1237 * because it always knows that the inode is locked and held in the transaction
@@ -1419,124 +1239,30 @@ xfs_itruncate_start(
1419 * dirty on error so that transactions can be easily aborted if possible. 1239 * dirty on error so that transactions can be easily aborted if possible.
1420 */ 1240 */
1421int 1241int
1422xfs_itruncate_finish( 1242xfs_itruncate_extents(
1423 xfs_trans_t **tp, 1243 struct xfs_trans **tpp,
1424 xfs_inode_t *ip, 1244 struct xfs_inode *ip,
1425 xfs_fsize_t new_size, 1245 int whichfork,
1426 int fork, 1246 xfs_fsize_t new_size)
1427 int sync)
1428{ 1247{
1429 xfs_fsblock_t first_block; 1248 struct xfs_mount *mp = ip->i_mount;
1430 xfs_fileoff_t first_unmap_block; 1249 struct xfs_trans *tp = *tpp;
1431 xfs_fileoff_t last_block; 1250 struct xfs_trans *ntp;
1432 xfs_filblks_t unmap_len=0; 1251 xfs_bmap_free_t free_list;
1433 xfs_mount_t *mp; 1252 xfs_fsblock_t first_block;
1434 xfs_trans_t *ntp; 1253 xfs_fileoff_t first_unmap_block;
1435 int done; 1254 xfs_fileoff_t last_block;
1436 int committed; 1255 xfs_filblks_t unmap_len;
1437 xfs_bmap_free_t free_list; 1256 int committed;
1438 int error; 1257 int error = 0;
1258 int done = 0;
1439 1259
1440 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 1260 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1441 ASSERT((new_size == 0) || (new_size <= ip->i_size)); 1261 ASSERT(new_size <= ip->i_size);
1442 ASSERT(*tp != NULL); 1262 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1443 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1444 ASSERT(ip->i_transp == *tp);
1445 ASSERT(ip->i_itemp != NULL); 1263 ASSERT(ip->i_itemp != NULL);
1446 ASSERT(ip->i_itemp->ili_lock_flags == 0); 1264 ASSERT(ip->i_itemp->ili_lock_flags == 0);
1447 1265 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1448
1449 ntp = *tp;
1450 mp = (ntp)->t_mountp;
1451 ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
1452
1453 /*
1454 * We only support truncating the entire attribute fork.
1455 */
1456 if (fork == XFS_ATTR_FORK) {
1457 new_size = 0LL;
1458 }
1459 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1460 trace_xfs_itruncate_finish_start(ip, new_size);
1461
1462 /*
1463 * The first thing we do is set the size to new_size permanently
1464 * on disk. This way we don't have to worry about anyone ever
1465 * being able to look at the data being freed even in the face
1466 * of a crash. What we're getting around here is the case where
1467 * we free a block, it is allocated to another file, it is written
1468 * to, and then we crash. If the new data gets written to the
1469 * file but the log buffers containing the free and reallocation
1470 * don't, then we'd end up with garbage in the blocks being freed.
1471 * As long as we make the new_size permanent before actually
1472 * freeing any blocks it doesn't matter if they get written to.
1473 *
1474 * The callers must signal into us whether or not the size
1475 * setting here must be synchronous. There are a few cases
1476 * where it doesn't have to be synchronous. Those cases
1477 * occur if the file is unlinked and we know the unlink is
1478 * permanent or if the blocks being truncated are guaranteed
1479 * to be beyond the inode eof (regardless of the link count)
1480 * and the eof value is permanent. Both of these cases occur
1481 * only on wsync-mounted filesystems. In those cases, we're
1482 * guaranteed that no user will ever see the data in the blocks
1483 * that are being truncated so the truncate can run async.
1484 * In the free beyond eof case, the file may wind up with
1485 * more blocks allocated to it than it needs if we crash
1486 * and that won't get fixed until the next time the file
1487 * is re-opened and closed but that's ok as that shouldn't
1488 * be too many blocks.
1489 *
1490 * However, we can't just make all wsync xactions run async
1491 * because there's one call out of the create path that needs
1492 * to run sync where it's truncating an existing file to size
1493 * 0 whose size is > 0.
1494 *
1495 * It's probably possible to come up with a test in this
1496 * routine that would correctly distinguish all the above
1497 * cases from the values of the function parameters and the
1498 * inode state but for sanity's sake, I've decided to let the
1499 * layers above just tell us. It's simpler to correctly figure
1500 * out in the layer above exactly under what conditions we
1501 * can run async and I think it's easier for others read and
1502 * follow the logic in case something has to be changed.
1503 * cscope is your friend -- rcc.
1504 *
1505 * The attribute fork is much simpler.
1506 *
1507 * For the attribute fork we allow the caller to tell us whether
1508 * the unlink of the inode that led to this call is yet permanent
1509 * in the on disk log. If it is not and we will be freeing extents
1510 * in this inode then we make the first transaction synchronous
1511 * to make sure that the unlink is permanent by the time we free
1512 * the blocks.
1513 */
1514 if (fork == XFS_DATA_FORK) {
1515 if (ip->i_d.di_nextents > 0) {
1516 /*
1517 * If we are not changing the file size then do
1518 * not update the on-disk file size - we may be
1519 * called from xfs_inactive_free_eofblocks(). If we
1520 * update the on-disk file size and then the system
1521 * crashes before the contents of the file are
1522 * flushed to disk then the files may be full of
1523 * holes (ie NULL files bug).
1524 */
1525 if (ip->i_size != new_size) {
1526 ip->i_d.di_size = new_size;
1527 ip->i_size = new_size;
1528 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1529 }
1530 }
1531 } else if (sync) {
1532 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
1533 if (ip->i_d.di_anextents > 0)
1534 xfs_trans_set_sync(ntp);
1535 }
1536 ASSERT(fork == XFS_DATA_FORK ||
1537 (fork == XFS_ATTR_FORK &&
1538 ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
1539 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
1540 1266
1541 /* 1267 /*
1542 * Since it is possible for space to become allocated beyond 1268 * Since it is possible for space to become allocated beyond
@@ -1547,128 +1273,142 @@ xfs_itruncate_finish(
1547 * beyond the maximum file size (ie it is the same as last_block), 1273 * beyond the maximum file size (ie it is the same as last_block),
1548 * then there is nothing to do. 1274 * then there is nothing to do.
1549 */ 1275 */
1276 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1550 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); 1277 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1551 ASSERT(first_unmap_block <= last_block); 1278 if (first_unmap_block == last_block)
1552 done = 0; 1279 return 0;
1553 if (last_block == first_unmap_block) { 1280
1554 done = 1; 1281 ASSERT(first_unmap_block < last_block);
1555 } else { 1282 unmap_len = last_block - first_unmap_block + 1;
1556 unmap_len = last_block - first_unmap_block + 1;
1557 }
1558 while (!done) { 1283 while (!done) {
1559 /*
1560 * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi()
1561 * will tell us whether it freed the entire range or
1562 * not. If this is a synchronous mount (wsync),
1563 * then we can tell bunmapi to keep all the
1564 * transactions asynchronous since the unlink
1565 * transaction that made this inode inactive has
1566 * already hit the disk. There's no danger of
1567 * the freed blocks being reused, there being a
1568 * crash, and the reused blocks suddenly reappearing
1569 * in this file with garbage in them once recovery
1570 * runs.
1571 */
1572 xfs_bmap_init(&free_list, &first_block); 1284 xfs_bmap_init(&free_list, &first_block);
1573 error = xfs_bunmapi(ntp, ip, 1285 error = xfs_bunmapi(tp, ip,
1574 first_unmap_block, unmap_len, 1286 first_unmap_block, unmap_len,
1575 xfs_bmapi_aflag(fork), 1287 xfs_bmapi_aflag(whichfork),
1576 XFS_ITRUNC_MAX_EXTENTS, 1288 XFS_ITRUNC_MAX_EXTENTS,
1577 &first_block, &free_list, 1289 &first_block, &free_list,
1578 &done); 1290 &done);
1579 if (error) { 1291 if (error)
1580 /* 1292 goto out_bmap_cancel;
1581 * If the bunmapi call encounters an error,
1582 * return to the caller where the transaction
1583 * can be properly aborted. We just need to
1584 * make sure we're not holding any resources
1585 * that we were not when we came in.
1586 */
1587 xfs_bmap_cancel(&free_list);
1588 return error;
1589 }
1590 1293
1591 /* 1294 /*
1592 * Duplicate the transaction that has the permanent 1295 * Duplicate the transaction that has the permanent
1593 * reservation and commit the old transaction. 1296 * reservation and commit the old transaction.
1594 */ 1297 */
1595 error = xfs_bmap_finish(tp, &free_list, &committed); 1298 error = xfs_bmap_finish(&tp, &free_list, &committed);
1596 ntp = *tp;
1597 if (committed) 1299 if (committed)
1598 xfs_trans_ijoin(ntp, ip); 1300 xfs_trans_ijoin(tp, ip);
1599 1301 if (error)
1600 if (error) { 1302 goto out_bmap_cancel;
1601 /*
1602 * If the bmap finish call encounters an error, return
1603 * to the caller where the transaction can be properly
1604 * aborted. We just need to make sure we're not
1605 * holding any resources that we were not when we came
1606 * in.
1607 *
1608 * Aborting from this point might lose some blocks in
1609 * the file system, but oh well.
1610 */
1611 xfs_bmap_cancel(&free_list);
1612 return error;
1613 }
1614 1303
1615 if (committed) { 1304 if (committed) {
1616 /* 1305 /*
1617 * Mark the inode dirty so it will be logged and 1306 * Mark the inode dirty so it will be logged and
1618 * moved forward in the log as part of every commit. 1307 * moved forward in the log as part of every commit.
1619 */ 1308 */
1620 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1309 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1621 } 1310 }
1622 1311
1623 ntp = xfs_trans_dup(ntp); 1312 ntp = xfs_trans_dup(tp);
1624 error = xfs_trans_commit(*tp, 0); 1313 error = xfs_trans_commit(tp, 0);
1625 *tp = ntp; 1314 tp = ntp;
1626 1315
1627 xfs_trans_ijoin(ntp, ip); 1316 xfs_trans_ijoin(tp, ip);
1628 1317
1629 if (error) 1318 if (error)
1630 return error; 1319 goto out;
1320
1631 /* 1321 /*
1632 * transaction commit worked ok so we can drop the extra ticket 1322 * Transaction commit worked ok so we can drop the extra ticket
1633 * reference that we gained in xfs_trans_dup() 1323 * reference that we gained in xfs_trans_dup()
1634 */ 1324 */
1635 xfs_log_ticket_put(ntp->t_ticket); 1325 xfs_log_ticket_put(tp->t_ticket);
1636 error = xfs_trans_reserve(ntp, 0, 1326 error = xfs_trans_reserve(tp, 0,
1637 XFS_ITRUNCATE_LOG_RES(mp), 0, 1327 XFS_ITRUNCATE_LOG_RES(mp), 0,
1638 XFS_TRANS_PERM_LOG_RES, 1328 XFS_TRANS_PERM_LOG_RES,
1639 XFS_ITRUNCATE_LOG_COUNT); 1329 XFS_ITRUNCATE_LOG_COUNT);
1640 if (error) 1330 if (error)
1641 return error; 1331 goto out;
1642 } 1332 }
1333
1334out:
1335 *tpp = tp;
1336 return error;
1337out_bmap_cancel:
1643 /* 1338 /*
1644 * Only update the size in the case of the data fork, but 1339 * If the bunmapi call encounters an error, return to the caller where
1645 * always re-log the inode so that our permanent transaction 1340 * the transaction can be properly aborted. We just need to make sure
1646 * can keep on rolling it forward in the log. 1341 * we're not holding any resources that we were not when we came in.
1647 */ 1342 */
1648 if (fork == XFS_DATA_FORK) { 1343 xfs_bmap_cancel(&free_list);
1649 xfs_isize_check(mp, ip, new_size); 1344 goto out;
1345}
1346
1347int
1348xfs_itruncate_data(
1349 struct xfs_trans **tpp,
1350 struct xfs_inode *ip,
1351 xfs_fsize_t new_size)
1352{
1353 int error;
1354
1355 trace_xfs_itruncate_data_start(ip, new_size);
1356
1357 /*
1358 * The first thing we do is set the size to new_size permanently on
1359 * disk. This way we don't have to worry about anyone ever being able
1360 * to look at the data being freed even in the face of a crash.
1361 * What we're getting around here is the case where we free a block, it
1362 * is allocated to another file, it is written to, and then we crash.
1363 * If the new data gets written to the file but the log buffers
1364 * containing the free and reallocation don't, then we'd end up with
1365 * garbage in the blocks being freed. As long as we make the new_size
1366 * permanent before actually freeing any blocks it doesn't matter if
1367 * they get written to.
1368 */
1369 if (ip->i_d.di_nextents > 0) {
1650 /* 1370 /*
1651 * If we are not changing the file size then do 1371 * If we are not changing the file size then do not update
1652 * not update the on-disk file size - we may be 1372 * the on-disk file size - we may be called from
1653 * called from xfs_inactive_free_eofblocks(). If we 1373 * xfs_inactive_free_eofblocks(). If we update the on-disk
1654 * update the on-disk file size and then the system 1374 * file size and then the system crashes before the contents
1655 * crashes before the contents of the file are 1375 * of the file are flushed to disk then the files may be
1656 * flushed to disk then the files may be full of 1376 * full of holes (ie NULL files bug).
1657 * holes (ie NULL files bug).
1658 */ 1377 */
1659 if (ip->i_size != new_size) { 1378 if (ip->i_size != new_size) {
1660 ip->i_d.di_size = new_size; 1379 ip->i_d.di_size = new_size;
1661 ip->i_size = new_size; 1380 ip->i_size = new_size;
1381 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1662 } 1382 }
1663 } 1383 }
1664 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1384
1665 ASSERT((new_size != 0) || 1385 error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
1666 (fork == XFS_ATTR_FORK) || 1386 if (error)
1667 (ip->i_delayed_blks == 0)); 1387 return error;
1668 ASSERT((new_size != 0) || 1388
1669 (fork == XFS_ATTR_FORK) || 1389 /*
1670 (ip->i_d.di_nextents == 0)); 1390 * If we are not changing the file size then do not update the on-disk
1671 trace_xfs_itruncate_finish_end(ip, new_size); 1391 * file size - we may be called from xfs_inactive_free_eofblocks().
1392 * If we update the on-disk file size and then the system crashes
1393 * before the contents of the file are flushed to disk then the files
1394 * may be full of holes (ie NULL files bug).
1395 */
1396 xfs_isize_check(ip, new_size);
1397 if (ip->i_size != new_size) {
1398 ip->i_d.di_size = new_size;
1399 ip->i_size = new_size;
1400 }
1401
1402 ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
1403 ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
1404
1405 /*
1406 * Always re-log the inode so that our permanent transaction can keep
1407 * on rolling it forward in the log.
1408 */
1409 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1410
1411 trace_xfs_itruncate_data_end(ip, new_size);
1672 return 0; 1412 return 0;
1673} 1413}
1674 1414
@@ -1694,7 +1434,6 @@ xfs_iunlink(
1694 1434
1695 ASSERT(ip->i_d.di_nlink == 0); 1435 ASSERT(ip->i_d.di_nlink == 0);
1696 ASSERT(ip->i_d.di_mode != 0); 1436 ASSERT(ip->i_d.di_mode != 0);
1697 ASSERT(ip->i_transp == tp);
1698 1437
1699 mp = tp->t_mountp; 1438 mp = tp->t_mountp;
1700 1439
@@ -1717,7 +1456,7 @@ xfs_iunlink(
1717 ASSERT(agi->agi_unlinked[bucket_index]); 1456 ASSERT(agi->agi_unlinked[bucket_index]);
1718 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); 1457 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1719 1458
1720 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { 1459 if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
1721 /* 1460 /*
1722 * There is already another inode in the bucket we need 1461 * There is already another inode in the bucket we need
1723 * to add ourselves to. Add us at the front of the list. 1462 * to add ourselves to. Add us at the front of the list.
@@ -1728,8 +1467,7 @@ xfs_iunlink(
1728 if (error) 1467 if (error)
1729 return error; 1468 return error;
1730 1469
1731 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); 1470 ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1732 /* both on-disk, don't endian flip twice */
1733 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1471 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1734 offset = ip->i_imap.im_boffset + 1472 offset = ip->i_imap.im_boffset +
1735 offsetof(xfs_dinode_t, di_next_unlinked); 1473 offsetof(xfs_dinode_t, di_next_unlinked);
@@ -1794,7 +1532,7 @@ xfs_iunlink_remove(
1794 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1532 agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1795 ASSERT(agino != 0); 1533 ASSERT(agino != 0);
1796 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1534 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1797 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); 1535 ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
1798 ASSERT(agi->agi_unlinked[bucket_index]); 1536 ASSERT(agi->agi_unlinked[bucket_index]);
1799 1537
1800 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { 1538 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
@@ -1959,7 +1697,7 @@ xfs_ifree_cluster(
1959 * stale first, we will not attempt to lock them in the loop 1697 * stale first, we will not attempt to lock them in the loop
1960 * below as the XFS_ISTALE flag will be set. 1698 * below as the XFS_ISTALE flag will be set.
1961 */ 1699 */
1962 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1700 lip = bp->b_fspriv;
1963 while (lip) { 1701 while (lip) {
1964 if (lip->li_type == XFS_LI_INODE) { 1702 if (lip->li_type == XFS_LI_INODE) {
1965 iip = (xfs_inode_log_item_t *)lip; 1703 iip = (xfs_inode_log_item_t *)lip;
@@ -2086,7 +1824,6 @@ xfs_ifree(
2086 xfs_buf_t *ibp; 1824 xfs_buf_t *ibp;
2087 1825
2088 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1826 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2089 ASSERT(ip->i_transp == tp);
2090 ASSERT(ip->i_d.di_nlink == 0); 1827 ASSERT(ip->i_d.di_nlink == 0);
2091 ASSERT(ip->i_d.di_nextents == 0); 1828 ASSERT(ip->i_d.di_nextents == 0);
2092 ASSERT(ip->i_d.di_anextents == 0); 1829 ASSERT(ip->i_d.di_anextents == 0);
@@ -2733,7 +2470,7 @@ cluster_corrupt_out:
2733 * mark the buffer as an error and call them. Otherwise 2470 * mark the buffer as an error and call them. Otherwise
2734 * mark it as stale and brelse. 2471 * mark it as stale and brelse.
2735 */ 2472 */
2736 if (XFS_BUF_IODONE_FUNC(bp)) { 2473 if (bp->b_iodone) {
2737 XFS_BUF_UNDONE(bp); 2474 XFS_BUF_UNDONE(bp);
2738 XFS_BUF_STALE(bp); 2475 XFS_BUF_STALE(bp);
2739 XFS_BUF_ERROR(bp,EIO); 2476 XFS_BUF_ERROR(bp,EIO);
@@ -2920,7 +2657,7 @@ xfs_iflush_int(
2920 */ 2657 */
2921 xfs_synchronize_times(ip); 2658 xfs_synchronize_times(ip);
2922 2659
2923 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2660 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
2924 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2661 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2925 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2662 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2926 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2663 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
@@ -3073,8 +2810,8 @@ xfs_iflush_int(
3073 */ 2810 */
3074 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); 2811 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3075 2812
3076 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 2813 ASSERT(bp->b_fspriv != NULL);
3077 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); 2814 ASSERT(bp->b_iodone != NULL);
3078 } else { 2815 } else {
3079 /* 2816 /*
3080 * We're flushing an inode which is not in the AIL and has 2817 * We're flushing an inode which is not in the AIL and has