diff options
author | Joel Becker <jlbec@evilplan.org> | 2011-08-22 00:02:57 -0400 |
---|---|---|
committer | Joel Becker <jlbec@evilplan.org> | 2011-08-22 00:02:57 -0400 |
commit | 99b1bb61b225c3eb4d3b196d4f1d041695b19a7e (patch) | |
tree | 06cabdc34538f3b38a39e3b802ecc1a2ab2aae00 /fs/xfs/xfs_inode.c | |
parent | c7e25e6e0b0486492c5faaf6312b37413642c48e (diff) | |
parent | 93862d5e1ab875664c6cc95254fc365028a48bb1 (diff) |
Merge branch 'mw-3.1-jul25' of git://oss.oracle.com/git/smushran/linux-2.6 into ocfs2-fixes
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r-- | fs/xfs/xfs_inode.c | 537 |
1 file changed, 137 insertions, 400 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a098a20ca63e..3cc21ddf9f7e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -37,7 +37,6 @@ | |||
37 | #include "xfs_buf_item.h" | 37 | #include "xfs_buf_item.h" |
38 | #include "xfs_inode_item.h" | 38 | #include "xfs_inode_item.h" |
39 | #include "xfs_btree.h" | 39 | #include "xfs_btree.h" |
40 | #include "xfs_btree_trace.h" | ||
41 | #include "xfs_alloc.h" | 40 | #include "xfs_alloc.h" |
42 | #include "xfs_ialloc.h" | 41 | #include "xfs_ialloc.h" |
43 | #include "xfs_bmap.h" | 42 | #include "xfs_bmap.h" |
@@ -52,7 +51,7 @@ kmem_zone_t *xfs_ifork_zone; | |||
52 | kmem_zone_t *xfs_inode_zone; | 51 | kmem_zone_t *xfs_inode_zone; |
53 | 52 | ||
54 | /* | 53 | /* |
55 | * Used in xfs_itruncate(). This is the maximum number of extents | 54 | * Used in xfs_itruncate_extents(). This is the maximum number of extents |
56 | * freed from a file in a single transaction. | 55 | * freed from a file in a single transaction. |
57 | */ | 56 | */ |
58 | #define XFS_ITRUNC_MAX_EXTENTS 2 | 57 | #define XFS_ITRUNC_MAX_EXTENTS 2 |
@@ -167,7 +166,7 @@ xfs_imap_to_bp( | |||
167 | 166 | ||
168 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, | 167 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, |
169 | (i << mp->m_sb.sb_inodelog)); | 168 | (i << mp->m_sb.sb_inodelog)); |
170 | di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC && | 169 | di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && |
171 | XFS_DINODE_GOOD_VERSION(dip->di_version); | 170 | XFS_DINODE_GOOD_VERSION(dip->di_version); |
172 | if (unlikely(XFS_TEST_ERROR(!di_ok, mp, | 171 | if (unlikely(XFS_TEST_ERROR(!di_ok, mp, |
173 | XFS_ERRTAG_ITOBP_INOTOBP, | 172 | XFS_ERRTAG_ITOBP_INOTOBP, |
@@ -802,7 +801,7 @@ xfs_iread( | |||
802 | * If we got something that isn't an inode it means someone | 801 | * If we got something that isn't an inode it means someone |
803 | * (nfs or dmi) has a stale handle. | 802 | * (nfs or dmi) has a stale handle. |
804 | */ | 803 | */ |
805 | if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { | 804 | if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) { |
806 | #ifdef DEBUG | 805 | #ifdef DEBUG |
807 | xfs_alert(mp, | 806 | xfs_alert(mp, |
808 | "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", | 807 | "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", |
@@ -1179,15 +1178,15 @@ xfs_ialloc( | |||
1179 | * at least do it for regular files. | 1178 | * at least do it for regular files. |
1180 | */ | 1179 | */ |
1181 | #ifdef DEBUG | 1180 | #ifdef DEBUG |
1182 | void | 1181 | STATIC void |
1183 | xfs_isize_check( | 1182 | xfs_isize_check( |
1184 | xfs_mount_t *mp, | 1183 | struct xfs_inode *ip, |
1185 | xfs_inode_t *ip, | 1184 | xfs_fsize_t isize) |
1186 | xfs_fsize_t isize) | ||
1187 | { | 1185 | { |
1188 | xfs_fileoff_t map_first; | 1186 | struct xfs_mount *mp = ip->i_mount; |
1189 | int nimaps; | 1187 | xfs_fileoff_t map_first; |
1190 | xfs_bmbt_irec_t imaps[2]; | 1188 | int nimaps; |
1189 | xfs_bmbt_irec_t imaps[2]; | ||
1191 | 1190 | ||
1192 | if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) | 1191 | if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) |
1193 | return; | 1192 | return; |
@@ -1214,168 +1213,14 @@ xfs_isize_check( | |||
1214 | ASSERT(nimaps == 1); | 1213 | ASSERT(nimaps == 1); |
1215 | ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); | 1214 | ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); |
1216 | } | 1215 | } |
1216 | #else /* DEBUG */ | ||
1217 | #define xfs_isize_check(ip, isize) | ||
1217 | #endif /* DEBUG */ | 1218 | #endif /* DEBUG */ |
1218 | 1219 | ||
1219 | /* | 1220 | /* |
1220 | * Calculate the last possible buffered byte in a file. This must | 1221 | * Free up the underlying blocks past new_size. The new size must be smaller |
1221 | * include data that was buffered beyond the EOF by the write code. | 1222 | * than the current size. This routine can be used both for the attribute and |
1222 | * This also needs to deal with overflowing the xfs_fsize_t type | 1223 | * data fork, and does not modify the inode size, which is left to the caller. |
1223 | * which can happen for sizes near the limit. | ||
1224 | * | ||
1225 | * We also need to take into account any blocks beyond the EOF. It | ||
1226 | * may be the case that they were buffered by a write which failed. | ||
1227 | * In that case the pages will still be in memory, but the inode size | ||
1228 | * will never have been updated. | ||
1229 | */ | ||
1230 | STATIC xfs_fsize_t | ||
1231 | xfs_file_last_byte( | ||
1232 | xfs_inode_t *ip) | ||
1233 | { | ||
1234 | xfs_mount_t *mp; | ||
1235 | xfs_fsize_t last_byte; | ||
1236 | xfs_fileoff_t last_block; | ||
1237 | xfs_fileoff_t size_last_block; | ||
1238 | int error; | ||
1239 | |||
1240 | ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); | ||
1241 | |||
1242 | mp = ip->i_mount; | ||
1243 | /* | ||
1244 | * Only check for blocks beyond the EOF if the extents have | ||
1245 | * been read in. This eliminates the need for the inode lock, | ||
1246 | * and it also saves us from looking when it really isn't | ||
1247 | * necessary. | ||
1248 | */ | ||
1249 | if (ip->i_df.if_flags & XFS_IFEXTENTS) { | ||
1250 | xfs_ilock(ip, XFS_ILOCK_SHARED); | ||
1251 | error = xfs_bmap_last_offset(NULL, ip, &last_block, | ||
1252 | XFS_DATA_FORK); | ||
1253 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | ||
1254 | if (error) { | ||
1255 | last_block = 0; | ||
1256 | } | ||
1257 | } else { | ||
1258 | last_block = 0; | ||
1259 | } | ||
1260 | size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size); | ||
1261 | last_block = XFS_FILEOFF_MAX(last_block, size_last_block); | ||
1262 | |||
1263 | last_byte = XFS_FSB_TO_B(mp, last_block); | ||
1264 | if (last_byte < 0) { | ||
1265 | return XFS_MAXIOFFSET(mp); | ||
1266 | } | ||
1267 | last_byte += (1 << mp->m_writeio_log); | ||
1268 | if (last_byte < 0) { | ||
1269 | return XFS_MAXIOFFSET(mp); | ||
1270 | } | ||
1271 | return last_byte; | ||
1272 | } | ||
1273 | |||
1274 | /* | ||
1275 | * Start the truncation of the file to new_size. The new size | ||
1276 | * must be smaller than the current size. This routine will | ||
1277 | * clear the buffer and page caches of file data in the removed | ||
1278 | * range, and xfs_itruncate_finish() will remove the underlying | ||
1279 | * disk blocks. | ||
1280 | * | ||
1281 | * The inode must have its I/O lock locked EXCLUSIVELY, and it | ||
1282 | * must NOT have the inode lock held at all. This is because we're | ||
1283 | * calling into the buffer/page cache code and we can't hold the | ||
1284 | * inode lock when we do so. | ||
1285 | * | ||
1286 | * We need to wait for any direct I/Os in flight to complete before we | ||
1287 | * proceed with the truncate. This is needed to prevent the extents | ||
1288 | * being read or written by the direct I/Os from being removed while the | ||
1289 | * I/O is in flight as there is no other method of synchronising | ||
1290 | * direct I/O with the truncate operation. Also, because we hold | ||
1291 | * the IOLOCK in exclusive mode, we prevent new direct I/Os from being | ||
1292 | * started until the truncate completes and drops the lock. Essentially, | ||
1293 | * the xfs_ioend_wait() call forms an I/O barrier that provides strict | ||
1294 | * ordering between direct I/Os and the truncate operation. | ||
1295 | * | ||
1296 | * The flags parameter can have either the value XFS_ITRUNC_DEFINITE | ||
1297 | * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used | ||
1298 | * in the case that the caller is locking things out of order and | ||
1299 | * may not be able to call xfs_itruncate_finish() with the inode lock | ||
1300 | * held without dropping the I/O lock. If the caller must drop the | ||
1301 | * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() | ||
1302 | * must be called again with all the same restrictions as the initial | ||
1303 | * call. | ||
1304 | */ | ||
1305 | int | ||
1306 | xfs_itruncate_start( | ||
1307 | xfs_inode_t *ip, | ||
1308 | uint flags, | ||
1309 | xfs_fsize_t new_size) | ||
1310 | { | ||
1311 | xfs_fsize_t last_byte; | ||
1312 | xfs_off_t toss_start; | ||
1313 | xfs_mount_t *mp; | ||
1314 | int error = 0; | ||
1315 | |||
1316 | ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); | ||
1317 | ASSERT((new_size == 0) || (new_size <= ip->i_size)); | ||
1318 | ASSERT((flags == XFS_ITRUNC_DEFINITE) || | ||
1319 | (flags == XFS_ITRUNC_MAYBE)); | ||
1320 | |||
1321 | mp = ip->i_mount; | ||
1322 | |||
1323 | /* wait for the completion of any pending DIOs */ | ||
1324 | if (new_size == 0 || new_size < ip->i_size) | ||
1325 | xfs_ioend_wait(ip); | ||
1326 | |||
1327 | /* | ||
1328 | * Call toss_pages or flushinval_pages to get rid of pages | ||
1329 | * overlapping the region being removed. We have to use | ||
1330 | * the less efficient flushinval_pages in the case that the | ||
1331 | * caller may not be able to finish the truncate without | ||
1332 | * dropping the inode's I/O lock. Make sure | ||
1333 | * to catch any pages brought in by buffers overlapping | ||
1334 | * the EOF by searching out beyond the isize by our | ||
1335 | * block size. We round new_size up to a block boundary | ||
1336 | * so that we don't toss things on the same block as | ||
1337 | * new_size but before it. | ||
1338 | * | ||
1339 | * Before calling toss_page or flushinval_pages, make sure to | ||
1340 | * call remapf() over the same region if the file is mapped. | ||
1341 | * This frees up mapped file references to the pages in the | ||
1342 | * given range and for the flushinval_pages case it ensures | ||
1343 | * that we get the latest mapped changes flushed out. | ||
1344 | */ | ||
1345 | toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); | ||
1346 | toss_start = XFS_FSB_TO_B(mp, toss_start); | ||
1347 | if (toss_start < 0) { | ||
1348 | /* | ||
1349 | * The place to start tossing is beyond our maximum | ||
1350 | * file size, so there is no way that the data extended | ||
1351 | * out there. | ||
1352 | */ | ||
1353 | return 0; | ||
1354 | } | ||
1355 | last_byte = xfs_file_last_byte(ip); | ||
1356 | trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte); | ||
1357 | if (last_byte > toss_start) { | ||
1358 | if (flags & XFS_ITRUNC_DEFINITE) { | ||
1359 | xfs_tosspages(ip, toss_start, | ||
1360 | -1, FI_REMAPF_LOCKED); | ||
1361 | } else { | ||
1362 | error = xfs_flushinval_pages(ip, toss_start, | ||
1363 | -1, FI_REMAPF_LOCKED); | ||
1364 | } | ||
1365 | } | ||
1366 | |||
1367 | #ifdef DEBUG | ||
1368 | if (new_size == 0) { | ||
1369 | ASSERT(VN_CACHED(VFS_I(ip)) == 0); | ||
1370 | } | ||
1371 | #endif | ||
1372 | return error; | ||
1373 | } | ||
1374 | |||
1375 | /* | ||
1376 | * Shrink the file to the given new_size. The new size must be smaller than | ||
1377 | * the current size. This will free up the underlying blocks in the removed | ||
1378 | * range after a call to xfs_itruncate_start() or xfs_atruncate_start(). | ||
1379 | * | 1224 | * |
1380 | * The transaction passed to this routine must have made a permanent log | 1225 | * The transaction passed to this routine must have made a permanent log |
1381 | * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the | 1226 | * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the |
@@ -1387,31 +1232,6 @@ xfs_itruncate_start( | |||
1387 | * will be "held" within the returned transaction. This routine does NOT | 1232 | * will be "held" within the returned transaction. This routine does NOT |
1388 | * require any disk space to be reserved for it within the transaction. | 1233 | * require any disk space to be reserved for it within the transaction. |
1389 | * | 1234 | * |
1390 | * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it | ||
1391 | * indicates the fork which is to be truncated. For the attribute fork we only | ||
1392 | * support truncation to size 0. | ||
1393 | * | ||
1394 | * We use the sync parameter to indicate whether or not the first transaction | ||
1395 | * we perform might have to be synchronous. For the attr fork, it needs to be | ||
1396 | * so if the unlink of the inode is not yet known to be permanent in the log. | ||
1397 | * This keeps us from freeing and reusing the blocks of the attribute fork | ||
1398 | * before the unlink of the inode becomes permanent. | ||
1399 | * | ||
1400 | * For the data fork, we normally have to run synchronously if we're being | ||
1401 | * called out of the inactive path or we're being called out of the create path | ||
1402 | * where we're truncating an existing file. Either way, the truncate needs to | ||
1403 | * be sync so blocks don't reappear in the file with altered data in case of a | ||
1404 | * crash. wsync filesystems can run the first case async because anything that | ||
1405 | * shrinks the inode has to run sync so by the time we're called here from | ||
1406 | * inactive, the inode size is permanently set to 0. | ||
1407 | * | ||
1408 | * Calls from the truncate path always need to be sync unless we're in a wsync | ||
1409 | * filesystem and the file has already been unlinked. | ||
1410 | * | ||
1411 | * The caller is responsible for correctly setting the sync parameter. It gets | ||
1412 | * too hard for us to guess here which path we're being called out of just | ||
1413 | * based on inode state. | ||
1414 | * | ||
1415 | * If we get an error, we must return with the inode locked and linked into the | 1235 | * If we get an error, we must return with the inode locked and linked into the |
1416 | * current transaction. This keeps things simple for the higher level code, | 1236 | * current transaction. This keeps things simple for the higher level code, |
1417 | * because it always knows that the inode is locked and held in the transaction | 1237 | * because it always knows that the inode is locked and held in the transaction |
@@ -1419,124 +1239,30 @@ xfs_itruncate_start( | |||
1419 | * dirty on error so that transactions can be easily aborted if possible. | 1239 | * dirty on error so that transactions can be easily aborted if possible. |
1420 | */ | 1240 | */ |
1421 | int | 1241 | int |
1422 | xfs_itruncate_finish( | 1242 | xfs_itruncate_extents( |
1423 | xfs_trans_t **tp, | 1243 | struct xfs_trans **tpp, |
1424 | xfs_inode_t *ip, | 1244 | struct xfs_inode *ip, |
1425 | xfs_fsize_t new_size, | 1245 | int whichfork, |
1426 | int fork, | 1246 | xfs_fsize_t new_size) |
1427 | int sync) | ||
1428 | { | 1247 | { |
1429 | xfs_fsblock_t first_block; | 1248 | struct xfs_mount *mp = ip->i_mount; |
1430 | xfs_fileoff_t first_unmap_block; | 1249 | struct xfs_trans *tp = *tpp; |
1431 | xfs_fileoff_t last_block; | 1250 | struct xfs_trans *ntp; |
1432 | xfs_filblks_t unmap_len=0; | 1251 | xfs_bmap_free_t free_list; |
1433 | xfs_mount_t *mp; | 1252 | xfs_fsblock_t first_block; |
1434 | xfs_trans_t *ntp; | 1253 | xfs_fileoff_t first_unmap_block; |
1435 | int done; | 1254 | xfs_fileoff_t last_block; |
1436 | int committed; | 1255 | xfs_filblks_t unmap_len; |
1437 | xfs_bmap_free_t free_list; | 1256 | int committed; |
1438 | int error; | 1257 | int error = 0; |
1258 | int done = 0; | ||
1439 | 1259 | ||
1440 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); | 1260 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); |
1441 | ASSERT((new_size == 0) || (new_size <= ip->i_size)); | 1261 | ASSERT(new_size <= ip->i_size); |
1442 | ASSERT(*tp != NULL); | 1262 | ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); |
1443 | ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); | ||
1444 | ASSERT(ip->i_transp == *tp); | ||
1445 | ASSERT(ip->i_itemp != NULL); | 1263 | ASSERT(ip->i_itemp != NULL); |
1446 | ASSERT(ip->i_itemp->ili_lock_flags == 0); | 1264 | ASSERT(ip->i_itemp->ili_lock_flags == 0); |
1447 | 1265 | ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); | |
1448 | |||
1449 | ntp = *tp; | ||
1450 | mp = (ntp)->t_mountp; | ||
1451 | ASSERT(! XFS_NOT_DQATTACHED(mp, ip)); | ||
1452 | |||
1453 | /* | ||
1454 | * We only support truncating the entire attribute fork. | ||
1455 | */ | ||
1456 | if (fork == XFS_ATTR_FORK) { | ||
1457 | new_size = 0LL; | ||
1458 | } | ||
1459 | first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); | ||
1460 | trace_xfs_itruncate_finish_start(ip, new_size); | ||
1461 | |||
1462 | /* | ||
1463 | * The first thing we do is set the size to new_size permanently | ||
1464 | * on disk. This way we don't have to worry about anyone ever | ||
1465 | * being able to look at the data being freed even in the face | ||
1466 | * of a crash. What we're getting around here is the case where | ||
1467 | * we free a block, it is allocated to another file, it is written | ||
1468 | * to, and then we crash. If the new data gets written to the | ||
1469 | * file but the log buffers containing the free and reallocation | ||
1470 | * don't, then we'd end up with garbage in the blocks being freed. | ||
1471 | * As long as we make the new_size permanent before actually | ||
1472 | * freeing any blocks it doesn't matter if they get written to. | ||
1473 | * | ||
1474 | * The callers must signal into us whether or not the size | ||
1475 | * setting here must be synchronous. There are a few cases | ||
1476 | * where it doesn't have to be synchronous. Those cases | ||
1477 | * occur if the file is unlinked and we know the unlink is | ||
1478 | * permanent or if the blocks being truncated are guaranteed | ||
1479 | * to be beyond the inode eof (regardless of the link count) | ||
1480 | * and the eof value is permanent. Both of these cases occur | ||
1481 | * only on wsync-mounted filesystems. In those cases, we're | ||
1482 | * guaranteed that no user will ever see the data in the blocks | ||
1483 | * that are being truncated so the truncate can run async. | ||
1484 | * In the free beyond eof case, the file may wind up with | ||
1485 | * more blocks allocated to it than it needs if we crash | ||
1486 | * and that won't get fixed until the next time the file | ||
1487 | * is re-opened and closed but that's ok as that shouldn't | ||
1488 | * be too many blocks. | ||
1489 | * | ||
1490 | * However, we can't just make all wsync xactions run async | ||
1491 | * because there's one call out of the create path that needs | ||
1492 | * to run sync where it's truncating an existing file to size | ||
1493 | * 0 whose size is > 0. | ||
1494 | * | ||
1495 | * It's probably possible to come up with a test in this | ||
1496 | * routine that would correctly distinguish all the above | ||
1497 | * cases from the values of the function parameters and the | ||
1498 | * inode state but for sanity's sake, I've decided to let the | ||
1499 | * layers above just tell us. It's simpler to correctly figure | ||
1500 | * out in the layer above exactly under what conditions we | ||
1501 | * can run async and I think it's easier for others read and | ||
1502 | * follow the logic in case something has to be changed. | ||
1503 | * cscope is your friend -- rcc. | ||
1504 | * | ||
1505 | * The attribute fork is much simpler. | ||
1506 | * | ||
1507 | * For the attribute fork we allow the caller to tell us whether | ||
1508 | * the unlink of the inode that led to this call is yet permanent | ||
1509 | * in the on disk log. If it is not and we will be freeing extents | ||
1510 | * in this inode then we make the first transaction synchronous | ||
1511 | * to make sure that the unlink is permanent by the time we free | ||
1512 | * the blocks. | ||
1513 | */ | ||
1514 | if (fork == XFS_DATA_FORK) { | ||
1515 | if (ip->i_d.di_nextents > 0) { | ||
1516 | /* | ||
1517 | * If we are not changing the file size then do | ||
1518 | * not update the on-disk file size - we may be | ||
1519 | * called from xfs_inactive_free_eofblocks(). If we | ||
1520 | * update the on-disk file size and then the system | ||
1521 | * crashes before the contents of the file are | ||
1522 | * flushed to disk then the files may be full of | ||
1523 | * holes (ie NULL files bug). | ||
1524 | */ | ||
1525 | if (ip->i_size != new_size) { | ||
1526 | ip->i_d.di_size = new_size; | ||
1527 | ip->i_size = new_size; | ||
1528 | xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); | ||
1529 | } | ||
1530 | } | ||
1531 | } else if (sync) { | ||
1532 | ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); | ||
1533 | if (ip->i_d.di_anextents > 0) | ||
1534 | xfs_trans_set_sync(ntp); | ||
1535 | } | ||
1536 | ASSERT(fork == XFS_DATA_FORK || | ||
1537 | (fork == XFS_ATTR_FORK && | ||
1538 | ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) || | ||
1539 | (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC))))); | ||
1540 | 1266 | ||
1541 | /* | 1267 | /* |
1542 | * Since it is possible for space to become allocated beyond | 1268 | * Since it is possible for space to become allocated beyond |
@@ -1547,128 +1273,142 @@ xfs_itruncate_finish( | |||
1547 | * beyond the maximum file size (ie it is the same as last_block), | 1273 | * beyond the maximum file size (ie it is the same as last_block), |
1548 | * then there is nothing to do. | 1274 | * then there is nothing to do. |
1549 | */ | 1275 | */ |
1276 | first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); | ||
1550 | last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); | 1277 | last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); |
1551 | ASSERT(first_unmap_block <= last_block); | 1278 | if (first_unmap_block == last_block) |
1552 | done = 0; | 1279 | return 0; |
1553 | if (last_block == first_unmap_block) { | 1280 | |
1554 | done = 1; | 1281 | ASSERT(first_unmap_block < last_block); |
1555 | } else { | 1282 | unmap_len = last_block - first_unmap_block + 1; |
1556 | unmap_len = last_block - first_unmap_block + 1; | ||
1557 | } | ||
1558 | while (!done) { | 1283 | while (!done) { |
1559 | /* | ||
1560 | * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() | ||
1561 | * will tell us whether it freed the entire range or | ||
1562 | * not. If this is a synchronous mount (wsync), | ||
1563 | * then we can tell bunmapi to keep all the | ||
1564 | * transactions asynchronous since the unlink | ||
1565 | * transaction that made this inode inactive has | ||
1566 | * already hit the disk. There's no danger of | ||
1567 | * the freed blocks being reused, there being a | ||
1568 | * crash, and the reused blocks suddenly reappearing | ||
1569 | * in this file with garbage in them once recovery | ||
1570 | * runs. | ||
1571 | */ | ||
1572 | xfs_bmap_init(&free_list, &first_block); | 1284 | xfs_bmap_init(&free_list, &first_block); |
1573 | error = xfs_bunmapi(ntp, ip, | 1285 | error = xfs_bunmapi(tp, ip, |
1574 | first_unmap_block, unmap_len, | 1286 | first_unmap_block, unmap_len, |
1575 | xfs_bmapi_aflag(fork), | 1287 | xfs_bmapi_aflag(whichfork), |
1576 | XFS_ITRUNC_MAX_EXTENTS, | 1288 | XFS_ITRUNC_MAX_EXTENTS, |
1577 | &first_block, &free_list, | 1289 | &first_block, &free_list, |
1578 | &done); | 1290 | &done); |
1579 | if (error) { | 1291 | if (error) |
1580 | /* | 1292 | goto out_bmap_cancel; |
1581 | * If the bunmapi call encounters an error, | ||
1582 | * return to the caller where the transaction | ||
1583 | * can be properly aborted. We just need to | ||
1584 | * make sure we're not holding any resources | ||
1585 | * that we were not when we came in. | ||
1586 | */ | ||
1587 | xfs_bmap_cancel(&free_list); | ||
1588 | return error; | ||
1589 | } | ||
1590 | 1293 | ||
1591 | /* | 1294 | /* |
1592 | * Duplicate the transaction that has the permanent | 1295 | * Duplicate the transaction that has the permanent |
1593 | * reservation and commit the old transaction. | 1296 | * reservation and commit the old transaction. |
1594 | */ | 1297 | */ |
1595 | error = xfs_bmap_finish(tp, &free_list, &committed); | 1298 | error = xfs_bmap_finish(&tp, &free_list, &committed); |
1596 | ntp = *tp; | ||
1597 | if (committed) | 1299 | if (committed) |
1598 | xfs_trans_ijoin(ntp, ip); | 1300 | xfs_trans_ijoin(tp, ip); |
1599 | 1301 | if (error) | |
1600 | if (error) { | 1302 | goto out_bmap_cancel; |
1601 | /* | ||
1602 | * If the bmap finish call encounters an error, return | ||
1603 | * to the caller where the transaction can be properly | ||
1604 | * aborted. We just need to make sure we're not | ||
1605 | * holding any resources that we were not when we came | ||
1606 | * in. | ||
1607 | * | ||
1608 | * Aborting from this point might lose some blocks in | ||
1609 | * the file system, but oh well. | ||
1610 | */ | ||
1611 | xfs_bmap_cancel(&free_list); | ||
1612 | return error; | ||
1613 | } | ||
1614 | 1303 | ||
1615 | if (committed) { | 1304 | if (committed) { |
1616 | /* | 1305 | /* |
1617 | * Mark the inode dirty so it will be logged and | 1306 | * Mark the inode dirty so it will be logged and |
1618 | * moved forward in the log as part of every commit. | 1307 | * moved forward in the log as part of every commit. |
1619 | */ | 1308 | */ |
1620 | xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); | 1309 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
1621 | } | 1310 | } |
1622 | 1311 | ||
1623 | ntp = xfs_trans_dup(ntp); | 1312 | ntp = xfs_trans_dup(tp); |
1624 | error = xfs_trans_commit(*tp, 0); | 1313 | error = xfs_trans_commit(tp, 0); |
1625 | *tp = ntp; | 1314 | tp = ntp; |
1626 | 1315 | ||
1627 | xfs_trans_ijoin(ntp, ip); | 1316 | xfs_trans_ijoin(tp, ip); |
1628 | 1317 | ||
1629 | if (error) | 1318 | if (error) |
1630 | return error; | 1319 | goto out; |
1320 | |||
1631 | /* | 1321 | /* |
1632 | * transaction commit worked ok so we can drop the extra ticket | 1322 | * Transaction commit worked ok so we can drop the extra ticket |
1633 | * reference that we gained in xfs_trans_dup() | 1323 | * reference that we gained in xfs_trans_dup() |
1634 | */ | 1324 | */ |
1635 | xfs_log_ticket_put(ntp->t_ticket); | 1325 | xfs_log_ticket_put(tp->t_ticket); |
1636 | error = xfs_trans_reserve(ntp, 0, | 1326 | error = xfs_trans_reserve(tp, 0, |
1637 | XFS_ITRUNCATE_LOG_RES(mp), 0, | 1327 | XFS_ITRUNCATE_LOG_RES(mp), 0, |
1638 | XFS_TRANS_PERM_LOG_RES, | 1328 | XFS_TRANS_PERM_LOG_RES, |
1639 | XFS_ITRUNCATE_LOG_COUNT); | 1329 | XFS_ITRUNCATE_LOG_COUNT); |
1640 | if (error) | 1330 | if (error) |
1641 | return error; | 1331 | goto out; |
1642 | } | 1332 | } |
1333 | |||
1334 | out: | ||
1335 | *tpp = tp; | ||
1336 | return error; | ||
1337 | out_bmap_cancel: | ||
1643 | /* | 1338 | /* |
1644 | * Only update the size in the case of the data fork, but | 1339 | * If the bunmapi call encounters an error, return to the caller where |
1645 | * always re-log the inode so that our permanent transaction | 1340 | * the transaction can be properly aborted. We just need to make sure |
1646 | * can keep on rolling it forward in the log. | 1341 | * we're not holding any resources that we were not when we came in. |
1647 | */ | 1342 | */ |
1648 | if (fork == XFS_DATA_FORK) { | 1343 | xfs_bmap_cancel(&free_list); |
1649 | xfs_isize_check(mp, ip, new_size); | 1344 | goto out; |
1345 | } | ||
1346 | |||
1347 | int | ||
1348 | xfs_itruncate_data( | ||
1349 | struct xfs_trans **tpp, | ||
1350 | struct xfs_inode *ip, | ||
1351 | xfs_fsize_t new_size) | ||
1352 | { | ||
1353 | int error; | ||
1354 | |||
1355 | trace_xfs_itruncate_data_start(ip, new_size); | ||
1356 | |||
1357 | /* | ||
1358 | * The first thing we do is set the size to new_size permanently on | ||
1359 | * disk. This way we don't have to worry about anyone ever being able | ||
1360 | * to look at the data being freed even in the face of a crash. | ||
1361 | * What we're getting around here is the case where we free a block, it | ||
1362 | * is allocated to another file, it is written to, and then we crash. | ||
1363 | * If the new data gets written to the file but the log buffers | ||
1364 | * containing the free and reallocation don't, then we'd end up with | ||
1365 | * garbage in the blocks being freed. As long as we make the new_size | ||
1366 | * permanent before actually freeing any blocks it doesn't matter if | ||
1367 | * they get written to. | ||
1368 | */ | ||
1369 | if (ip->i_d.di_nextents > 0) { | ||
1650 | /* | 1370 | /* |
1651 | * If we are not changing the file size then do | 1371 | * If we are not changing the file size then do not update |
1652 | * not update the on-disk file size - we may be | 1372 | * the on-disk file size - we may be called from |
1653 | * called from xfs_inactive_free_eofblocks(). If we | 1373 | * xfs_inactive_free_eofblocks(). If we update the on-disk |
1654 | * update the on-disk file size and then the system | 1374 | * file size and then the system crashes before the contents |
1655 | * crashes before the contents of the file are | 1375 | * of the file are flushed to disk then the files may be |
1656 | * flushed to disk then the files may be full of | 1376 | * full of holes (ie NULL files bug). |
1657 | * holes (ie NULL files bug). | ||
1658 | */ | 1377 | */ |
1659 | if (ip->i_size != new_size) { | 1378 | if (ip->i_size != new_size) { |
1660 | ip->i_d.di_size = new_size; | 1379 | ip->i_d.di_size = new_size; |
1661 | ip->i_size = new_size; | 1380 | ip->i_size = new_size; |
1381 | xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); | ||
1662 | } | 1382 | } |
1663 | } | 1383 | } |
1664 | xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); | 1384 | |
1665 | ASSERT((new_size != 0) || | 1385 | error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size); |
1666 | (fork == XFS_ATTR_FORK) || | 1386 | if (error) |
1667 | (ip->i_delayed_blks == 0)); | 1387 | return error; |
1668 | ASSERT((new_size != 0) || | 1388 | |
1669 | (fork == XFS_ATTR_FORK) || | 1389 | /* |
1670 | (ip->i_d.di_nextents == 0)); | 1390 | * If we are not changing the file size then do not update the on-disk |
1671 | trace_xfs_itruncate_finish_end(ip, new_size); | 1391 | * file size - we may be called from xfs_inactive_free_eofblocks(). |
1392 | * If we update the on-disk file size and then the system crashes | ||
1393 | * before the contents of the file are flushed to disk then the files | ||
1394 | * may be full of holes (ie NULL files bug). | ||
1395 | */ | ||
1396 | xfs_isize_check(ip, new_size); | ||
1397 | if (ip->i_size != new_size) { | ||
1398 | ip->i_d.di_size = new_size; | ||
1399 | ip->i_size = new_size; | ||
1400 | } | ||
1401 | |||
1402 | ASSERT(new_size != 0 || ip->i_delayed_blks == 0); | ||
1403 | ASSERT(new_size != 0 || ip->i_d.di_nextents == 0); | ||
1404 | |||
1405 | /* | ||
1406 | * Always re-log the inode so that our permanent transaction can keep | ||
1407 | * on rolling it forward in the log. | ||
1408 | */ | ||
1409 | xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); | ||
1410 | |||
1411 | trace_xfs_itruncate_data_end(ip, new_size); | ||
1672 | return 0; | 1412 | return 0; |
1673 | } | 1413 | } |
1674 | 1414 | ||
@@ -1694,7 +1434,6 @@ xfs_iunlink( | |||
1694 | 1434 | ||
1695 | ASSERT(ip->i_d.di_nlink == 0); | 1435 | ASSERT(ip->i_d.di_nlink == 0); |
1696 | ASSERT(ip->i_d.di_mode != 0); | 1436 | ASSERT(ip->i_d.di_mode != 0); |
1697 | ASSERT(ip->i_transp == tp); | ||
1698 | 1437 | ||
1699 | mp = tp->t_mountp; | 1438 | mp = tp->t_mountp; |
1700 | 1439 | ||
@@ -1717,7 +1456,7 @@ xfs_iunlink( | |||
1717 | ASSERT(agi->agi_unlinked[bucket_index]); | 1456 | ASSERT(agi->agi_unlinked[bucket_index]); |
1718 | ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); | 1457 | ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); |
1719 | 1458 | ||
1720 | if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { | 1459 | if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { |
1721 | /* | 1460 | /* |
1722 | * There is already another inode in the bucket we need | 1461 | * There is already another inode in the bucket we need |
1723 | * to add ourselves to. Add us at the front of the list. | 1462 | * to add ourselves to. Add us at the front of the list. |
@@ -1728,8 +1467,7 @@ xfs_iunlink( | |||
1728 | if (error) | 1467 | if (error) |
1729 | return error; | 1468 | return error; |
1730 | 1469 | ||
1731 | ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); | 1470 | ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); |
1732 | /* both on-disk, don't endian flip twice */ | ||
1733 | dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; | 1471 | dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; |
1734 | offset = ip->i_imap.im_boffset + | 1472 | offset = ip->i_imap.im_boffset + |
1735 | offsetof(xfs_dinode_t, di_next_unlinked); | 1473 | offsetof(xfs_dinode_t, di_next_unlinked); |
@@ -1794,7 +1532,7 @@ xfs_iunlink_remove( | |||
1794 | agino = XFS_INO_TO_AGINO(mp, ip->i_ino); | 1532 | agino = XFS_INO_TO_AGINO(mp, ip->i_ino); |
1795 | ASSERT(agino != 0); | 1533 | ASSERT(agino != 0); |
1796 | bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; | 1534 | bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; |
1797 | ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); | 1535 | ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)); |
1798 | ASSERT(agi->agi_unlinked[bucket_index]); | 1536 | ASSERT(agi->agi_unlinked[bucket_index]); |
1799 | 1537 | ||
1800 | if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { | 1538 | if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { |
@@ -1959,7 +1697,7 @@ xfs_ifree_cluster( | |||
1959 | * stale first, we will not attempt to lock them in the loop | 1697 | * stale first, we will not attempt to lock them in the loop |
1960 | * below as the XFS_ISTALE flag will be set. | 1698 | * below as the XFS_ISTALE flag will be set. |
1961 | */ | 1699 | */ |
1962 | lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); | 1700 | lip = bp->b_fspriv; |
1963 | while (lip) { | 1701 | while (lip) { |
1964 | if (lip->li_type == XFS_LI_INODE) { | 1702 | if (lip->li_type == XFS_LI_INODE) { |
1965 | iip = (xfs_inode_log_item_t *)lip; | 1703 | iip = (xfs_inode_log_item_t *)lip; |
@@ -2086,7 +1824,6 @@ xfs_ifree( | |||
2086 | xfs_buf_t *ibp; | 1824 | xfs_buf_t *ibp; |
2087 | 1825 | ||
2088 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | 1826 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); |
2089 | ASSERT(ip->i_transp == tp); | ||
2090 | ASSERT(ip->i_d.di_nlink == 0); | 1827 | ASSERT(ip->i_d.di_nlink == 0); |
2091 | ASSERT(ip->i_d.di_nextents == 0); | 1828 | ASSERT(ip->i_d.di_nextents == 0); |
2092 | ASSERT(ip->i_d.di_anextents == 0); | 1829 | ASSERT(ip->i_d.di_anextents == 0); |
@@ -2733,7 +2470,7 @@ cluster_corrupt_out: | |||
2733 | * mark the buffer as an error and call them. Otherwise | 2470 | * mark the buffer as an error and call them. Otherwise |
2734 | * mark it as stale and brelse. | 2471 | * mark it as stale and brelse. |
2735 | */ | 2472 | */ |
2736 | if (XFS_BUF_IODONE_FUNC(bp)) { | 2473 | if (bp->b_iodone) { |
2737 | XFS_BUF_UNDONE(bp); | 2474 | XFS_BUF_UNDONE(bp); |
2738 | XFS_BUF_STALE(bp); | 2475 | XFS_BUF_STALE(bp); |
2739 | XFS_BUF_ERROR(bp,EIO); | 2476 | XFS_BUF_ERROR(bp,EIO); |
@@ -2920,7 +2657,7 @@ xfs_iflush_int( | |||
2920 | */ | 2657 | */ |
2921 | xfs_synchronize_times(ip); | 2658 | xfs_synchronize_times(ip); |
2922 | 2659 | ||
2923 | if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, | 2660 | if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), |
2924 | mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { | 2661 | mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { |
2925 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, | 2662 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
2926 | "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", | 2663 | "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", |
@@ -3073,8 +2810,8 @@ xfs_iflush_int( | |||
3073 | */ | 2810 | */ |
3074 | xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); | 2811 | xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); |
3075 | 2812 | ||
3076 | ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); | 2813 | ASSERT(bp->b_fspriv != NULL); |
3077 | ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); | 2814 | ASSERT(bp->b_iodone != NULL); |
3078 | } else { | 2815 | } else { |
3079 | /* | 2816 | /* |
3080 | * We're flushing an inode which is not in the AIL and has | 2817 | * We're flushing an inode which is not in the AIL and has |