author     Dave Chinner <dchinner@redhat.com>        2012-04-27 05:45:21 -0400
committer  Ben Myers <bpm@sgi.com>                   2012-05-14 17:20:36 -0400
commit     d3bc815afb549eecb3679a4b2f0df216e34df998 (patch)
tree       afff60431458ae8100fe06f64c0930da1e20d19e /fs/xfs/xfs_aops.c
parent     6ffc4db5de61d36e969a26bc94509c59246c81f8 (diff)
xfs: punch new delalloc blocks out of failed writes inside EOF.
When a partial write inside EOF fails, it can leave delayed allocation blocks lying around because they don't get punched back out. This leads to assert failures like:

XFS: Assertion failed: XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0, file: fs/xfs/xfs_super.c, line: 847

when evicting inodes from the cache. This can be trivially triggered by xfstests 083, which takes between 5 and 15 executions on a 512 byte block size filesystem to trip over this. Debugging shows a failed write due to ENOSPC calling xfs_vm_write_failed such as:

[ 5012.329024] ino 0xa0026: vwf to 0x17000, sze 0x1c85ae

and no action is taken on it. This leaves behind a delayed allocation extent that has no page covering it and no data in it:

[ 5015.867162] ino 0xa0026: blks: 0x83 delay blocks 0x1, size 0x2538c0
[ 5015.868293] ext 0: off 0x4a, fsb 0x50306, len 0x1
[ 5015.869095] ext 1: off 0x4b, fsb 0x7899, len 0x6b
[ 5015.869900] ext 2: off 0xb6, fsb 0xffffffffe0008, len 0x1
                              ^^^^^^^^^^^^^^^
[ 5015.871027] ext 3: off 0x36e, fsb 0x7a27, len 0xd
[ 5015.872206] ext 4: off 0x4cf, fsb 0x7a1d, len 0xa

So the delayed allocation extent is one block long at offset 0x16c00. Tracing shows that a bigger write:

xfs_file_buffered_write: size 0x1c85ae offset 0x959d count 0x1ca3f ioflags

allocates the block, and then fails with ENOSPC trying to allocate the last block on the page, leading to a failed write with stale delalloc blocks on it.

Because we've had an ENOSPC when trying to allocate 0x16e00, it means that we are never going to call ->write_end on the page, and so the allocated new buffer will not get marked dirty or have the buffer_new state cleared. In other words, what the above write is supposed to end up with is this mapping for the page:

+------+------+------+------+------+------+------+------+
  UMA    UMA    UMA    UMA    UMA    UMA    UND    FAIL

where:  U = uptodate
        M = mapped
        N = new
        A = allocated
        D = delalloc
        FAIL = block we ENOSPC'd on

and the key point being the buffer_new() state for the newly allocated delayed allocation block.

Except it doesn't - we're not marking buffers new correctly. That buffer_new() problem goes back to the xfs_iomap removal days, where xfs_iomap() used to return a "new" status for any map with newly allocated blocks, so that __xfs_get_blocks() could call set_buffer_new() on it. We still have the "new" variable and the check for it in the set_buffer_new() logic - except we never set it now! Hence that newly allocated delalloc block doesn't have the new flag set on it, so when the write fails we cannot tell which blocks we are supposed to punch out.

Why do we need the buffer_new flag? Well, that's because we can have this case:

+------+------+------+------+------+------+------+------+
  UMD    UMD    UMD    UMD    UMD    UMD    UND    FAIL

where all the UMD buffers contain valid data from a previously successful write() system call. We only want to punch the UND buffer, because that's the only one that we added in this write and it was only this write that failed. That implies that even the old buffer_new() logic was wrong - it would result in all those UMD buffers on the page having set_buffer_new() called on them even though they aren't new. Hence we should only be calling set_buffer_new() for delalloc buffers that were allocated (i.e. were a hole before xfs_iomap_write_delay() was called).

So, fix this set_buffer_new logic according to how we need it to work for handling failed writes correctly. Also, restore the new buffer logic for blocks allocated via xfs_iomap_write_direct(), because it should still set the buffer_new flag appropriately for newly allocated blocks, too.

Now that we have buffer_new() being set appropriately in __xfs_get_blocks(), we can detect the exact delalloc ranges that we allocated in a failed write, and hence can now do a walk of the buffers on a page to find them.

Except it's not that easy. When block_write_begin() fails, it unlocks and releases the page that we just had an error on, so we can't use that page to handle errors anymore. We have to get access to the page while it is still locked to walk the buffers. Hence we have to open code block_write_begin() in xfs_vm_write_begin() to be able to insert xfs_vm_write_failed() in the right place. With that, we can pass the page and write range to xfs_vm_write_failed() and walk the buffers on the page, looking for delalloc buffers that are either new or beyond EOF and punch them out. Handling buffers beyond EOF ensures we still cover the existing case that xfs_vm_write_failed() handles.

Of special note is the truncate_pagecache() handling - that should only be done for pages outside EOF, because pages within EOF can still contain valid, dirty data and we must not punch them out of the cache.

That just leaves the xfs_vm_write_end() failure handling. The only failure case here is that we didn't copy the entire range, and generic_write_end() handles that by zeroing the region of the page that wasn't copied. We don't have to punch out blocks within the file because they are guaranteed to contain zeros, so we only have to handle the existing "beyond EOF" case and don't need access to the buffers on the page. Hence it remains largely unchanged.

Note that xfs_getbmap() can still trip over delalloc blocks beyond EOF that are left there by speculative delayed allocation. Hence this bug fix does not solve all known issues with bmap vs delalloc, but it does fix all the known accidental occurrences of the problem.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
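To make the punch criteria above concrete, here is a minimal userspace sketch of the per-buffer decision the patch performs in xfs_vm_write_failed(). Note that struct demo_buffer, should_punch() and its fields are hypothetical stand-ins for the real buffer_head flag tests (buffer_delay(), buffer_new()), not kernel code:

#include <stdbool.h>

/* hypothetical stand-in for the buffer_head state bits this fix tests */
struct demo_buffer {
	bool delay;	/* buffer_delay(): backed by a delalloc reservation */
	bool is_new;	/* buffer_new(): block was allocated by this write */
};

/*
 * Punch a delalloc buffer only if the failed write created it (new), or
 * if it lies beyond EOF; pre-existing delalloc within EOF belongs to an
 * earlier successful write() and must be left alone.
 */
static bool should_punch(const struct demo_buffer *bh,
			 long long block_offset, long long isize)
{
	if (!bh->delay)
		return false;		/* not delalloc: nothing to punch */
	if (bh->is_new)
		return true;		/* the UND buffer this write added */
	return block_offset >= isize;	/* stale delalloc beyond EOF */
}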
Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--  fs/xfs/xfs_aops.c  173
1 file changed, 127 insertions(+), 46 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 139e495b0cd3..392833ccc9c6 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1187,11 +1187,18 @@ __xfs_get_blocks(
 					&imap, nimaps);
 		if (error)
 			return -error;
+		new = 1;
 	} else {
 		/*
 		 * Delalloc reservations do not require a transaction,
-		 * we can go on without dropping the lock here.
+		 * we can go on without dropping the lock here. If we
+		 * are allocating a new delalloc block, make sure that
+		 * we set the new flag so that we mark the buffer new so
+		 * that we know that it is newly allocated if the write
+		 * fails.
 		 */
+		if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+			new = 1;
 		error = xfs_iomap_write_delay(ip, offset, size, &imap);
 		if (error)
 			goto out_unlock;
@@ -1408,52 +1415,91 @@ out_destroy_ioend:
 	return ret;
 }
 
+/*
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have made it to disk yet
+ * as the page is still locked at this point.
+ */
+STATIC void
+xfs_vm_kill_delalloc_range(
+	struct inode		*inode,
+	loff_t			start,
+	loff_t			end)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		end_fsb;
+	int			error;
+
+	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
+	if (end_fsb <= start_fsb)
+		return;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+						end_fsb - start_fsb);
+	if (error) {
+		/* something screwed, just bail */
+		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+			xfs_alert(ip->i_mount,
+		"xfs_vm_write_failed: unable to clean up ino %lld",
+					ip->i_ino);
+		}
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
 STATIC void
 xfs_vm_write_failed(
-	struct address_space	*mapping,
-	loff_t			to)
+	struct inode		*inode,
+	struct page		*page,
+	loff_t			pos,
+	unsigned		len)
 {
-	struct inode		*inode = mapping->host;
+	loff_t			block_offset = pos & PAGE_MASK;
+	loff_t			block_start;
+	loff_t			block_end;
+	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
+	loff_t			to = from + len;
+	struct buffer_head	*bh, *head;
 
-	if (to > inode->i_size) {
-		/*
-		 * Punch out the delalloc blocks we have already allocated.
-		 *
-		 * Don't bother with xfs_setattr given that nothing can have
-		 * made it to disk yet as the page is still locked at this
-		 * point.
-		 */
-		struct xfs_inode *ip = XFS_I(inode);
-		xfs_fileoff_t	start_fsb;
-		xfs_fileoff_t	end_fsb;
-		int		error;
+	ASSERT(block_offset + from == pos);
 
-		truncate_pagecache(inode, to, inode->i_size);
+	head = page_buffers(page);
+	block_start = 0;
+	for (bh = head; bh != head || !block_start;
+	     bh = bh->b_this_page, block_start = block_end,
+	     block_offset += bh->b_size) {
+		block_end = block_start + bh->b_size;
 
-		/*
-		 * Check if there are any blocks that are outside of i_size
-		 * that need to be trimmed back.
-		 */
-		start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size);
-		end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
-		if (end_fsb <= start_fsb)
-			return;
-
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-						end_fsb - start_fsb);
-		if (error) {
-			/* something screwed, just bail */
-			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-				xfs_alert(ip->i_mount,
-			"xfs_vm_write_failed: unable to clean up ino %lld",
-						ip->i_ino);
-			}
-		}
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
+		/* skip buffers before the write */
+		if (block_end <= from)
+			continue;
+
+		/* if the buffer is after the write, we're done */
+		if (block_start >= to)
+			break;
+
+		if (!buffer_delay(bh))
+			continue;
+
+		if (!buffer_new(bh) && block_offset < i_size_read(inode))
+			continue;
+
+		xfs_vm_kill_delalloc_range(inode, block_offset,
+					   block_offset + bh->b_size);
 	}
+
 }
 
+/*
+ * This used to call block_write_begin(), but it unlocks and releases the page
+ * on error, and we need that page to be able to punch stale delalloc blocks out
+ * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
+ * the appropriate point.
+ */
 STATIC int
 xfs_vm_write_begin(
 	struct file		*file,
@@ -1464,15 +1510,40 @@ xfs_vm_write_begin(
 	struct page		**pagep,
 	void			**fsdata)
 {
-	int			ret;
+	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
+	struct page		*page;
+	int			status;
 
-	ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
-				pagep, xfs_get_blocks);
-	if (unlikely(ret))
-		xfs_vm_write_failed(mapping, pos + len);
-	return ret;
+	ASSERT(len <= PAGE_CACHE_SIZE);
+
+	page = grab_cache_page_write_begin(mapping, index,
+					   flags | AOP_FLAG_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	status = __block_write_begin(page, pos, len, xfs_get_blocks);
+	if (unlikely(status)) {
+		struct inode	*inode = mapping->host;
+
+		xfs_vm_write_failed(inode, page, pos, len);
+		unlock_page(page);
+
+		if (pos + len > i_size_read(inode))
+			truncate_pagecache(inode, pos + len, i_size_read(inode));
+
+		page_cache_release(page);
+		page = NULL;
+	}
+
+	*pagep = page;
+	return status;
 }
 
+/*
+ * On failure, we only need to kill delalloc blocks beyond EOF because they
+ * will never be written. For blocks within EOF, generic_write_end() zeros them
+ * so they are safe to leave alone and be written with all the other valid data.
+ */
 STATIC int
 xfs_vm_write_end(
 	struct file		*file,
@@ -1485,9 +1556,19 @@ xfs_vm_write_end(
 {
 	int			ret;
 
+	ASSERT(len <= PAGE_CACHE_SIZE);
+
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-	if (unlikely(ret < len))
-		xfs_vm_write_failed(mapping, pos + len);
+	if (unlikely(ret < len)) {
+		struct inode	*inode = mapping->host;
+		size_t		isize = i_size_read(inode);
+		loff_t		to = pos + len;
+
+		if (to > isize) {
+			truncate_pagecache(inode, to, isize);
+			xfs_vm_kill_delalloc_range(inode, isize, to);
+		}
+	}
 	return ret;
 }
 
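Aside: the end_fsb <= start_fsb guard in xfs_vm_kill_delalloc_range() above depends on byte-to-filesystem-block rounding. Below is a standalone sketch under stated assumptions - b_to_fsb() is merely modelled on the kernel's XFS_B_TO_FSB() round-up conversion, and a 512 byte block size is assumed to match the xfstests 083 reproducer; none of these names are the real macros:

#include <stdio.h>
#include <stdint.h>

#define BLOCKLOG  9				/* 512 byte blocks (assumed) */
#define BLOCKMASK ((1ULL << BLOCKLOG) - 1)

/* round a byte offset up to whole filesystem blocks */
static uint64_t b_to_fsb(uint64_t bytes)
{
	return (bytes + BLOCKMASK) >> BLOCKLOG;
}

int main(void)
{
	/* the stray delalloc block from the trace in the commit message:
	 * one block at byte offset 0x16c00, i.e. fsb 0xb6 */
	uint64_t start_fsb = b_to_fsb(0x16c00);
	uint64_t end_fsb = b_to_fsb(0x16c00 + 512);

	/* mirrors the guard: a range that rounds to zero whole blocks
	 * leaves nothing to punch */
	if (end_fsb > start_fsb)
		printf("punch fsb range [0x%llx, 0x%llx)\n",
		       (unsigned long long)start_fsb,
		       (unsigned long long)end_fsb);
	return 0;
}

Run, this prints "punch fsb range [0xb6, 0xb7)", matching ext 2 (off 0xb6) in the debug output quoted in the commit message.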