Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--  fs/xfs/xfs_aops.c  332
1 file changed, 48 insertions(+), 284 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 87d2b215cbbd..7575cfc3ad15 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
  * We're now finished for good with this page. Update the page state via the
  * associated buffer_heads, paying attention to the start and end offsets that
  * we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+ * the page at all, as we may be racing with memory reclaim and it can free both
+ * the bufferhead chain and the page as it will see the page as clean and
+ * unused.
  */
 static void
 xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
         int error)
 {
         unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
-        struct buffer_head *head, *bh;
+        struct buffer_head *head, *bh, *next;
         unsigned int off = 0;
+        unsigned int bsize;
 
         ASSERT(bvec->bv_offset < PAGE_SIZE);
         ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
 
         bh = head = page_buffers(bvec->bv_page);
 
+        bsize = bh->b_size;
         do {
+                next = bh->b_this_page;
                 if (off < bvec->bv_offset)
                         goto next_bh;
                 if (off > end)
                         break;
                 bh->b_end_io(bh, !error);
 next_bh:
-                off += bh->b_size;
-        } while ((bh = bh->b_this_page) != head);
+                off += bsize;
+        } while ((bh = next) != head);
 }
 
 /*
@@ -1041,6 +1050,20 @@ xfs_vm_releasepage(
 
         trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 
+        /*
+         * mm accommodates an old ext3 case where clean pages might not have had
+         * the dirty bit cleared. Thus, it can send actual dirty pages to
+         * ->releasepage() via shrink_active_list(). Conversely,
+         * block_invalidatepage() can send pages that are still marked dirty
+         * but otherwise have invalidated buffers.
+         *
+         * We've historically freed buffers on the latter. Instead, quietly
+         * filter out all dirty pages to avoid spurious buffer state warnings.
+         * This can likely be removed once shrink_active_list() is fixed.
+         */
+        if (PageDirty(page))
+                return 0;
+
         xfs_count_page_state(page, &delalloc, &unwritten);
 
         if (WARN_ON_ONCE(delalloc))
@@ -1144,6 +1167,8 @@ __xfs_get_blocks(
         ssize_t size;
         int new = 0;
 
+        BUG_ON(create && !direct);
+
         if (XFS_FORCED_SHUTDOWN(mp))
                 return -EIO;
 
@@ -1151,22 +1176,14 @@ __xfs_get_blocks(
         ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
         size = bh_result->b_size;
 
-        if (!create && direct && offset >= i_size_read(inode))
+        if (!create && offset >= i_size_read(inode))
                 return 0;
 
         /*
          * Direct I/O is usually done on preallocated files, so try getting
-         * a block mapping without an exclusive lock first. For buffered
-         * writes we already have the exclusive iolock anyway, so avoiding
-         * a lock roundtrip here by taking the ilock exclusive from the
-         * beginning is a useful micro optimization.
+         * a block mapping without an exclusive lock first.
          */
-        if (create && !direct) {
-                lockmode = XFS_ILOCK_EXCL;
-                xfs_ilock(ip, lockmode);
-        } else {
-                lockmode = xfs_ilock_data_map_shared(ip);
-        }
+        lockmode = xfs_ilock_data_map_shared(ip);
 
         ASSERT(offset <= mp->m_super->s_maxbytes);
         if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1202,19 @@ __xfs_get_blocks(
             (imap.br_startblock == HOLESTARTBLOCK ||
              imap.br_startblock == DELAYSTARTBLOCK) ||
              (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-                if (direct || xfs_get_extsz_hint(ip)) {
-                        /*
-                         * xfs_iomap_write_direct() expects the shared lock. It
-                         * is unlocked on return.
-                         */
-                        if (lockmode == XFS_ILOCK_EXCL)
-                                xfs_ilock_demote(ip, lockmode);
-
-                        error = xfs_iomap_write_direct(ip, offset, size,
-                                                       &imap, nimaps);
-                        if (error)
-                                return error;
-                        new = 1;
+                /*
+                 * xfs_iomap_write_direct() expects the shared lock. It
+                 * is unlocked on return.
+                 */
+                if (lockmode == XFS_ILOCK_EXCL)
+                        xfs_ilock_demote(ip, lockmode);
 
-                } else {
-                        /*
-                         * Delalloc reservations do not require a transaction,
-                         * we can go on without dropping the lock here. If we
-                         * are allocating a new delalloc block, make sure that
-                         * we set the new flag so that we mark the buffer new so
-                         * that we know that it is newly allocated if the write
-                         * fails.
-                         */
-                        if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-                                new = 1;
-                        error = xfs_iomap_write_delay(ip, offset, size, &imap);
-                        if (error)
-                                goto out_unlock;
+                error = xfs_iomap_write_direct(ip, offset, size,
+                                               &imap, nimaps);
+                if (error)
+                        return error;
+                new = 1;
 
-                        xfs_iunlock(ip, lockmode);
-                }
                 trace_xfs_get_blocks_alloc(ip, offset, size,
                                 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
                                 : XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1235,7 @@ __xfs_get_blocks(
         }
 
         /* trim mapping down to size requested */
-        if (direct || size > (1 << inode->i_blkbits))
-                xfs_map_trim_size(inode, iblock, bh_result,
-                                  &imap, offset, size);
+        xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
 
         /*
          * For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1248,7 @@ __xfs_get_blocks(
         if (ISUNWRITTEN(&imap))
                 set_buffer_unwritten(bh_result);
         /* direct IO needs special help */
-        if (create && direct) {
+        if (create) {
                 if (dax_fault)
                         ASSERT(!ISUNWRITTEN(&imap));
                 else
@@ -1280,14 +1277,7 @@ __xfs_get_blocks(
              (new || ISUNWRITTEN(&imap))))
                 set_buffer_new(bh_result);
 
-        if (imap.br_startblock == DELAYSTARTBLOCK) {
-                BUG_ON(direct);
-                if (create) {
-                        set_buffer_uptodate(bh_result);
-                        set_buffer_mapped(bh_result);
-                        set_buffer_delay(bh_result);
-                }
-        }
+        BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
 
         return 0;
 
@@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
  * whereas if we have flags set we will always be called in task context
  * (i.e. from a workqueue).
  */
-STATIC int
+int
 xfs_end_io_direct_write(
         struct kiocb *iocb,
         loff_t offset,
@@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
         struct kiocb *iocb,
         struct iov_iter *iter)
 {
-        struct inode *inode = iocb->ki_filp->f_mapping->host;
-        dio_iodone_t *endio = NULL;
-        int flags = 0;
-        struct block_device *bdev;
-
-        if (iov_iter_rw(iter) == WRITE) {
-                endio = xfs_end_io_direct_write;
-                flags = DIO_ASYNC_EXTEND;
-        }
-
-        if (IS_DAX(inode)) {
-                return dax_do_io(iocb, inode, iter,
-                                 xfs_get_blocks_direct, endio, 0);
-        }
-
-        bdev = xfs_find_bdev_for_inode(inode);
-        return __blockdev_direct_IO(iocb, inode, bdev, iter,
-                        xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
-        struct inode *inode,
-        loff_t start,
-        loff_t end)
-{
-        struct xfs_inode *ip = XFS_I(inode);
-        xfs_fileoff_t start_fsb;
-        xfs_fileoff_t end_fsb;
-        int error;
-
-        start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-        end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-        if (end_fsb <= start_fsb)
-                return;
-
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-                                              end_fsb - start_fsb);
-        if (error) {
-                /* something screwed, just bail */
-                if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                        xfs_alert(ip->i_mount,
-                "xfs_vm_write_failed: unable to clean up ino %lld",
-                                        ip->i_ino);
-                }
-        }
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
-        struct inode *inode,
-        struct page *page,
-        loff_t pos,
-        unsigned len)
-{
-        loff_t block_offset;
-        loff_t block_start;
-        loff_t block_end;
-        loff_t from = pos & (PAGE_SIZE - 1);
-        loff_t to = from + len;
-        struct buffer_head *bh, *head;
-        struct xfs_mount *mp = XFS_I(inode)->i_mount;
-
         /*
-         * The request pos offset might be 32 or 64 bit, this is all fine
-         * on 64-bit platform. However, for 64-bit pos request on 32-bit
-         * platform, the high 32-bit will be masked off if we evaluate the
-         * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-         * 0xfffff000 as an unsigned long, hence the result is incorrect
-         * which could cause the following ASSERT failed in most cases.
-         * In order to avoid this, we can evaluate the block_offset of the
-         * start of the page by using shifts rather than masks the mismatch
-         * problem.
+         * We just need the method present so that open/fcntl allow direct I/O.
          */
-        block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-
-        ASSERT(block_offset + from == pos);
-
-        head = page_buffers(page);
-        block_start = 0;
-        for (bh = head; bh != head || !block_start;
-             bh = bh->b_this_page, block_start = block_end,
-             block_offset += bh->b_size) {
-                block_end = block_start + bh->b_size;
-
-                /* skip buffers before the write */
-                if (block_end <= from)
-                        continue;
-
-                /* if the buffer is after the write, we're done */
-                if (block_start >= to)
-                        break;
-
-                /*
-                 * Process delalloc and unwritten buffers beyond EOF. We can
-                 * encounter unwritten buffers in the event that a file has
-                 * post-EOF unwritten extents and an extending write happens to
-                 * fail (e.g., an unaligned write that also involves a delalloc
-                 * to the same page).
-                 */
-                if (!buffer_delay(bh) && !buffer_unwritten(bh))
-                        continue;
-
-                if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-                    block_offset < i_size_read(inode))
-                        continue;
-
-                if (buffer_delay(bh))
-                        xfs_vm_kill_delalloc_range(inode, block_offset,
-                                                   block_offset + bh->b_size);
-
-                /*
-                 * This buffer does not contain data anymore. make sure anyone
-                 * who finds it knows that for certain.
-                 */
-                clear_buffer_delay(bh);
-                clear_buffer_uptodate(bh);
-                clear_buffer_mapped(bh);
-                clear_buffer_new(bh);
-                clear_buffer_dirty(bh);
-                clear_buffer_unwritten(bh);
-        }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
-        struct file *file,
-        struct address_space *mapping,
-        loff_t pos,
-        unsigned len,
-        unsigned flags,
-        struct page **pagep,
-        void **fsdata)
-{
-        pgoff_t index = pos >> PAGE_SHIFT;
-        struct page *page;
-        int status;
-        struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
-
-        ASSERT(len <= PAGE_SIZE);
-
-        page = grab_cache_page_write_begin(mapping, index, flags);
-        if (!page)
-                return -ENOMEM;
-
-        status = __block_write_begin(page, pos, len, xfs_get_blocks);
-        if (xfs_mp_fail_writes(mp))
-                status = -EIO;
-        if (unlikely(status)) {
-                struct inode *inode = mapping->host;
-                size_t isize = i_size_read(inode);
-
-                xfs_vm_write_failed(inode, page, pos, len);
-                unlock_page(page);
-
-                /*
-                 * If the write is beyond EOF, we only want to kill blocks
-                 * allocated in this write, not blocks that were previously
-                 * written successfully.
-                 */
-                if (xfs_mp_fail_writes(mp))
-                        isize = 0;
-                if (pos + len > isize) {
-                        ssize_t start = max_t(ssize_t, pos, isize);
-
-                        truncate_pagecache_range(inode, start, pos + len);
-                }
-
-                put_page(page);
-                page = NULL;
-        }
-
-        *pagep = page;
-        return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
-        struct file *file,
-        struct address_space *mapping,
-        loff_t pos,
-        unsigned len,
-        unsigned copied,
-        struct page *page,
-        void *fsdata)
-{
-        int ret;
-
-        ASSERT(len <= PAGE_SIZE);
-
-        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-        if (unlikely(ret < len)) {
-                struct inode *inode = mapping->host;
-                size_t isize = i_size_read(inode);
-                loff_t to = pos + len;
-
-                if (to > isize) {
-                        /* only kill blocks in this write beyond EOF */
-                        if (pos > isize)
-                                isize = pos;
-                        xfs_vm_kill_delalloc_range(inode, isize, to);
-                        truncate_pagecache_range(inode, isize, to);
-                }
-        }
-        return ret;
 }
 
 STATIC sector_t
@@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
         .set_page_dirty = xfs_vm_set_page_dirty,
         .releasepage = xfs_vm_releasepage,
         .invalidatepage = xfs_vm_invalidatepage,
-        .write_begin = xfs_vm_write_begin,
-        .write_end = xfs_vm_write_end,
         .bmap = xfs_vm_bmap,
         .direct_IO = xfs_vm_direct_IO,
         .migratepage = buffer_migrate_page,
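
Note on the first two hunks: the rewritten xfs_finish_page_writeback() caches the next pointer and the block size before calling bh->b_end_io(), because the last completion can trigger end_page_writeback() and reclaim may then free both the page and its buffer_head chain. The following is a small standalone C sketch of that pattern, not part of the patch; the types and names here (struct blk, end_io_free, complete_ring) are illustrative only.

/*
 * Sketch: walking a circular chain whose completion callback may free
 * the element (and, on the last element, the whole chain). Everything
 * still needed after the callback is captured before it runs.
 */
#include <stdlib.h>

struct blk {
        struct blk *next;                 /* circular, like bh->b_this_page */
        size_t size;
        void (*end_io)(struct blk *);     /* may free the element passed in */
};

static void end_io_free(struct blk *b)
{
        free(b);        /* stands in for b_end_io() letting reclaim tear things down */
}

static void complete_ring(struct blk *head)
{
        struct blk *b = head, *next;
        size_t off = 0;
        size_t bsize = head->size;        /* cached before any callback runs */

        do {
                next = b->next;           /* cached: b may be gone after end_io */
                b->end_io(b);
                off += bsize;             /* cached size, not b->size */
        } while ((b = next) != head);     /* pointer-value compare only */
        (void)off;                        /* the kernel code checks off against the bvec range */
}

int main(void)
{
        struct blk *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

        a->next = b;                      /* two-element ring */
        b->next = a;
        a->size = b->size = 4096;
        a->end_io = b->end_io = end_io_free;
        complete_ring(a);
        return 0;
}

The only rule the walker relies on is that nothing belonging to an element is dereferenced once its callback has run; the final comparison against head uses the pointer value alone, which mirrors what the patched loop does.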