Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--   fs/xfs/xfs_aops.c   332
1 file changed, 48 insertions(+), 284 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 87d2b215cbbd..7575cfc3ad15 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
  * We're now finished for good with this page. Update the page state via the
  * associated buffer_heads, paying attention to the start and end offsets that
  * we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+ * the page at all, as we may be racing with memory reclaim and it can free both
+ * the bufferhead chain and the page as it will see the page as clean and
+ * unused.
  */
 static void
 xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
         int error)
 {
         unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
-        struct buffer_head *head, *bh;
+        struct buffer_head *head, *bh, *next;
         unsigned int off = 0;
+        unsigned int bsize;
 
         ASSERT(bvec->bv_offset < PAGE_SIZE);
         ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
 
         bh = head = page_buffers(bvec->bv_page);
 
+        bsize = bh->b_size;
         do {
+                next = bh->b_this_page;
                 if (off < bvec->bv_offset)
                         goto next_bh;
                 if (off > end)
                         break;
                 bh->b_end_io(bh, !error);
 next_bh:
-                off += bh->b_size;
-        } while ((bh = bh->b_this_page) != head);
+                off += bsize;
+        } while ((bh = next) != head);
 }
 
 /*
@@ -1041,6 +1050,20 @@ xfs_vm_releasepage(
 
         trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 
+        /*
+         * mm accommodates an old ext3 case where clean pages might not have had
+         * the dirty bit cleared. Thus, it can send actual dirty pages to
+         * ->releasepage() via shrink_active_list(). Conversely,
+         * block_invalidatepage() can send pages that are still marked dirty
+         * but otherwise have invalidated buffers.
+         *
+         * We've historically freed buffers on the latter. Instead, quietly
+         * filter out all dirty pages to avoid spurious buffer state warnings.
+         * This can likely be removed once shrink_active_list() is fixed.
+         */
+        if (PageDirty(page))
+                return 0;
+
         xfs_count_page_state(page, &delalloc, &unwritten);
 
         if (WARN_ON_ONCE(delalloc))
@@ -1144,6 +1167,8 @@ __xfs_get_blocks(
         ssize_t size;
         int new = 0;
 
+        BUG_ON(create && !direct);
+
         if (XFS_FORCED_SHUTDOWN(mp))
                 return -EIO;
 
@@ -1151,22 +1176,14 @@ __xfs_get_blocks(
         ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
         size = bh_result->b_size;
 
-        if (!create && direct && offset >= i_size_read(inode))
+        if (!create && offset >= i_size_read(inode))
                 return 0;
 
         /*
          * Direct I/O is usually done on preallocated files, so try getting
-         * a block mapping without an exclusive lock first. For buffered
-         * writes we already have the exclusive iolock anyway, so avoiding
-         * a lock roundtrip here by taking the ilock exclusive from the
-         * beginning is a useful micro optimization.
+         * a block mapping without an exclusive lock first.
          */
-        if (create && !direct) {
-                lockmode = XFS_ILOCK_EXCL;
-                xfs_ilock(ip, lockmode);
-        } else {
-                lockmode = xfs_ilock_data_map_shared(ip);
-        }
+        lockmode = xfs_ilock_data_map_shared(ip);
 
         ASSERT(offset <= mp->m_super->s_maxbytes);
         if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1202,19 @@ __xfs_get_blocks(
              (imap.br_startblock == HOLESTARTBLOCK ||
               imap.br_startblock == DELAYSTARTBLOCK) ||
              (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-                if (direct || xfs_get_extsz_hint(ip)) {
-                        /*
-                         * xfs_iomap_write_direct() expects the shared lock. It
-                         * is unlocked on return.
-                         */
-                        if (lockmode == XFS_ILOCK_EXCL)
-                                xfs_ilock_demote(ip, lockmode);
-
-                        error = xfs_iomap_write_direct(ip, offset, size,
-                                                        &imap, nimaps);
-                        if (error)
-                                return error;
-                        new = 1;
+                /*
+                 * xfs_iomap_write_direct() expects the shared lock. It
+                 * is unlocked on return.
+                 */
+                if (lockmode == XFS_ILOCK_EXCL)
+                        xfs_ilock_demote(ip, lockmode);
 
-                } else {
-                        /*
-                         * Delalloc reservations do not require a transaction,
-                         * we can go on without dropping the lock here. If we
-                         * are allocating a new delalloc block, make sure that
-                         * we set the new flag so that we mark the buffer new so
-                         * that we know that it is newly allocated if the write
-                         * fails.
-                         */
-                        if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-                                new = 1;
-                        error = xfs_iomap_write_delay(ip, offset, size, &imap);
-                        if (error)
-                                goto out_unlock;
+                error = xfs_iomap_write_direct(ip, offset, size,
+                                                &imap, nimaps);
+                if (error)
+                        return error;
+                new = 1;
 
-                        xfs_iunlock(ip, lockmode);
-                }
                 trace_xfs_get_blocks_alloc(ip, offset, size,
                                 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
                                 : XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1235,7 @@ __xfs_get_blocks(
         }
 
         /* trim mapping down to size requested */
-        if (direct || size > (1 << inode->i_blkbits))
-                xfs_map_trim_size(inode, iblock, bh_result,
-                                  &imap, offset, size);
+        xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
 
         /*
          * For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1248,7 @@ __xfs_get_blocks(
         if (ISUNWRITTEN(&imap))
                 set_buffer_unwritten(bh_result);
         /* direct IO needs special help */
-        if (create && direct) {
+        if (create) {
                 if (dax_fault)
                         ASSERT(!ISUNWRITTEN(&imap));
                 else
@@ -1280,14 +1277,7 @@ __xfs_get_blocks(
              (new || ISUNWRITTEN(&imap))))
                 set_buffer_new(bh_result);
 
-        if (imap.br_startblock == DELAYSTARTBLOCK) {
-                BUG_ON(direct);
-                if (create) {
-                        set_buffer_uptodate(bh_result);
-                        set_buffer_mapped(bh_result);
-                        set_buffer_delay(bh_result);
-                }
-        }
+        BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
 
         return 0;
 
@@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
  * whereas if we have flags set we will always be called in task context
  * (i.e. from a workqueue).
  */
-STATIC int
+int
 xfs_end_io_direct_write(
         struct kiocb *iocb,
         loff_t offset,
@@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
         struct kiocb *iocb,
         struct iov_iter *iter)
 {
-        struct inode *inode = iocb->ki_filp->f_mapping->host;
-        dio_iodone_t *endio = NULL;
-        int flags = 0;
-        struct block_device *bdev;
-
-        if (iov_iter_rw(iter) == WRITE) {
-                endio = xfs_end_io_direct_write;
-                flags = DIO_ASYNC_EXTEND;
-        }
-
-        if (IS_DAX(inode)) {
-                return dax_do_io(iocb, inode, iter,
-                                xfs_get_blocks_direct, endio, 0);
-        }
-
-        bdev = xfs_find_bdev_for_inode(inode);
-        return __blockdev_direct_IO(iocb, inode, bdev, iter,
-                        xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
-        struct inode *inode,
-        loff_t start,
-        loff_t end)
-{
-        struct xfs_inode *ip = XFS_I(inode);
-        xfs_fileoff_t start_fsb;
-        xfs_fileoff_t end_fsb;
-        int error;
-
-        start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-        end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-        if (end_fsb <= start_fsb)
-                return;
-
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-                                                end_fsb - start_fsb);
-        if (error) {
-                /* something screwed, just bail */
-                if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                        xfs_alert(ip->i_mount,
-                "xfs_vm_write_failed: unable to clean up ino %lld",
-                                        ip->i_ino);
-                }
-        }
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
-        struct inode *inode,
-        struct page *page,
-        loff_t pos,
-        unsigned len)
-{
-        loff_t block_offset;
-        loff_t block_start;
-        loff_t block_end;
-        loff_t from = pos & (PAGE_SIZE - 1);
-        loff_t to = from + len;
-        struct buffer_head *bh, *head;
-        struct xfs_mount *mp = XFS_I(inode)->i_mount;
-
-        /*
-         * The request pos offset might be 32 or 64 bit, this is all fine
-         * on 64-bit platform. However, for 64-bit pos request on 32-bit
-         * platform, the high 32-bit will be masked off if we evaluate the
-         * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-         * 0xfffff000 as an unsigned long, hence the result is incorrect
-         * which could cause the following ASSERT failed in most cases.
-         * In order to avoid this, we can evaluate the block_offset of the
-         * start of the page by using shifts rather than masks the mismatch
-         * problem.
-         */
-        block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-
-        ASSERT(block_offset + from == pos);
-
-        head = page_buffers(page);
-        block_start = 0;
-        for (bh = head; bh != head || !block_start;
-             bh = bh->b_this_page, block_start = block_end,
-             block_offset += bh->b_size) {
-                block_end = block_start + bh->b_size;
-
-                /* skip buffers before the write */
-                if (block_end <= from)
-                        continue;
-
-                /* if the buffer is after the write, we're done */
-                if (block_start >= to)
-                        break;
-
-                /*
-                 * Process delalloc and unwritten buffers beyond EOF. We can
-                 * encounter unwritten buffers in the event that a file has
-                 * post-EOF unwritten extents and an extending write happens to
-                 * fail (e.g., an unaligned write that also involves a delalloc
-                 * to the same page).
-                 */
-                if (!buffer_delay(bh) && !buffer_unwritten(bh))
-                        continue;
-
-                if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-                    block_offset < i_size_read(inode))
-                        continue;
-
-                if (buffer_delay(bh))
-                        xfs_vm_kill_delalloc_range(inode, block_offset,
-                                                   block_offset + bh->b_size);
-
-                /*
-                 * This buffer does not contain data anymore. make sure anyone
-                 * who finds it knows that for certain.
-                 */
-                clear_buffer_delay(bh);
-                clear_buffer_uptodate(bh);
-                clear_buffer_mapped(bh);
-                clear_buffer_new(bh);
-                clear_buffer_dirty(bh);
-                clear_buffer_unwritten(bh);
-        }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
-        struct file *file,
-        struct address_space *mapping,
-        loff_t pos,
-        unsigned len,
-        unsigned flags,
-        struct page **pagep,
-        void **fsdata)
-{
-        pgoff_t index = pos >> PAGE_SHIFT;
-        struct page *page;
-        int status;
-        struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
-
-        ASSERT(len <= PAGE_SIZE);
-
-        page = grab_cache_page_write_begin(mapping, index, flags);
-        if (!page)
-                return -ENOMEM;
-
-        status = __block_write_begin(page, pos, len, xfs_get_blocks);
-        if (xfs_mp_fail_writes(mp))
-                status = -EIO;
-        if (unlikely(status)) {
-                struct inode *inode = mapping->host;
-                size_t isize = i_size_read(inode);
-
-                xfs_vm_write_failed(inode, page, pos, len);
-                unlock_page(page);
-
-                /*
-                 * If the write is beyond EOF, we only want to kill blocks
-                 * allocated in this write, not blocks that were previously
-                 * written successfully.
-                 */
-                if (xfs_mp_fail_writes(mp))
-                        isize = 0;
-                if (pos + len > isize) {
-                        ssize_t start = max_t(ssize_t, pos, isize);
-
-                        truncate_pagecache_range(inode, start, pos + len);
-                }
-
-                put_page(page);
-                page = NULL;
-        }
-
-        *pagep = page;
-        return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
-        struct file *file,
-        struct address_space *mapping,
-        loff_t pos,
-        unsigned len,
-        unsigned copied,
-        struct page *page,
-        void *fsdata)
-{
-        int ret;
-
-        ASSERT(len <= PAGE_SIZE);
-
-        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-        if (unlikely(ret < len)) {
-                struct inode *inode = mapping->host;
-                size_t isize = i_size_read(inode);
-                loff_t to = pos + len;
-
-                if (to > isize) {
-                        /* only kill blocks in this write beyond EOF */
-                        if (pos > isize)
-                                isize = pos;
-                        xfs_vm_kill_delalloc_range(inode, isize, to);
-                        truncate_pagecache_range(inode, isize, to);
-                }
-        }
-        return ret;
-}
+        /*
+         * We just need the method present so that open/fcntl allow direct I/O.
+         */
+        return -EINVAL;
+}
 
 STATIC sector_t
@@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
         .set_page_dirty = xfs_vm_set_page_dirty,
         .releasepage = xfs_vm_releasepage,
         .invalidatepage = xfs_vm_invalidatepage,
-        .write_begin = xfs_vm_write_begin,
-        .write_end = xfs_vm_write_end,
         .bmap = xfs_vm_bmap,
         .direct_IO = xfs_vm_direct_IO,
         .migratepage = buffer_migrate_page,
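
For reference, the loop change in the xfs_finish_page_writeback() hunk above boils down to one pattern: read bh->b_this_page (and the block size) into locals before invoking a completion callback that may free the object you are iterating over, and make every decision after the callback from those cached values only. Below is a minimal, self-contained userspace sketch of that pattern, not kernel code: the struct demo_buf type and the demo_end_io()/demo_finish_range() names are invented for illustration, and the demo frees each buffer from its own completion handler rather than modelling the page-level reclaim race described in the commit comment.

#include <stdio.h>
#include <stdlib.h>

struct demo_buf {
        struct demo_buf *b_this_page;   /* circular list, like buffer_heads on a page */
        unsigned int b_size;
        void (*b_end_io)(struct demo_buf *bh, int uptodate);
};

/* Completion handler that releases the buffer; the buffer must not be
 * touched after this returns -- the analogue of the "landmine" above. */
static void demo_end_io(struct demo_buf *bh, int uptodate)
{
        printf("completed buffer %p (uptodate=%d)\n", (void *)bh, uptodate);
        free(bh);
}

/* Complete every buffer covering the byte range [bv_offset, end].  'next'
 * and 'bsize' are read before b_end_io() runs, so the loop never
 * dereferences a buffer that the callback may already have freed. */
static void demo_finish_range(struct demo_buf *head, unsigned int bv_offset,
                              unsigned int end, int error)
{
        struct demo_buf *bh = head, *next;
        unsigned int off = 0;
        unsigned int bsize = bh->b_size;        /* cached once, up front */

        do {
                next = bh->b_this_page;         /* cached before the callback */
                if (off < bv_offset)
                        goto next_bh;
                if (off > end)
                        break;
                bh->b_end_io(bh, !error);       /* bh may be gone after this */
next_bh:
                off += bsize;
        } while ((bh = next) != head);
}

int main(void)
{
        enum { NBUFS = 4, BSIZE = 1024 };
        struct demo_buf *bufs[NBUFS];
        int i;

        for (i = 0; i < NBUFS; i++) {
                bufs[i] = malloc(sizeof(*bufs[i]));
                if (!bufs[i])
                        return 1;
                bufs[i]->b_size = BSIZE;
                bufs[i]->b_end_io = demo_end_io;
        }
        for (i = 0; i < NBUFS; i++)
                bufs[i]->b_this_page = bufs[(i + 1) % NBUFS];

        /* Complete buffers 1..3; buffer 0 sits before the range, stays
         * allocated, and keeps the 'head' pointer valid for the loop exit. */
        demo_finish_range(bufs[0], BSIZE, NBUFS * BSIZE - 1, 0);
        free(bufs[0]);
        return 0;
}

Writing the loop with "bh = bh->b_this_page" and "off += bh->b_size" instead (as the pre-patch code did) would read from freed memory once a completion handler has released the buffer, which is exactly what the cached next/bsize variables in the hunk avoid.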