Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 1751 |
1 file changed, 785 insertions, 966 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b89ecbd..0188e65e1f58 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, | |||
132 | new_size); | 132 | new_size); |
133 | } | 133 | } |
134 | 134 | ||
135 | static void ext4_invalidatepage(struct page *page, unsigned long offset); | 135 | static void ext4_invalidatepage(struct page *page, unsigned int offset, |
136 | unsigned int length); | ||
136 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); | 137 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); |
137 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); | 138 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); |
138 | static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | 139 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, |
139 | struct inode *inode, struct page *page, loff_t from, | 140 | int pextents); |
140 | loff_t length, int flags); | ||
141 | 141 | ||
142 | /* | 142 | /* |
143 | * Test whether an inode is a fast symlink. | 143 | * Test whether an inode is a fast symlink. |
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode) | |||
215 | filemap_write_and_wait(&inode->i_data); | 215 | filemap_write_and_wait(&inode->i_data); |
216 | } | 216 | } |
217 | truncate_inode_pages(&inode->i_data, 0); | 217 | truncate_inode_pages(&inode->i_data, 0); |
218 | ext4_ioend_shutdown(inode); | 218 | |
219 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | ||
219 | goto no_delete; | 220 | goto no_delete; |
220 | } | 221 | } |
221 | 222 | ||
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode) | |||
225 | if (ext4_should_order_data(inode)) | 226 | if (ext4_should_order_data(inode)) |
226 | ext4_begin_ordered_truncate(inode, 0); | 227 | ext4_begin_ordered_truncate(inode, 0); |
227 | truncate_inode_pages(&inode->i_data, 0); | 228 | truncate_inode_pages(&inode->i_data, 0); |
228 | ext4_ioend_shutdown(inode); | ||
229 | 229 | ||
230 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | ||
230 | if (is_bad_inode(inode)) | 231 | if (is_bad_inode(inode)) |
231 | goto no_delete; | 232 | goto no_delete; |
232 | 233 | ||
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func, | |||
423 | #define check_block_validity(inode, map) \ | 424 | #define check_block_validity(inode, map) \ |
424 | __check_block_validity((inode), __func__, __LINE__, (map)) | 425 | __check_block_validity((inode), __func__, __LINE__, (map)) |
425 | 426 | ||
426 | /* | ||
427 | * Return the number of contiguous dirty pages in a given inode | ||
428 | * starting at page frame idx. | ||
429 | */ | ||
430 | static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, | ||
431 | unsigned int max_pages) | ||
432 | { | ||
433 | struct address_space *mapping = inode->i_mapping; | ||
434 | pgoff_t index; | ||
435 | struct pagevec pvec; | ||
436 | pgoff_t num = 0; | ||
437 | int i, nr_pages, done = 0; | ||
438 | |||
439 | if (max_pages == 0) | ||
440 | return 0; | ||
441 | pagevec_init(&pvec, 0); | ||
442 | while (!done) { | ||
443 | index = idx; | ||
444 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
445 | PAGECACHE_TAG_DIRTY, | ||
446 | (pgoff_t)PAGEVEC_SIZE); | ||
447 | if (nr_pages == 0) | ||
448 | break; | ||
449 | for (i = 0; i < nr_pages; i++) { | ||
450 | struct page *page = pvec.pages[i]; | ||
451 | struct buffer_head *bh, *head; | ||
452 | |||
453 | lock_page(page); | ||
454 | if (unlikely(page->mapping != mapping) || | ||
455 | !PageDirty(page) || | ||
456 | PageWriteback(page) || | ||
457 | page->index != idx) { | ||
458 | done = 1; | ||
459 | unlock_page(page); | ||
460 | break; | ||
461 | } | ||
462 | if (page_has_buffers(page)) { | ||
463 | bh = head = page_buffers(page); | ||
464 | do { | ||
465 | if (!buffer_delay(bh) && | ||
466 | !buffer_unwritten(bh)) | ||
467 | done = 1; | ||
468 | bh = bh->b_this_page; | ||
469 | } while (!done && (bh != head)); | ||
470 | } | ||
471 | unlock_page(page); | ||
472 | if (done) | ||
473 | break; | ||
474 | idx++; | ||
475 | num++; | ||
476 | if (num >= max_pages) { | ||
477 | done = 1; | ||
478 | break; | ||
479 | } | ||
480 | } | ||
481 | pagevec_release(&pvec); | ||
482 | } | ||
483 | return num; | ||
484 | } | ||
485 | |||
486 | #ifdef ES_AGGRESSIVE_TEST | 427 | #ifdef ES_AGGRESSIVE_TEST |
487 | static void ext4_map_blocks_es_recheck(handle_t *handle, | 428 | static void ext4_map_blocks_es_recheck(handle_t *handle, |
488 | struct inode *inode, | 429 | struct inode *inode, |
@@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, | |||
573 | "logical block %lu\n", inode->i_ino, flags, map->m_len, | 514 | "logical block %lu\n", inode->i_ino, flags, map->m_len, |
574 | (unsigned long) map->m_lblk); | 515 | (unsigned long) map->m_lblk); |
575 | 516 | ||
517 | ext4_es_lru_add(inode); | ||
518 | |||
576 | /* Lookup extent status tree firstly */ | 519 | /* Lookup extent status tree firstly */ |
577 | if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { | 520 | if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { |
578 | if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { | 521 | if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { |
@@ -1118,10 +1061,13 @@ static int ext4_write_end(struct file *file, | |||
1118 | } | 1061 | } |
1119 | } | 1062 | } |
1120 | 1063 | ||
1121 | if (ext4_has_inline_data(inode)) | 1064 | if (ext4_has_inline_data(inode)) { |
1122 | copied = ext4_write_inline_data_end(inode, pos, len, | 1065 | ret = ext4_write_inline_data_end(inode, pos, len, |
1123 | copied, page); | 1066 | copied, page); |
1124 | else | 1067 | if (ret < 0) |
1068 | goto errout; | ||
1069 | copied = ret; | ||
1070 | } else | ||
1125 | copied = block_write_end(file, mapping, pos, | 1071 | copied = block_write_end(file, mapping, pos, |
1126 | len, copied, page, fsdata); | 1072 | len, copied, page, fsdata); |
1127 | 1073 | ||
@@ -1157,8 +1103,6 @@ static int ext4_write_end(struct file *file, | |||
1157 | if (i_size_changed) | 1103 | if (i_size_changed) |
1158 | ext4_mark_inode_dirty(handle, inode); | 1104 | ext4_mark_inode_dirty(handle, inode); |
1159 | 1105 | ||
1160 | if (copied < 0) | ||
1161 | ret = copied; | ||
1162 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | 1106 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1163 | /* if we have allocated more blocks and copied | 1107 | /* if we have allocated more blocks and copied |
1164 | * less. We will have blocks allocated outside | 1108 | * less. We will have blocks allocated outside |
@@ -1415,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free) | |||
1415 | } | 1359 | } |
1416 | 1360 | ||
1417 | static void ext4_da_page_release_reservation(struct page *page, | 1361 | static void ext4_da_page_release_reservation(struct page *page, |
1418 | unsigned long offset) | 1362 | unsigned int offset, |
1363 | unsigned int length) | ||
1419 | { | 1364 | { |
1420 | int to_release = 0; | 1365 | int to_release = 0; |
1421 | struct buffer_head *head, *bh; | 1366 | struct buffer_head *head, *bh; |
1422 | unsigned int curr_off = 0; | 1367 | unsigned int curr_off = 0; |
1423 | struct inode *inode = page->mapping->host; | 1368 | struct inode *inode = page->mapping->host; |
1424 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1369 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1370 | unsigned int stop = offset + length; | ||
1425 | int num_clusters; | 1371 | int num_clusters; |
1426 | ext4_fsblk_t lblk; | 1372 | ext4_fsblk_t lblk; |
1427 | 1373 | ||
1374 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | ||
1375 | |||
1428 | head = page_buffers(page); | 1376 | head = page_buffers(page); |
1429 | bh = head; | 1377 | bh = head; |
1430 | do { | 1378 | do { |
1431 | unsigned int next_off = curr_off + bh->b_size; | 1379 | unsigned int next_off = curr_off + bh->b_size; |
1432 | 1380 | ||
1381 | if (next_off > stop) | ||
1382 | break; | ||
1383 | |||
1433 | if ((offset <= curr_off) && (buffer_delay(bh))) { | 1384 | if ((offset <= curr_off) && (buffer_delay(bh))) { |
1434 | to_release++; | 1385 | to_release++; |
1435 | clear_buffer_delay(bh); | 1386 | clear_buffer_delay(bh); |
@@ -1460,140 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1460 | * Delayed allocation stuff | 1411 | * Delayed allocation stuff |
1461 | */ | 1412 | */ |
1462 | 1413 | ||
1463 | /* | 1414 | struct mpage_da_data { |
1464 | * mpage_da_submit_io - walks through extent of pages and try to write | 1415 | struct inode *inode; |
1465 | * them with writepage() call back | 1416 | struct writeback_control *wbc; |
1466 | * | ||
1467 | * @mpd->inode: inode | ||
1468 | * @mpd->first_page: first page of the extent | ||
1469 | * @mpd->next_page: page after the last page of the extent | ||
1470 | * | ||
1471 | * By the time mpage_da_submit_io() is called we expect all blocks | ||
1472 | * to be allocated. this may be wrong if allocation failed. | ||
1473 | * | ||
1474 | * As pages are already locked by write_cache_pages(), we can't use it | ||
1475 | */ | ||
1476 | static int mpage_da_submit_io(struct mpage_da_data *mpd, | ||
1477 | struct ext4_map_blocks *map) | ||
1478 | { | ||
1479 | struct pagevec pvec; | ||
1480 | unsigned long index, end; | ||
1481 | int ret = 0, err, nr_pages, i; | ||
1482 | struct inode *inode = mpd->inode; | ||
1483 | struct address_space *mapping = inode->i_mapping; | ||
1484 | loff_t size = i_size_read(inode); | ||
1485 | unsigned int len, block_start; | ||
1486 | struct buffer_head *bh, *page_bufs = NULL; | ||
1487 | sector_t pblock = 0, cur_logical = 0; | ||
1488 | struct ext4_io_submit io_submit; | ||
1489 | 1417 | ||
1490 | BUG_ON(mpd->next_page <= mpd->first_page); | 1418 | pgoff_t first_page; /* The first page to write */ |
1491 | memset(&io_submit, 0, sizeof(io_submit)); | 1419 | pgoff_t next_page; /* Current page to examine */ |
1420 | pgoff_t last_page; /* Last page to examine */ | ||
1492 | /* | 1421 | /* |
1493 | * We need to start from the first_page to the next_page - 1 | 1422 | * Extent to map - this can be after first_page because that can be |
1494 | * to make sure we also write the mapped dirty buffer_heads. | 1423 | * fully mapped. We somewhat abuse m_flags to store whether the extent |
1495 | * If we look at mpd->b_blocknr we would only be looking | 1424 | * is delalloc or unwritten. |
1496 | * at the currently mapped buffer_heads. | ||
1497 | */ | 1425 | */ |
1498 | index = mpd->first_page; | 1426 | struct ext4_map_blocks map; |
1499 | end = mpd->next_page - 1; | 1427 | struct ext4_io_submit io_submit; /* IO submission data */ |
1500 | 1428 | }; | |
1501 | pagevec_init(&pvec, 0); | ||
1502 | while (index <= end) { | ||
1503 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | ||
1504 | if (nr_pages == 0) | ||
1505 | break; | ||
1506 | for (i = 0; i < nr_pages; i++) { | ||
1507 | int skip_page = 0; | ||
1508 | struct page *page = pvec.pages[i]; | ||
1509 | |||
1510 | index = page->index; | ||
1511 | if (index > end) | ||
1512 | break; | ||
1513 | |||
1514 | if (index == size >> PAGE_CACHE_SHIFT) | ||
1515 | len = size & ~PAGE_CACHE_MASK; | ||
1516 | else | ||
1517 | len = PAGE_CACHE_SIZE; | ||
1518 | if (map) { | ||
1519 | cur_logical = index << (PAGE_CACHE_SHIFT - | ||
1520 | inode->i_blkbits); | ||
1521 | pblock = map->m_pblk + (cur_logical - | ||
1522 | map->m_lblk); | ||
1523 | } | ||
1524 | index++; | ||
1525 | |||
1526 | BUG_ON(!PageLocked(page)); | ||
1527 | BUG_ON(PageWriteback(page)); | ||
1528 | |||
1529 | bh = page_bufs = page_buffers(page); | ||
1530 | block_start = 0; | ||
1531 | do { | ||
1532 | if (map && (cur_logical >= map->m_lblk) && | ||
1533 | (cur_logical <= (map->m_lblk + | ||
1534 | (map->m_len - 1)))) { | ||
1535 | if (buffer_delay(bh)) { | ||
1536 | clear_buffer_delay(bh); | ||
1537 | bh->b_blocknr = pblock; | ||
1538 | } | ||
1539 | if (buffer_unwritten(bh) || | ||
1540 | buffer_mapped(bh)) | ||
1541 | BUG_ON(bh->b_blocknr != pblock); | ||
1542 | if (map->m_flags & EXT4_MAP_UNINIT) | ||
1543 | set_buffer_uninit(bh); | ||
1544 | clear_buffer_unwritten(bh); | ||
1545 | } | ||
1546 | |||
1547 | /* | ||
1548 | * skip page if block allocation undone and | ||
1549 | * block is dirty | ||
1550 | */ | ||
1551 | if (ext4_bh_delay_or_unwritten(NULL, bh)) | ||
1552 | skip_page = 1; | ||
1553 | bh = bh->b_this_page; | ||
1554 | block_start += bh->b_size; | ||
1555 | cur_logical++; | ||
1556 | pblock++; | ||
1557 | } while (bh != page_bufs); | ||
1558 | |||
1559 | if (skip_page) { | ||
1560 | unlock_page(page); | ||
1561 | continue; | ||
1562 | } | ||
1563 | |||
1564 | clear_page_dirty_for_io(page); | ||
1565 | err = ext4_bio_write_page(&io_submit, page, len, | ||
1566 | mpd->wbc); | ||
1567 | if (!err) | ||
1568 | mpd->pages_written++; | ||
1569 | /* | ||
1570 | * In error case, we have to continue because | ||
1571 | * remaining pages are still locked | ||
1572 | */ | ||
1573 | if (ret == 0) | ||
1574 | ret = err; | ||
1575 | } | ||
1576 | pagevec_release(&pvec); | ||
1577 | } | ||
1578 | ext4_io_submit(&io_submit); | ||
1579 | return ret; | ||
1580 | } | ||
1581 | 1429 | ||
1582 | static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) | 1430 | static void mpage_release_unused_pages(struct mpage_da_data *mpd, |
1431 | bool invalidate) | ||
1583 | { | 1432 | { |
1584 | int nr_pages, i; | 1433 | int nr_pages, i; |
1585 | pgoff_t index, end; | 1434 | pgoff_t index, end; |
1586 | struct pagevec pvec; | 1435 | struct pagevec pvec; |
1587 | struct inode *inode = mpd->inode; | 1436 | struct inode *inode = mpd->inode; |
1588 | struct address_space *mapping = inode->i_mapping; | 1437 | struct address_space *mapping = inode->i_mapping; |
1589 | ext4_lblk_t start, last; | 1438 | |
1439 | /* This is necessary when next_page == 0. */ | ||
1440 | if (mpd->first_page >= mpd->next_page) | ||
1441 | return; | ||
1590 | 1442 | ||
1591 | index = mpd->first_page; | 1443 | index = mpd->first_page; |
1592 | end = mpd->next_page - 1; | 1444 | end = mpd->next_page - 1; |
1593 | 1445 | if (invalidate) { | |
1594 | start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1446 | ext4_lblk_t start, last; |
1595 | last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1447 | start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1596 | ext4_es_remove_extent(inode, start, last - start + 1); | 1448 | last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1449 | ext4_es_remove_extent(inode, start, last - start + 1); | ||
1450 | } | ||
1597 | 1451 | ||
1598 | pagevec_init(&pvec, 0); | 1452 | pagevec_init(&pvec, 0); |
1599 | while (index <= end) { | 1453 | while (index <= end) { |
@@ -1606,14 +1460,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) | |||
1606 | break; | 1460 | break; |
1607 | BUG_ON(!PageLocked(page)); | 1461 | BUG_ON(!PageLocked(page)); |
1608 | BUG_ON(PageWriteback(page)); | 1462 | BUG_ON(PageWriteback(page)); |
1609 | block_invalidatepage(page, 0); | 1463 | if (invalidate) { |
1610 | ClearPageUptodate(page); | 1464 | block_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
1465 | ClearPageUptodate(page); | ||
1466 | } | ||
1611 | unlock_page(page); | 1467 | unlock_page(page); |
1612 | } | 1468 | } |
1613 | index = pvec.pages[nr_pages - 1]->index + 1; | 1469 | index = pvec.pages[nr_pages - 1]->index + 1; |
1614 | pagevec_release(&pvec); | 1470 | pagevec_release(&pvec); |
1615 | } | 1471 | } |
1616 | return; | ||
1617 | } | 1472 | } |
1618 | 1473 | ||
1619 | static void ext4_print_free_blocks(struct inode *inode) | 1474 | static void ext4_print_free_blocks(struct inode *inode) |
@@ -1642,215 +1497,6 @@ static void ext4_print_free_blocks(struct inode *inode) | |||
1642 | return; | 1497 | return; |
1643 | } | 1498 | } |
1644 | 1499 | ||
1645 | /* | ||
1646 | * mpage_da_map_and_submit - go through given space, map them | ||
1647 | * if necessary, and then submit them for I/O | ||
1648 | * | ||
1649 | * @mpd - bh describing space | ||
1650 | * | ||
1651 | * The function skips space we know is already mapped to disk blocks. | ||
1652 | * | ||
1653 | */ | ||
1654 | static void mpage_da_map_and_submit(struct mpage_da_data *mpd) | ||
1655 | { | ||
1656 | int err, blks, get_blocks_flags; | ||
1657 | struct ext4_map_blocks map, *mapp = NULL; | ||
1658 | sector_t next = mpd->b_blocknr; | ||
1659 | unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; | ||
1660 | loff_t disksize = EXT4_I(mpd->inode)->i_disksize; | ||
1661 | handle_t *handle = NULL; | ||
1662 | |||
1663 | /* | ||
1664 | * If the blocks are mapped already, or we couldn't accumulate | ||
1665 | * any blocks, then proceed immediately to the submission stage. | ||
1666 | */ | ||
1667 | if ((mpd->b_size == 0) || | ||
1668 | ((mpd->b_state & (1 << BH_Mapped)) && | ||
1669 | !(mpd->b_state & (1 << BH_Delay)) && | ||
1670 | !(mpd->b_state & (1 << BH_Unwritten)))) | ||
1671 | goto submit_io; | ||
1672 | |||
1673 | handle = ext4_journal_current_handle(); | ||
1674 | BUG_ON(!handle); | ||
1675 | |||
1676 | /* | ||
1677 | * Call ext4_map_blocks() to allocate any delayed allocation | ||
1678 | * blocks, or to convert an uninitialized extent to be | ||
1679 | * initialized (in the case where we have written into | ||
1680 | * one or more preallocated blocks). | ||
1681 | * | ||
1682 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to | ||
1683 | * indicate that we are on the delayed allocation path. This | ||
1684 | * affects functions in many different parts of the allocation | ||
1685 | * call path. This flag exists primarily because we don't | ||
1686 | * want to change *many* call functions, so ext4_map_blocks() | ||
1687 | * will set the EXT4_STATE_DELALLOC_RESERVED flag once the | ||
1688 | * inode's allocation semaphore is taken. | ||
1689 | * | ||
1690 | * If the blocks in questions were delalloc blocks, set | ||
1691 | * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting | ||
1692 | * variables are updated after the blocks have been allocated. | ||
1693 | */ | ||
1694 | map.m_lblk = next; | ||
1695 | map.m_len = max_blocks; | ||
1696 | /* | ||
1697 | * We're in delalloc path and it is possible that we're going to | ||
1698 | * need more metadata blocks than previously reserved. However | ||
1699 | * we must not fail because we're in writeback and there is | ||
1700 | * nothing we can do about it so it might result in data loss. | ||
1701 | * So use reserved blocks to allocate metadata if possible. | ||
1702 | */ | ||
1703 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | | ||
1704 | EXT4_GET_BLOCKS_METADATA_NOFAIL; | ||
1705 | if (ext4_should_dioread_nolock(mpd->inode)) | ||
1706 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | ||
1707 | if (mpd->b_state & (1 << BH_Delay)) | ||
1708 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | ||
1709 | |||
1710 | |||
1711 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); | ||
1712 | if (blks < 0) { | ||
1713 | struct super_block *sb = mpd->inode->i_sb; | ||
1714 | |||
1715 | err = blks; | ||
1716 | /* | ||
1717 | * If get block returns EAGAIN or ENOSPC and there | ||
1718 | * appears to be free blocks we will just let | ||
1719 | * mpage_da_submit_io() unlock all of the pages. | ||
1720 | */ | ||
1721 | if (err == -EAGAIN) | ||
1722 | goto submit_io; | ||
1723 | |||
1724 | if (err == -ENOSPC && ext4_count_free_clusters(sb)) { | ||
1725 | mpd->retval = err; | ||
1726 | goto submit_io; | ||
1727 | } | ||
1728 | |||
1729 | /* | ||
1730 | * get block failure will cause us to loop in | ||
1731 | * writepages, because a_ops->writepage won't be able | ||
1732 | * to make progress. The page will be redirtied by | ||
1733 | * writepage and writepages will again try to write | ||
1734 | * the same. | ||
1735 | */ | ||
1736 | if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { | ||
1737 | ext4_msg(sb, KERN_CRIT, | ||
1738 | "delayed block allocation failed for inode %lu " | ||
1739 | "at logical offset %llu with max blocks %zd " | ||
1740 | "with error %d", mpd->inode->i_ino, | ||
1741 | (unsigned long long) next, | ||
1742 | mpd->b_size >> mpd->inode->i_blkbits, err); | ||
1743 | ext4_msg(sb, KERN_CRIT, | ||
1744 | "This should not happen!! Data will be lost"); | ||
1745 | if (err == -ENOSPC) | ||
1746 | ext4_print_free_blocks(mpd->inode); | ||
1747 | } | ||
1748 | /* invalidate all the pages */ | ||
1749 | ext4_da_block_invalidatepages(mpd); | ||
1750 | |||
1751 | /* Mark this page range as having been completed */ | ||
1752 | mpd->io_done = 1; | ||
1753 | return; | ||
1754 | } | ||
1755 | BUG_ON(blks == 0); | ||
1756 | |||
1757 | mapp = &map; | ||
1758 | if (map.m_flags & EXT4_MAP_NEW) { | ||
1759 | struct block_device *bdev = mpd->inode->i_sb->s_bdev; | ||
1760 | int i; | ||
1761 | |||
1762 | for (i = 0; i < map.m_len; i++) | ||
1763 | unmap_underlying_metadata(bdev, map.m_pblk + i); | ||
1764 | } | ||
1765 | |||
1766 | /* | ||
1767 | * Update on-disk size along with block allocation. | ||
1768 | */ | ||
1769 | disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; | ||
1770 | if (disksize > i_size_read(mpd->inode)) | ||
1771 | disksize = i_size_read(mpd->inode); | ||
1772 | if (disksize > EXT4_I(mpd->inode)->i_disksize) { | ||
1773 | ext4_update_i_disksize(mpd->inode, disksize); | ||
1774 | err = ext4_mark_inode_dirty(handle, mpd->inode); | ||
1775 | if (err) | ||
1776 | ext4_error(mpd->inode->i_sb, | ||
1777 | "Failed to mark inode %lu dirty", | ||
1778 | mpd->inode->i_ino); | ||
1779 | } | ||
1780 | |||
1781 | submit_io: | ||
1782 | mpage_da_submit_io(mpd, mapp); | ||
1783 | mpd->io_done = 1; | ||
1784 | } | ||
1785 | |||
1786 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ | ||
1787 | (1 << BH_Delay) | (1 << BH_Unwritten)) | ||
1788 | |||
1789 | /* | ||
1790 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks | ||
1791 | * | ||
1792 | * @mpd->lbh - extent of blocks | ||
1793 | * @logical - logical number of the block in the file | ||
1794 | * @b_state - b_state of the buffer head added | ||
1795 | * | ||
1796 | * the function is used to collect contig. blocks in same state | ||
1797 | */ | ||
1798 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, | ||
1799 | unsigned long b_state) | ||
1800 | { | ||
1801 | sector_t next; | ||
1802 | int blkbits = mpd->inode->i_blkbits; | ||
1803 | int nrblocks = mpd->b_size >> blkbits; | ||
1804 | |||
1805 | /* | ||
1806 | * XXX Don't go larger than mballoc is willing to allocate | ||
1807 | * This is a stopgap solution. We eventually need to fold | ||
1808 | * mpage_da_submit_io() into this function and then call | ||
1809 | * ext4_map_blocks() multiple times in a loop | ||
1810 | */ | ||
1811 | if (nrblocks >= (8*1024*1024 >> blkbits)) | ||
1812 | goto flush_it; | ||
1813 | |||
1814 | /* check if the reserved journal credits might overflow */ | ||
1815 | if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { | ||
1816 | if (nrblocks >= EXT4_MAX_TRANS_DATA) { | ||
1817 | /* | ||
1818 | * With non-extent format we are limited by the journal | ||
1819 | * credit available. Total credit needed to insert | ||
1820 | * nrblocks contiguous blocks is dependent on the | ||
1821 | * nrblocks. So limit nrblocks. | ||
1822 | */ | ||
1823 | goto flush_it; | ||
1824 | } | ||
1825 | } | ||
1826 | /* | ||
1827 | * First block in the extent | ||
1828 | */ | ||
1829 | if (mpd->b_size == 0) { | ||
1830 | mpd->b_blocknr = logical; | ||
1831 | mpd->b_size = 1 << blkbits; | ||
1832 | mpd->b_state = b_state & BH_FLAGS; | ||
1833 | return; | ||
1834 | } | ||
1835 | |||
1836 | next = mpd->b_blocknr + nrblocks; | ||
1837 | /* | ||
1838 | * Can we merge the block to our big extent? | ||
1839 | */ | ||
1840 | if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { | ||
1841 | mpd->b_size += 1 << blkbits; | ||
1842 | return; | ||
1843 | } | ||
1844 | |||
1845 | flush_it: | ||
1846 | /* | ||
1847 | * We couldn't merge the block to our extent, so we | ||
1848 | * need to flush current extent and start new one | ||
1849 | */ | ||
1850 | mpage_da_map_and_submit(mpd); | ||
1851 | return; | ||
1852 | } | ||
1853 | |||
1854 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) | 1500 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) |
1855 | { | 1501 | { |
1856 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); | 1502 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); |
@@ -1883,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, | |||
1883 | "logical block %lu\n", inode->i_ino, map->m_len, | 1529 | "logical block %lu\n", inode->i_ino, map->m_len, |
1884 | (unsigned long) map->m_lblk); | 1530 | (unsigned long) map->m_lblk); |
1885 | 1531 | ||
1532 | ext4_es_lru_add(inode); | ||
1533 | |||
1886 | /* Lookup extent status tree firstly */ | 1534 | /* Lookup extent status tree firstly */ |
1887 | if (ext4_es_lookup_extent(inode, iblock, &es)) { | 1535 | if (ext4_es_lookup_extent(inode, iblock, &es)) { |
1888 | 1536 | ||
@@ -2156,7 +1804,7 @@ out: | |||
2156 | * lock so we have to do some magic. | 1804 | * lock so we have to do some magic. |
2157 | * | 1805 | * |
2158 | * This function can get called via... | 1806 | * This function can get called via... |
2159 | * - ext4_da_writepages after taking page lock (have journal handle) | 1807 | * - ext4_writepages after taking page lock (have journal handle) |
2160 | * - journal_submit_inode_data_buffers (no journal handle) | 1808 | * - journal_submit_inode_data_buffers (no journal handle) |
2161 | * - shrink_page_list via the kswapd/direct reclaim (no journal handle) | 1809 | * - shrink_page_list via the kswapd/direct reclaim (no journal handle) |
2162 | * - grab_page_cache when doing write_begin (have journal handle) | 1810 | * - grab_page_cache when doing write_begin (have journal handle) |
@@ -2234,76 +1882,405 @@ static int ext4_writepage(struct page *page, | |||
2234 | */ | 1882 | */ |
2235 | return __ext4_journalled_writepage(page, len); | 1883 | return __ext4_journalled_writepage(page, len); |
2236 | 1884 | ||
2237 | memset(&io_submit, 0, sizeof(io_submit)); | 1885 | ext4_io_submit_init(&io_submit, wbc); |
1886 | io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
1887 | if (!io_submit.io_end) { | ||
1888 | redirty_page_for_writepage(wbc, page); | ||
1889 | unlock_page(page); | ||
1890 | return -ENOMEM; | ||
1891 | } | ||
2238 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); | 1892 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); |
2239 | ext4_io_submit(&io_submit); | 1893 | ext4_io_submit(&io_submit); |
1894 | /* Drop io_end reference we got from init */ | ||
1895 | ext4_put_io_end_defer(io_submit.io_end); | ||
2240 | return ret; | 1896 | return ret; |
2241 | } | 1897 | } |
2242 | 1898 | ||
1899 | #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) | ||
1900 | |||
2243 | /* | 1901 | /* |
2244 | * This is called via ext4_da_writepages() to | 1902 | * mballoc gives us at most this number of blocks... |
2245 | * calculate the total number of credits to reserve to fit | 1903 | * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). |
2246 | * a single extent allocation into a single transaction, | 1904 | * The rest of mballoc seems to handle chunks upto full group size. |
2247 | * ext4_da_writpeages() will loop calling this before | ||
2248 | * the block allocation. | ||
2249 | */ | 1905 | */ |
1906 | #define MAX_WRITEPAGES_EXTENT_LEN 2048 | ||
2250 | 1907 | ||
2251 | static int ext4_da_writepages_trans_blocks(struct inode *inode) | 1908 | /* |
1909 | * mpage_add_bh_to_extent - try to add bh to extent of blocks to map | ||
1910 | * | ||
1911 | * @mpd - extent of blocks | ||
1912 | * @lblk - logical number of the block in the file | ||
1913 | * @b_state - b_state of the buffer head added | ||
1914 | * | ||
1915 | * the function is used to collect contig. blocks in same state | ||
1916 | */ | ||
1917 | static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, | ||
1918 | unsigned long b_state) | ||
1919 | { | ||
1920 | struct ext4_map_blocks *map = &mpd->map; | ||
1921 | |||
1922 | /* Don't go larger than mballoc is willing to allocate */ | ||
1923 | if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) | ||
1924 | return 0; | ||
1925 | |||
1926 | /* First block in the extent? */ | ||
1927 | if (map->m_len == 0) { | ||
1928 | map->m_lblk = lblk; | ||
1929 | map->m_len = 1; | ||
1930 | map->m_flags = b_state & BH_FLAGS; | ||
1931 | return 1; | ||
1932 | } | ||
1933 | |||
1934 | /* Can we merge the block to our big extent? */ | ||
1935 | if (lblk == map->m_lblk + map->m_len && | ||
1936 | (b_state & BH_FLAGS) == map->m_flags) { | ||
1937 | map->m_len++; | ||
1938 | return 1; | ||
1939 | } | ||
1940 | return 0; | ||
1941 | } | ||
1942 | |||
1943 | static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, | ||
1944 | struct buffer_head *head, | ||
1945 | struct buffer_head *bh, | ||
1946 | ext4_lblk_t lblk) | ||
1947 | { | ||
1948 | struct inode *inode = mpd->inode; | ||
1949 | ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) | ||
1950 | >> inode->i_blkbits; | ||
1951 | |||
1952 | do { | ||
1953 | BUG_ON(buffer_locked(bh)); | ||
1954 | |||
1955 | if (!buffer_dirty(bh) || !buffer_mapped(bh) || | ||
1956 | (!buffer_delay(bh) && !buffer_unwritten(bh)) || | ||
1957 | lblk >= blocks) { | ||
1958 | /* Found extent to map? */ | ||
1959 | if (mpd->map.m_len) | ||
1960 | return false; | ||
1961 | if (lblk >= blocks) | ||
1962 | return true; | ||
1963 | continue; | ||
1964 | } | ||
1965 | if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) | ||
1966 | return false; | ||
1967 | } while (lblk++, (bh = bh->b_this_page) != head); | ||
1968 | return true; | ||
1969 | } | ||
1970 | |||
1971 | static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) | ||
2252 | { | 1972 | { |
2253 | int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 1973 | int len; |
1974 | loff_t size = i_size_read(mpd->inode); | ||
1975 | int err; | ||
1976 | |||
1977 | BUG_ON(page->index != mpd->first_page); | ||
1978 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
1979 | len = size & ~PAGE_CACHE_MASK; | ||
1980 | else | ||
1981 | len = PAGE_CACHE_SIZE; | ||
1982 | clear_page_dirty_for_io(page); | ||
1983 | err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); | ||
1984 | if (!err) | ||
1985 | mpd->wbc->nr_to_write--; | ||
1986 | mpd->first_page++; | ||
2254 | 1987 | ||
1988 | return err; | ||
1989 | } | ||
1990 | |||
1991 | /* | ||
1992 | * mpage_map_buffers - update buffers corresponding to changed extent and | ||
1993 | * submit fully mapped pages for IO | ||
1994 | * | ||
1995 | * @mpd - description of extent to map, on return next extent to map | ||
1996 | * | ||
1997 | * Scan buffers corresponding to changed extent (we expect corresponding pages | ||
1998 | * to be already locked) and update buffer state according to new extent state. | ||
1999 | * We map delalloc buffers to their physical location, clear unwritten bits, | ||
2000 | * and mark buffers as uninit when we perform writes to uninitialized extents | ||
2001 | * and do extent conversion after IO is finished. If the last page is not fully | ||
2002 | * mapped, we update @map to the next extent in the last page that needs | ||
2003 | * mapping. Otherwise we submit the page for IO. | ||
2004 | */ | ||
2005 | static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) | ||
2006 | { | ||
2007 | struct pagevec pvec; | ||
2008 | int nr_pages, i; | ||
2009 | struct inode *inode = mpd->inode; | ||
2010 | struct buffer_head *head, *bh; | ||
2011 | int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; | ||
2012 | ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) | ||
2013 | >> inode->i_blkbits; | ||
2014 | pgoff_t start, end; | ||
2015 | ext4_lblk_t lblk; | ||
2016 | sector_t pblock; | ||
2017 | int err; | ||
2018 | |||
2019 | start = mpd->map.m_lblk >> bpp_bits; | ||
2020 | end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; | ||
2021 | lblk = start << bpp_bits; | ||
2022 | pblock = mpd->map.m_pblk; | ||
2023 | |||
2024 | pagevec_init(&pvec, 0); | ||
2025 | while (start <= end) { | ||
2026 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, | ||
2027 | PAGEVEC_SIZE); | ||
2028 | if (nr_pages == 0) | ||
2029 | break; | ||
2030 | for (i = 0; i < nr_pages; i++) { | ||
2031 | struct page *page = pvec.pages[i]; | ||
2032 | |||
2033 | if (page->index > end) | ||
2034 | break; | ||
2035 | /* Upto 'end' pages must be contiguous */ | ||
2036 | BUG_ON(page->index != start); | ||
2037 | bh = head = page_buffers(page); | ||
2038 | do { | ||
2039 | if (lblk < mpd->map.m_lblk) | ||
2040 | continue; | ||
2041 | if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { | ||
2042 | /* | ||
2043 | * Buffer after end of mapped extent. | ||
2044 | * Find next buffer in the page to map. | ||
2045 | */ | ||
2046 | mpd->map.m_len = 0; | ||
2047 | mpd->map.m_flags = 0; | ||
2048 | add_page_bufs_to_extent(mpd, head, bh, | ||
2049 | lblk); | ||
2050 | pagevec_release(&pvec); | ||
2051 | return 0; | ||
2052 | } | ||
2053 | if (buffer_delay(bh)) { | ||
2054 | clear_buffer_delay(bh); | ||
2055 | bh->b_blocknr = pblock++; | ||
2056 | } | ||
2057 | clear_buffer_unwritten(bh); | ||
2058 | } while (++lblk < blocks && | ||
2059 | (bh = bh->b_this_page) != head); | ||
2060 | |||
2061 | /* | ||
2062 | * FIXME: This is going to break if dioread_nolock | ||
2063 | * supports blocksize < pagesize as we will try to | ||
2064 | * convert potentially unmapped parts of inode. | ||
2065 | */ | ||
2066 | mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; | ||
2067 | /* Page fully mapped - let IO run! */ | ||
2068 | err = mpage_submit_page(mpd, page); | ||
2069 | if (err < 0) { | ||
2070 | pagevec_release(&pvec); | ||
2071 | return err; | ||
2072 | } | ||
2073 | start++; | ||
2074 | } | ||
2075 | pagevec_release(&pvec); | ||
2076 | } | ||
2077 | /* Extent fully mapped and matches with page boundary. We are done. */ | ||
2078 | mpd->map.m_len = 0; | ||
2079 | mpd->map.m_flags = 0; | ||
2080 | return 0; | ||
2081 | } | ||
2082 | |||
2083 | static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) | ||
2084 | { | ||
2085 | struct inode *inode = mpd->inode; | ||
2086 | struct ext4_map_blocks *map = &mpd->map; | ||
2087 | int get_blocks_flags; | ||
2088 | int err; | ||
2089 | |||
2090 | trace_ext4_da_write_pages_extent(inode, map); | ||
2255 | /* | 2091 | /* |
2256 | * With non-extent format the journal credit needed to | 2092 | * Call ext4_map_blocks() to allocate any delayed allocation blocks, or |
2257 | * insert nrblocks contiguous block is dependent on | 2093 | * to convert an uninitialized extent to be initialized (in the case |
2258 | * number of contiguous block. So we will limit | 2094 | * where we have written into one or more preallocated blocks). It is |
2259 | * number of contiguous block to a sane value | 2095 | * possible that we're going to need more metadata blocks than |
2096 | * previously reserved. However we must not fail because we're in | ||
2097 | * writeback and there is nothing we can do about it so it might result | ||
2098 | * in data loss. So use reserved blocks to allocate metadata if | ||
2099 | * possible. | ||
2100 | * | ||
2101 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks | ||
2102 | * in question are delalloc blocks. This affects functions in many | ||
2103 | * different parts of the allocation call path. This flag exists | ||
2104 | * primarily because we don't want to change *many* call functions, so | ||
2105 | * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag | ||
2106 | * once the inode's allocation semaphore is taken. | ||
2260 | */ | 2107 | */ |
2261 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && | 2108 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | |
2262 | (max_blocks > EXT4_MAX_TRANS_DATA)) | 2109 | EXT4_GET_BLOCKS_METADATA_NOFAIL; |
2263 | max_blocks = EXT4_MAX_TRANS_DATA; | 2110 | if (ext4_should_dioread_nolock(inode)) |
2111 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | ||
2112 | if (map->m_flags & (1 << BH_Delay)) | ||
2113 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | ||
2264 | 2114 | ||
2265 | return ext4_chunk_trans_blocks(inode, max_blocks); | 2115 | err = ext4_map_blocks(handle, inode, map, get_blocks_flags); |
2116 | if (err < 0) | ||
2117 | return err; | ||
2118 | if (map->m_flags & EXT4_MAP_UNINIT) { | ||
2119 | if (!mpd->io_submit.io_end->handle && | ||
2120 | ext4_handle_valid(handle)) { | ||
2121 | mpd->io_submit.io_end->handle = handle->h_rsv_handle; | ||
2122 | handle->h_rsv_handle = NULL; | ||
2123 | } | ||
2124 | ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); | ||
2125 | } | ||
2126 | |||
2127 | BUG_ON(map->m_len == 0); | ||
2128 | if (map->m_flags & EXT4_MAP_NEW) { | ||
2129 | struct block_device *bdev = inode->i_sb->s_bdev; | ||
2130 | int i; | ||
2131 | |||
2132 | for (i = 0; i < map->m_len; i++) | ||
2133 | unmap_underlying_metadata(bdev, map->m_pblk + i); | ||
2134 | } | ||
2135 | return 0; | ||
2266 | } | 2136 | } |
2267 | 2137 | ||
2268 | /* | 2138 | /* |
2269 | * write_cache_pages_da - walk the list of dirty pages of the given | 2139 | * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length |
2270 | * address space and accumulate pages that need writing, and call | 2140 | * mpd->len and submit pages underlying it for IO |
2271 | * mpage_da_map_and_submit to map a single contiguous memory region | 2141 | * |
2272 | * and then write them. | 2142 | * @handle - handle for journal operations |
2143 | * @mpd - extent to map | ||
2144 | * | ||
2145 | * The function maps extent starting at mpd->lblk of length mpd->len. If it is | ||
2146 | * delayed, blocks are allocated, if it is unwritten, we may need to convert | ||
2147 | * them to initialized or split the described range from larger unwritten | ||
2148 | * extent. Note that we need not map all the described range since allocation | ||
2149 | * can return less blocks or the range is covered by more unwritten extents. We | ||
2150 | * cannot map more because we are limited by reserved transaction credits. On | ||
2151 | * the other hand we always make sure that the last touched page is fully | ||
2152 | * mapped so that it can be written out (and thus forward progress is | ||
2153 | * guaranteed). After mapping we submit all mapped pages for IO. | ||
2273 | */ | 2154 | */ |
2274 | static int write_cache_pages_da(handle_t *handle, | 2155 | static int mpage_map_and_submit_extent(handle_t *handle, |
2275 | struct address_space *mapping, | 2156 | struct mpage_da_data *mpd, |
2276 | struct writeback_control *wbc, | 2157 | bool *give_up_on_write) |
2277 | struct mpage_da_data *mpd, | ||
2278 | pgoff_t *done_index) | ||
2279 | { | 2158 | { |
2280 | struct buffer_head *bh, *head; | 2159 | struct inode *inode = mpd->inode; |
2281 | struct inode *inode = mapping->host; | 2160 | struct ext4_map_blocks *map = &mpd->map; |
2282 | struct pagevec pvec; | 2161 | int err; |
2283 | unsigned int nr_pages; | 2162 | loff_t disksize; |
2284 | sector_t logical; | ||
2285 | pgoff_t index, end; | ||
2286 | long nr_to_write = wbc->nr_to_write; | ||
2287 | int i, tag, ret = 0; | ||
2288 | |||
2289 | memset(mpd, 0, sizeof(struct mpage_da_data)); | ||
2290 | mpd->wbc = wbc; | ||
2291 | mpd->inode = inode; | ||
2292 | pagevec_init(&pvec, 0); | ||
2293 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | ||
2294 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | ||
2295 | 2163 | ||
2296 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2164 | mpd->io_submit.io_end->offset = |
2165 | ((loff_t)map->m_lblk) << inode->i_blkbits; | ||
2166 | while (map->m_len) { | ||
2167 | err = mpage_map_one_extent(handle, mpd); | ||
2168 | if (err < 0) { | ||
2169 | struct super_block *sb = inode->i_sb; | ||
2170 | |||
2171 | if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) | ||
2172 | goto invalidate_dirty_pages; | ||
2173 | /* | ||
2174 | * Let the uper layers retry transient errors. | ||
2175 | * In the case of ENOSPC, if ext4_count_free_blocks() | ||
2176 | * is non-zero, a commit should free up blocks. | ||
2177 | */ | ||
2178 | if ((err == -ENOMEM) || | ||
2179 | (err == -ENOSPC && ext4_count_free_clusters(sb))) | ||
2180 | return err; | ||
2181 | ext4_msg(sb, KERN_CRIT, | ||
2182 | "Delayed block allocation failed for " | ||
2183 | "inode %lu at logical offset %llu with" | ||
2184 | " max blocks %u with error %d", | ||
2185 | inode->i_ino, | ||
2186 | (unsigned long long)map->m_lblk, | ||
2187 | (unsigned)map->m_len, -err); | ||
2188 | ext4_msg(sb, KERN_CRIT, | ||
2189 | "This should not happen!! Data will " | ||
2190 | "be lost\n"); | ||
2191 | if (err == -ENOSPC) | ||
2192 | ext4_print_free_blocks(inode); | ||
2193 | invalidate_dirty_pages: | ||
2194 | *give_up_on_write = true; | ||
2195 | return err; | ||
2196 | } | ||
2197 | /* | ||
2198 | * Update buffer state, submit mapped pages, and get us new | ||
2199 | * extent to map | ||
2200 | */ | ||
2201 | err = mpage_map_and_submit_buffers(mpd); | ||
2202 | if (err < 0) | ||
2203 | return err; | ||
2204 | } | ||
2205 | |||
2206 | /* Update on-disk size after IO is submitted */ | ||
2207 | disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; | ||
2208 | if (disksize > i_size_read(inode)) | ||
2209 | disksize = i_size_read(inode); | ||
2210 | if (disksize > EXT4_I(inode)->i_disksize) { | ||
2211 | int err2; | ||
2212 | |||
2213 | ext4_update_i_disksize(inode, disksize); | ||
2214 | err2 = ext4_mark_inode_dirty(handle, inode); | ||
2215 | if (err2) | ||
2216 | ext4_error(inode->i_sb, | ||
2217 | "Failed to mark inode %lu dirty", | ||
2218 | inode->i_ino); | ||
2219 | if (!err) | ||
2220 | err = err2; | ||
2221 | } | ||
2222 | return err; | ||
2223 | } | ||
2224 | |||
2225 | /* | ||
2226 | * Calculate the total number of credits to reserve for one writepages | ||
2227 | * iteration. This is called from ext4_writepages(). We map an extent of | ||
2228 | * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping | ||
2229 | * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + | ||
2230 | * bpp - 1 blocks in bpp different extents. | ||
2231 | */ | ||
2232 | static int ext4_da_writepages_trans_blocks(struct inode *inode) | ||
2233 | { | ||
2234 | int bpp = ext4_journal_blocks_per_page(inode); | ||
2235 | |||
2236 | return ext4_meta_trans_blocks(inode, | ||
2237 | MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); | ||
2238 | } | ||
2239 | |||
2240 | /* | ||
2241 | * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages | ||
2242 | * and underlying extent to map | ||
2243 | * | ||
2244 | * @mpd - where to look for pages | ||
2245 | * | ||
2246 | * Walk dirty pages in the mapping. If they are fully mapped, submit them for | ||
2247 | * IO immediately. When we find a page which isn't mapped we start accumulating | ||
2248 | * extent of buffers underlying these pages that needs mapping (formed by | ||
2249 | * either delayed or unwritten buffers). We also lock the pages containing | ||
2250 | * these buffers. The extent found is returned in @mpd structure (starting at | ||
2251 | * mpd->lblk with length mpd->len blocks). | ||
2252 | * | ||
2253 | * Note that this function can attach bios to one io_end structure which are | ||
2254 | * neither logically nor physically contiguous. Although it may seem as an | ||
2255 | * unnecessary complication, it is actually inevitable in blocksize < pagesize | ||
2256 | * case as we need to track IO to all buffers underlying a page in one io_end. | ||
2257 | */ | ||
2258 | static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) | ||
2259 | { | ||
2260 | struct address_space *mapping = mpd->inode->i_mapping; | ||
2261 | struct pagevec pvec; | ||
2262 | unsigned int nr_pages; | ||
2263 | pgoff_t index = mpd->first_page; | ||
2264 | pgoff_t end = mpd->last_page; | ||
2265 | int tag; | ||
2266 | int i, err = 0; | ||
2267 | int blkbits = mpd->inode->i_blkbits; | ||
2268 | ext4_lblk_t lblk; | ||
2269 | struct buffer_head *head; | ||
2270 | |||
2271 | if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) | ||
2297 | tag = PAGECACHE_TAG_TOWRITE; | 2272 | tag = PAGECACHE_TAG_TOWRITE; |
2298 | else | 2273 | else |
2299 | tag = PAGECACHE_TAG_DIRTY; | 2274 | tag = PAGECACHE_TAG_DIRTY; |
2300 | 2275 | ||
2301 | *done_index = index; | 2276 | pagevec_init(&pvec, 0); |
2277 | mpd->map.m_len = 0; | ||
2278 | mpd->next_page = index; | ||
2302 | while (index <= end) { | 2279 | while (index <= end) { |
2303 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | 2280 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, |
2304 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | 2281 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); |
2305 | if (nr_pages == 0) | 2282 | if (nr_pages == 0) |
2306 | return 0; | 2283 | goto out; |
2307 | 2284 | ||
2308 | for (i = 0; i < nr_pages; i++) { | 2285 | for (i = 0; i < nr_pages; i++) { |
2309 | struct page *page = pvec.pages[i]; | 2286 | struct page *page = pvec.pages[i]; |
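The accumulation rule implemented by mpage_add_bh_to_extent() and add_page_bufs_to_extent() in the hunk above is easiest to see in isolation: a block joins the extent being collected only when the extent is still empty, or when the block is logically contiguous with it and carries the same delayed/unwritten state, and the extent never grows beyond MAX_WRITEPAGES_EXTENT_LEN. The user-space sketch below mirrors that rule; it is an illustration only, not code from the patch, and the names demo_extent and demo_add_block are made up for the example.

#include <stdbool.h>
#include <stdio.h>

#define MAX_WRITEPAGES_EXTENT_LEN 2048	/* same cap as in the patch */

struct demo_extent {
	unsigned long lblk;	/* first logical block of the extent */
	unsigned int len;	/* blocks collected so far */
	unsigned int state;	/* delayed/unwritten state bits */
};

static bool demo_add_block(struct demo_extent *ext, unsigned long lblk,
			   unsigned int state)
{
	if (ext->len >= MAX_WRITEPAGES_EXTENT_LEN)
		return false;		/* don't go larger than mballoc allocates */
	if (ext->len == 0) {		/* first block starts a new extent */
		ext->lblk = lblk;
		ext->len = 1;
		ext->state = state;
		return true;
	}
	if (lblk == ext->lblk + ext->len && state == ext->state) {
		ext->len++;		/* contiguous and same state: merge */
		return true;
	}
	return false;			/* caller must map/submit, then restart */
}

int main(void)
{
	struct demo_extent ext = { 0, 0, 0 };
	unsigned long blocks[] = { 100, 101, 102, 200 };	/* 200 is not contiguous */

	for (int i = 0; i < 4; i++)
		printf("block %lu: %s (extent now %u blocks)\n", blocks[i],
		       demo_add_block(&ext, blocks[i], 1) ? "added" : "rejected",
		       ext.len);
	return 0;
}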
@@ -2318,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle, | |||
2318 | if (page->index > end) | 2295 | if (page->index > end) |
2319 | goto out; | 2296 | goto out; |
2320 | 2297 | ||
2321 | *done_index = page->index + 1; | 2298 | /* If we can't merge this page, we are done. */ |
2322 | 2299 | if (mpd->map.m_len > 0 && mpd->next_page != page->index) | |
2323 | /* | 2300 | goto out; |
2324 | * If we can't merge this page, and we have | ||
2325 | * accumulated an contiguous region, write it | ||
2326 | */ | ||
2327 | if ((mpd->next_page != page->index) && | ||
2328 | (mpd->next_page != mpd->first_page)) { | ||
2329 | mpage_da_map_and_submit(mpd); | ||
2330 | goto ret_extent_tail; | ||
2331 | } | ||
2332 | 2301 | ||
2333 | lock_page(page); | 2302 | lock_page(page); |
2334 | |||
2335 | /* | 2303 | /* |
2336 | * If the page is no longer dirty, or its | 2304 | * If the page is no longer dirty, or its mapping no |
2337 | * mapping no longer corresponds to inode we | 2305 | * longer corresponds to inode we are writing (which |
2338 | * are writing (which means it has been | 2306 | * means it has been truncated or invalidated), or the |
2339 | * truncated or invalidated), or the page is | 2307 | * page is already under writeback and we are not doing |
2340 | * already under writeback and we are not | 2308 | * a data integrity writeback, skip the page |
2341 | * doing a data integrity writeback, skip the page | ||
2342 | */ | 2309 | */ |
2343 | if (!PageDirty(page) || | 2310 | if (!PageDirty(page) || |
2344 | (PageWriteback(page) && | 2311 | (PageWriteback(page) && |
2345 | (wbc->sync_mode == WB_SYNC_NONE)) || | 2312 | (mpd->wbc->sync_mode == WB_SYNC_NONE)) || |
2346 | unlikely(page->mapping != mapping)) { | 2313 | unlikely(page->mapping != mapping)) { |
2347 | unlock_page(page); | 2314 | unlock_page(page); |
2348 | continue; | 2315 | continue; |
@@ -2351,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle, | |||
2351 | wait_on_page_writeback(page); | 2318 | wait_on_page_writeback(page); |
2352 | BUG_ON(PageWriteback(page)); | 2319 | BUG_ON(PageWriteback(page)); |
2353 | 2320 | ||
2354 | /* | 2321 | if (mpd->map.m_len == 0) |
2355 | * If we have inline data and arrive here, it means that | ||
2356 | * we will soon create the block for the 1st page, so | ||
2357 | * we'd better clear the inline data here. | ||
2358 | */ | ||
2359 | if (ext4_has_inline_data(inode)) { | ||
2360 | BUG_ON(ext4_test_inode_state(inode, | ||
2361 | EXT4_STATE_MAY_INLINE_DATA)); | ||
2362 | ext4_destroy_inline_data(handle, inode); | ||
2363 | } | ||
2364 | |||
2365 | if (mpd->next_page != page->index) | ||
2366 | mpd->first_page = page->index; | 2322 | mpd->first_page = page->index; |
2367 | mpd->next_page = page->index + 1; | 2323 | mpd->next_page = page->index + 1; |
2368 | logical = (sector_t) page->index << | ||
2369 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
2370 | |||
2371 | /* Add all dirty buffers to mpd */ | 2324 | /* Add all dirty buffers to mpd */ |
2325 | lblk = ((ext4_lblk_t)page->index) << | ||
2326 | (PAGE_CACHE_SHIFT - blkbits); | ||
2372 | head = page_buffers(page); | 2327 | head = page_buffers(page); |
2373 | bh = head; | 2328 | if (!add_page_bufs_to_extent(mpd, head, head, lblk)) |
2374 | do { | 2329 | goto out; |
2375 | BUG_ON(buffer_locked(bh)); | 2330 | /* So far everything mapped? Submit the page for IO. */ |
2376 | /* | 2331 | if (mpd->map.m_len == 0) { |
2377 | * We need to try to allocate unmapped blocks | 2332 | err = mpage_submit_page(mpd, page); |
2378 | * in the same page. Otherwise we won't make | 2333 | if (err < 0) |
2379 | * progress with the page in ext4_writepage | ||
2380 | */ | ||
2381 | if (ext4_bh_delay_or_unwritten(NULL, bh)) { | ||
2382 | mpage_add_bh_to_extent(mpd, logical, | ||
2383 | bh->b_state); | ||
2384 | if (mpd->io_done) | ||
2385 | goto ret_extent_tail; | ||
2386 | } else if (buffer_dirty(bh) && | ||
2387 | buffer_mapped(bh)) { | ||
2388 | /* | ||
2389 | * mapped dirty buffer. We need to | ||
2390 | * update the b_state because we look | ||
2391 | * at b_state in mpage_da_map_blocks. | ||
2392 | * We don't update b_size because if we | ||
2393 | * find an unmapped buffer_head later | ||
2394 | * we need to use the b_state flag of | ||
2395 | * that buffer_head. | ||
2396 | */ | ||
2397 | if (mpd->b_size == 0) | ||
2398 | mpd->b_state = | ||
2399 | bh->b_state & BH_FLAGS; | ||
2400 | } | ||
2401 | logical++; | ||
2402 | } while ((bh = bh->b_this_page) != head); | ||
2403 | |||
2404 | if (nr_to_write > 0) { | ||
2405 | nr_to_write--; | ||
2406 | if (nr_to_write == 0 && | ||
2407 | wbc->sync_mode == WB_SYNC_NONE) | ||
2408 | /* | ||
2409 | * We stop writing back only if we are | ||
2410 | * not doing integrity sync. In case of | ||
2411 | * integrity sync we have to keep going | ||
2412 | * because someone may be concurrently | ||
2413 | * dirtying pages, and we might have | ||
2414 | * synced a lot of newly appeared dirty | ||
2415 | * pages, but have not synced all of the | ||
2416 | * old dirty pages. | ||
2417 | */ | ||
2418 | goto out; | 2334 | goto out; |
2419 | } | 2335 | } |
2336 | |||
2337 | /* | ||
2338 | * Accumulated enough dirty pages? This doesn't apply | ||
2339 | * to WB_SYNC_ALL mode. For integrity sync we have to | ||
2340 | * keep going because someone may be concurrently | ||
2341 | * dirtying pages, and we might have synced a lot of | ||
2342 | * newly appeared dirty pages, but have not synced all | ||
2343 | * of the old dirty pages. | ||
2344 | */ | ||
2345 | if (mpd->wbc->sync_mode == WB_SYNC_NONE && | ||
2346 | mpd->next_page - mpd->first_page >= | ||
2347 | mpd->wbc->nr_to_write) | ||
2348 | goto out; | ||
2420 | } | 2349 | } |
2421 | pagevec_release(&pvec); | 2350 | pagevec_release(&pvec); |
2422 | cond_resched(); | 2351 | cond_resched(); |
2423 | } | 2352 | } |
2424 | return 0; | 2353 | return 0; |
2425 | ret_extent_tail: | ||
2426 | ret = MPAGE_DA_EXTENT_TAIL; | ||
2427 | out: | 2354 | out: |
2428 | pagevec_release(&pvec); | 2355 | pagevec_release(&pvec); |
2429 | cond_resched(); | 2356 | return err; |
2430 | return ret; | ||
2431 | } | 2357 | } |
2432 | 2358 | ||
2359 | static int __writepage(struct page *page, struct writeback_control *wbc, | ||
2360 | void *data) | ||
2361 | { | ||
2362 | struct address_space *mapping = data; | ||
2363 | int ret = ext4_writepage(page, wbc); | ||
2364 | mapping_set_error(mapping, ret); | ||
2365 | return ret; | ||
2366 | } | ||
2433 | 2367 | ||
2434 | static int ext4_da_writepages(struct address_space *mapping, | 2368 | static int ext4_writepages(struct address_space *mapping, |
2435 | struct writeback_control *wbc) | 2369 | struct writeback_control *wbc) |
2436 | { | 2370 | { |
2437 | pgoff_t index; | 2371 | pgoff_t writeback_index = 0; |
2372 | long nr_to_write = wbc->nr_to_write; | ||
2438 | int range_whole = 0; | 2373 | int range_whole = 0; |
2374 | int cycled = 1; | ||
2439 | handle_t *handle = NULL; | 2375 | handle_t *handle = NULL; |
2440 | struct mpage_da_data mpd; | 2376 | struct mpage_da_data mpd; |
2441 | struct inode *inode = mapping->host; | 2377 | struct inode *inode = mapping->host; |
2442 | int pages_written = 0; | 2378 | int needed_blocks, rsv_blocks = 0, ret = 0; |
2443 | unsigned int max_pages; | ||
2444 | int range_cyclic, cycled = 1, io_done = 0; | ||
2445 | int needed_blocks, ret = 0; | ||
2446 | long desired_nr_to_write, nr_to_writebump = 0; | ||
2447 | loff_t range_start = wbc->range_start; | ||
2448 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2379 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
2449 | pgoff_t done_index = 0; | 2380 | bool done; |
2450 | pgoff_t end; | ||
2451 | struct blk_plug plug; | 2381 | struct blk_plug plug; |
2382 | bool give_up_on_write = false; | ||
2452 | 2383 | ||
2453 | trace_ext4_da_writepages(inode, wbc); | 2384 | trace_ext4_writepages(inode, wbc); |
2454 | 2385 | ||
2455 | /* | 2386 | /* |
2456 | * No pages to write? This is mainly a kludge to avoid starting | 2387 | * No pages to write? This is mainly a kludge to avoid starting |
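The transaction-credit sizing the writeback loop below relies on comes from ext4_da_writepages_trans_blocks() earlier in this patch: one writepages iteration reserves credits for mapping up to MAX_WRITEPAGES_EXTENT_LEN + bpp - 1 data blocks spread over at most bpp extents, where bpp is the number of blocks per page. The short program below only works that arithmetic for a few block sizes, assuming a 4K page; it is a worked example, not part of the patch.

#include <stdio.h>

#define MAX_WRITEPAGES_EXTENT_LEN 2048	/* same cap as in the patch */

int main(void)
{
	unsigned int page_size = 4096;			/* assumed page size */
	unsigned int block_sizes[] = { 1024, 2048, 4096 };

	for (int i = 0; i < 3; i++) {
		unsigned int bpp = page_size / block_sizes[i];	/* blocks per page */
		unsigned int lblocks = MAX_WRITEPAGES_EXTENT_LEN + bpp - 1;

		printf("blocksize %u: credits cover %u blocks in up to %u extents\n",
		       block_sizes[i], lblocks, bpp);
	}
	return 0;
}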
@@ -2460,164 +2391,165 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2460 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | 2391 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
2461 | return 0; | 2392 | return 0; |
2462 | 2393 | ||
2394 | if (ext4_should_journal_data(inode)) { | ||
2395 | struct blk_plug plug; | ||
2396 | int ret; | ||
2397 | |||
2398 | blk_start_plug(&plug); | ||
2399 | ret = write_cache_pages(mapping, wbc, __writepage, mapping); | ||
2400 | blk_finish_plug(&plug); | ||
2401 | return ret; | ||
2402 | } | ||
2403 | |||
2463 | /* | 2404 | /* |
2464 | * If the filesystem has aborted, it is read-only, so return | 2405 | * If the filesystem has aborted, it is read-only, so return |
2465 | * right away instead of dumping stack traces later on that | 2406 | * right away instead of dumping stack traces later on that |
2466 | * will obscure the real source of the problem. We test | 2407 | * will obscure the real source of the problem. We test |
2467 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because | 2408 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because |
2468 | * the latter could be true if the filesystem is mounted | 2409 | * the latter could be true if the filesystem is mounted |
2469 | * read-only, and in that case, ext4_da_writepages should | 2410 | * read-only, and in that case, ext4_writepages should |
2470 | * *never* be called, so if that ever happens, we would want | 2411 | * *never* be called, so if that ever happens, we would want |
2471 | * the stack trace. | 2412 | * the stack trace. |
2472 | */ | 2413 | */ |
2473 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) | 2414 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) |
2474 | return -EROFS; | 2415 | return -EROFS; |
2475 | 2416 | ||
2417 | if (ext4_should_dioread_nolock(inode)) { | ||
2418 | /* | ||
2419 | * We may need to convert upto one extent per block in | ||
2420 | * the page and we may dirty the inode. | ||
2421 | */ | ||
2422 | rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); | ||
2423 | } | ||
2424 | |||
2425 | /* | ||
2426 | * If we have inline data and arrive here, it means that | ||
2427 | * we will soon create the block for the 1st page, so | ||
2428 | * we'd better clear the inline data here. | ||
2429 | */ | ||
2430 | if (ext4_has_inline_data(inode)) { | ||
2431 | /* Just inode will be modified... */ | ||
2432 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); | ||
2433 | if (IS_ERR(handle)) { | ||
2434 | ret = PTR_ERR(handle); | ||
2435 | goto out_writepages; | ||
2436 | } | ||
2437 | BUG_ON(ext4_test_inode_state(inode, | ||
2438 | EXT4_STATE_MAY_INLINE_DATA)); | ||
2439 | ext4_destroy_inline_data(handle, inode); | ||
2440 | ext4_journal_stop(handle); | ||
2441 | } | ||
2442 | |||
2476 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 2443 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
2477 | range_whole = 1; | 2444 | range_whole = 1; |
2478 | 2445 | ||
2479 | range_cyclic = wbc->range_cyclic; | ||
2480 | if (wbc->range_cyclic) { | 2446 | if (wbc->range_cyclic) { |
2481 | index = mapping->writeback_index; | 2447 | writeback_index = mapping->writeback_index; |
2482 | if (index) | 2448 | if (writeback_index) |
2483 | cycled = 0; | 2449 | cycled = 0; |
2484 | wbc->range_start = index << PAGE_CACHE_SHIFT; | 2450 | mpd.first_page = writeback_index; |
2485 | wbc->range_end = LLONG_MAX; | 2451 | mpd.last_page = -1; |
2486 | wbc->range_cyclic = 0; | ||
2487 | end = -1; | ||
2488 | } else { | 2452 | } else { |
2489 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 2453 | mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; |
2490 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 2454 | mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; |
2491 | } | ||
2492 | |||
2493 | /* | ||
2494 | * This works around two forms of stupidity. The first is in | ||
2495 | * the writeback code, which caps the maximum number of pages | ||
2496 | * written to be 1024 pages. This is wrong on multiple | ||
2497 | * levels; different architectures have a different page size, | ||
2498 | * which changes the maximum amount of data which gets | ||
2499 | * written. Secondly, 4 megabytes is way too small. XFS | ||
2500 | * forces this value to be 16 megabytes by multiplying | ||
2501 | * nr_to_write parameter by four, and then relies on its | ||
2502 | * allocator to allocate larger extents to make them | ||
2503 | * contiguous. Unfortunately this brings us to the second | ||
2504 | * stupidity, which is that ext4's mballoc code only allocates | ||
2505 | * at most 2048 blocks. So we force contiguous writes up to | ||
2506 | * the number of dirty blocks in the inode, or | ||
2507 | * sbi->max_writeback_mb_bump whichever is smaller. | ||
2508 | */ | ||
2509 | max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); | ||
2510 | if (!range_cyclic && range_whole) { | ||
2511 | if (wbc->nr_to_write == LONG_MAX) | ||
2512 | desired_nr_to_write = wbc->nr_to_write; | ||
2513 | else | ||
2514 | desired_nr_to_write = wbc->nr_to_write * 8; | ||
2515 | } else | ||
2516 | desired_nr_to_write = ext4_num_dirty_pages(inode, index, | ||
2517 | max_pages); | ||
2518 | if (desired_nr_to_write > max_pages) | ||
2519 | desired_nr_to_write = max_pages; | ||
2520 | |||
2521 | if (wbc->nr_to_write < desired_nr_to_write) { | ||
2522 | nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; | ||
2523 | wbc->nr_to_write = desired_nr_to_write; | ||
2524 | } | 2455 | } |
2525 | 2456 | ||
2457 | mpd.inode = inode; | ||
2458 | mpd.wbc = wbc; | ||
2459 | ext4_io_submit_init(&mpd.io_submit, wbc); | ||
2526 | retry: | 2460 | retry: |
2527 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2461 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2528 | tag_pages_for_writeback(mapping, index, end); | 2462 | tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); |
2529 | 2463 | done = false; | |
2530 | blk_start_plug(&plug); | 2464 | blk_start_plug(&plug); |
2531 | while (!ret && wbc->nr_to_write > 0) { | 2465 | while (!done && mpd.first_page <= mpd.last_page) { |
2466 | /* For each extent of pages we use new io_end */ | ||
2467 | mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); | ||
2468 | if (!mpd.io_submit.io_end) { | ||
2469 | ret = -ENOMEM; | ||
2470 | break; | ||
2471 | } | ||
2532 | 2472 | ||
2533 | /* | 2473 | /* |
2534 | * we insert one extent at a time. So we need | 2474 | * We have two constraints: We find one extent to map and we |
2535 | * credit needed for single extent allocation. | 2475 | * must always write out the whole page (makes a difference when |
2536 | * journalled mode is currently not supported | 2476 | * blocksize < pagesize) so that we don't block on IO when we |
2537 | * by delalloc | 2477 | * try to write out the rest of the page. Journalled mode is |
2478 | * not supported by delalloc. | ||
2538 | */ | 2479 | */ |
2539 | BUG_ON(ext4_should_journal_data(inode)); | 2480 | BUG_ON(ext4_should_journal_data(inode)); |
2540 | needed_blocks = ext4_da_writepages_trans_blocks(inode); | 2481 | needed_blocks = ext4_da_writepages_trans_blocks(inode); |
2541 | 2482 | ||
2542 | /* start a new transaction*/ | 2483 | /* start a new transaction */ |
2543 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, | 2484 | handle = ext4_journal_start_with_reserve(inode, |
2544 | needed_blocks); | 2485 | EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); |
2545 | if (IS_ERR(handle)) { | 2486 | if (IS_ERR(handle)) { |
2546 | ret = PTR_ERR(handle); | 2487 | ret = PTR_ERR(handle); |
2547 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " | 2488 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " |
2548 | "%ld pages, ino %lu; err %d", __func__, | 2489 | "%ld pages, ino %lu; err %d", __func__, |
2549 | wbc->nr_to_write, inode->i_ino, ret); | 2490 | wbc->nr_to_write, inode->i_ino, ret); |
2550 | blk_finish_plug(&plug); | 2491 | /* Release allocated io_end */ |
2551 | goto out_writepages; | 2492 | ext4_put_io_end(mpd.io_submit.io_end); |
2493 | break; | ||
2552 | } | 2494 | } |
2553 | 2495 | ||
2554 | /* | 2496 | trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); |
2555 | * Now call write_cache_pages_da() to find the next | 2497 | ret = mpage_prepare_extent_to_map(&mpd); |
2556 | * contiguous region of logical blocks that need | 2498 | if (!ret) { |
2557 | * blocks to be allocated by ext4 and submit them. | 2499 | if (mpd.map.m_len) |
2558 | */ | 2500 | ret = mpage_map_and_submit_extent(handle, &mpd, |
2559 | ret = write_cache_pages_da(handle, mapping, | 2501 | &give_up_on_write); |
2560 | wbc, &mpd, &done_index); | 2502 | else { |
2561 | /* | 2503 | /* |
2562 | * If we have a contiguous extent of pages and we | 2504 | * We scanned the whole range (or exhausted |
2563 | * haven't done the I/O yet, map the blocks and submit | 2505 | * nr_to_write), submitted what was mapped and |
2564 | * them for I/O. | 2506 | * didn't find anything needing mapping. We are |
2565 | */ | 2507 | * done. |
2566 | if (!mpd.io_done && mpd.next_page != mpd.first_page) { | 2508 | */ |
2567 | mpage_da_map_and_submit(&mpd); | 2509 | done = true; |
2568 | ret = MPAGE_DA_EXTENT_TAIL; | 2510 | } |
2569 | } | 2511 | } |
2570 | trace_ext4_da_write_pages(inode, &mpd); | ||
2571 | wbc->nr_to_write -= mpd.pages_written; | ||
2572 | |||
2573 | ext4_journal_stop(handle); | 2512 | ext4_journal_stop(handle); |
2574 | 2513 | /* Submit prepared bio */ | |
2575 | if ((mpd.retval == -ENOSPC) && sbi->s_journal) { | 2514 | ext4_io_submit(&mpd.io_submit); |
2576 | /* commit the transaction which would | 2515 | /* Unlock pages we didn't use */ |
2516 | mpage_release_unused_pages(&mpd, give_up_on_write); | ||
2517 | /* Drop our io_end reference we got from init */ | ||
2518 | ext4_put_io_end(mpd.io_submit.io_end); | ||
2519 | |||
2520 | if (ret == -ENOSPC && sbi->s_journal) { | ||
2521 | /* | ||
2522 | * Commit the transaction which would | ||
2577 | * free blocks released in the transaction | 2523 | * free blocks released in the transaction |
2578 | * and try again | 2524 | * and try again |
2579 | */ | 2525 | */ |
2580 | jbd2_journal_force_commit_nested(sbi->s_journal); | 2526 | jbd2_journal_force_commit_nested(sbi->s_journal); |
2581 | ret = 0; | 2527 | ret = 0; |
2582 | } else if (ret == MPAGE_DA_EXTENT_TAIL) { | 2528 | continue; |
2583 | /* | 2529 | } |
2584 | * Got one extent now try with rest of the pages. | 2530 | /* Fatal error - ENOMEM, EIO... */ |
2585 | * If mpd.retval is set -EIO, journal is aborted. | 2531 | if (ret) |
2586 | * So we don't need to write any more. | ||
2587 | */ | ||
2588 | pages_written += mpd.pages_written; | ||
2589 | ret = mpd.retval; | ||
2590 | io_done = 1; | ||
2591 | } else if (wbc->nr_to_write) | ||
2592 | /* | ||
2593 | * There is no more writeout needed | ||
2594 | * or we requested for a noblocking writeout | ||
2595 | * and we found the device congested | ||
2596 | */ | ||
2597 | break; | 2532 | break; |
2598 | } | 2533 | } |
2599 | blk_finish_plug(&plug); | 2534 | blk_finish_plug(&plug); |
2600 | if (!io_done && !cycled) { | 2535 | if (!ret && !cycled) { |
2601 | cycled = 1; | 2536 | cycled = 1; |
2602 | index = 0; | 2537 | mpd.last_page = writeback_index - 1; |
2603 | wbc->range_start = index << PAGE_CACHE_SHIFT; | 2538 | mpd.first_page = 0; |
2604 | wbc->range_end = mapping->writeback_index - 1; | ||
2605 | goto retry; | 2539 | goto retry; |
2606 | } | 2540 | } |
2607 | 2541 | ||
2608 | /* Update index */ | 2542 | /* Update index */ |
2609 | wbc->range_cyclic = range_cyclic; | ||
2610 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 2543 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
2611 | /* | 2544 | /* |
2612 | * set the writeback_index so that range_cyclic | 2545 | * Set the writeback_index so that range_cyclic |
2613 | * mode will write it back later | 2546 | * mode will write it back later |
2614 | */ | 2547 | */ |
2615 | mapping->writeback_index = done_index; | 2548 | mapping->writeback_index = mpd.first_page; |
2616 | 2549 | ||
2617 | out_writepages: | 2550 | out_writepages: |
2618 | wbc->nr_to_write -= nr_to_writebump; | 2551 | trace_ext4_writepages_result(inode, wbc, ret, |
2619 | wbc->range_start = range_start; | 2552 | nr_to_write - wbc->nr_to_write); |
2620 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); | ||
2621 | return ret; | 2553 | return ret; |
2622 | } | 2554 | } |
2623 | 2555 | ||
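The range_cyclic handling above follows the usual two-pass pattern: writeback starts at mapping->writeback_index and runs to the end of the file, and if that pass finished without error before wrapping around, a second pass covers pages 0 through writeback_index - 1. A minimal userspace sketch of that control flow, assuming a hypothetical write_range() in place of the per-extent map-and-submit loop:

/* Sketch only: write_range() is an illustrative stand-in, not an ext4 API. */
#include <stdbool.h>
#include <stdio.h>

static int write_range(unsigned long first, unsigned long last)
{
    printf("writing pages %lu..%lu\n", first, last);
    return 0;                           /* 0 means success, like ret above */
}

static int writepages_cyclic(unsigned long writeback_index)
{
    unsigned long first = writeback_index;
    unsigned long last = (unsigned long)-1;     /* "until end of file" */
    bool cycled = (writeback_index == 0);
    int ret;

retry:
    ret = write_range(first, last);
    if (!ret && !cycled) {
        cycled = true;                  /* second pass covers the wrapped part */
        last = writeback_index - 1;
        first = 0;
        goto retry;
    }
    return ret;
}

int main(void)
{
    return writepages_cyclic(42);
}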
@@ -2829,7 +2761,8 @@ static int ext4_da_write_end(struct file *file, | |||
2829 | return ret ? ret : copied; | 2761 | return ret ? ret : copied; |
2830 | } | 2762 | } |
2831 | 2763 | ||
2832 | static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | 2764 | static void ext4_da_invalidatepage(struct page *page, unsigned int offset, |
2765 | unsigned int length) | ||
2833 | { | 2766 | { |
2834 | /* | 2767 | /* |
2835 | * Drop reserved blocks | 2768 | * Drop reserved blocks |
@@ -2838,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | |||
2838 | if (!page_has_buffers(page)) | 2771 | if (!page_has_buffers(page)) |
2839 | goto out; | 2772 | goto out; |
2840 | 2773 | ||
2841 | ext4_da_page_release_reservation(page, offset); | 2774 | ext4_da_page_release_reservation(page, offset, length); |
2842 | 2775 | ||
2843 | out: | 2776 | out: |
2844 | ext4_invalidatepage(page, offset); | 2777 | ext4_invalidatepage(page, offset, length); |
2845 | 2778 | ||
2846 | return; | 2779 | return; |
2847 | } | 2780 | } |
@@ -2864,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode) | |||
2864 | * laptop_mode, not even desirable). However, to do otherwise | 2797 | * laptop_mode, not even desirable). However, to do otherwise |
2865 | * would require replicating code paths in: | 2798 | * would require replicating code paths in: |
2866 | * | 2799 | * |
2867 | * ext4_da_writepages() -> | 2800 | * ext4_writepages() -> |
2868 | * write_cache_pages() ---> (via passed in callback function) | 2801 | * write_cache_pages() ---> (via passed in callback function) |
2869 | * __mpage_da_writepage() --> | 2802 | * __mpage_da_writepage() --> |
2870 | * mpage_add_bh_to_extent() | 2803 | * mpage_add_bh_to_extent() |
@@ -2989,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping, | |||
2989 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); | 2922 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); |
2990 | } | 2923 | } |
2991 | 2924 | ||
2992 | static void ext4_invalidatepage(struct page *page, unsigned long offset) | 2925 | static void ext4_invalidatepage(struct page *page, unsigned int offset, |
2926 | unsigned int length) | ||
2993 | { | 2927 | { |
2994 | trace_ext4_invalidatepage(page, offset); | 2928 | trace_ext4_invalidatepage(page, offset, length); |
2995 | 2929 | ||
2996 | /* No journalling happens on data buffers when this function is used */ | 2930 | /* No journalling happens on data buffers when this function is used */ |
2997 | WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); | 2931 | WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); |
2998 | 2932 | ||
2999 | block_invalidatepage(page, offset); | 2933 | block_invalidatepage(page, offset, length); |
3000 | } | 2934 | } |
3001 | 2935 | ||
3002 | static int __ext4_journalled_invalidatepage(struct page *page, | 2936 | static int __ext4_journalled_invalidatepage(struct page *page, |
3003 | unsigned long offset) | 2937 | unsigned int offset, |
2938 | unsigned int length) | ||
3004 | { | 2939 | { |
3005 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 2940 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
3006 | 2941 | ||
3007 | trace_ext4_journalled_invalidatepage(page, offset); | 2942 | trace_ext4_journalled_invalidatepage(page, offset, length); |
3008 | 2943 | ||
3009 | /* | 2944 | /* |
3010 | * If it's a full truncate we just forget about the pending dirtying | 2945 | * If it's a full truncate we just forget about the pending dirtying |
3011 | */ | 2946 | */ |
3012 | if (offset == 0) | 2947 | if (offset == 0 && length == PAGE_CACHE_SIZE) |
3013 | ClearPageChecked(page); | 2948 | ClearPageChecked(page); |
3014 | 2949 | ||
3015 | return jbd2_journal_invalidatepage(journal, page, offset); | 2950 | return jbd2_journal_invalidatepage(journal, page, offset, length); |
3016 | } | 2951 | } |
3017 | 2952 | ||
3018 | /* Wrapper for aops... */ | 2953 | /* Wrapper for aops... */ |
3019 | static void ext4_journalled_invalidatepage(struct page *page, | 2954 | static void ext4_journalled_invalidatepage(struct page *page, |
3020 | unsigned long offset) | 2955 | unsigned int offset, |
2956 | unsigned int length) | ||
3021 | { | 2957 | { |
3022 | WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); | 2958 | WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); |
3023 | } | 2959 | } |
3024 | 2960 | ||
3025 | static int ext4_releasepage(struct page *page, gfp_t wait) | 2961 | static int ext4_releasepage(struct page *page, gfp_t wait) |
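As a side note on the interface change visible above: ->invalidatepage() now takes an (offset, length) pair so callers can invalidate only part of a page, and the journalled variant forgets pending dirtying only when the entire page goes away. A minimal sketch of that test, assuming an illustrative PAGE_SIZE_ASSUMED constant in place of PAGE_CACHE_SIZE:

/* Sketch of the full-page-invalidation check; PAGE_SIZE_ASSUMED is an
 * assumption for illustration, not the kernel macro. */
#include <stdbool.h>

#define PAGE_SIZE_ASSUMED 4096u

static bool invalidates_whole_page(unsigned int offset, unsigned int length)
{
    /* Only a full truncate of the page may drop the pending-dirty state. */
    return offset == 0 && length == PAGE_SIZE_ASSUMED;
}

int main(void)
{
    return invalidates_whole_page(0, PAGE_SIZE_ASSUMED) ? 0 : 1;
}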
@@ -3067,9 +3003,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3067 | struct inode *inode = file_inode(iocb->ki_filp); | 3003 | struct inode *inode = file_inode(iocb->ki_filp); |
3068 | ext4_io_end_t *io_end = iocb->private; | 3004 | ext4_io_end_t *io_end = iocb->private; |
3069 | 3005 | ||
3070 | /* if not async direct IO or dio with 0 bytes write, just return */ | 3006 | /* if not async direct IO just return */ |
3071 | if (!io_end || !size) | 3007 | if (!io_end) { |
3072 | goto out; | 3008 | inode_dio_done(inode); |
3009 | if (is_async) | ||
3010 | aio_complete(iocb, ret, 0); | ||
3011 | return; | ||
3012 | } | ||
3073 | 3013 | ||
3074 | ext_debug("ext4_end_io_dio(): io_end 0x%p " | 3014 | ext_debug("ext4_end_io_dio(): io_end 0x%p " |
3075 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", | 3015 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", |
@@ -3077,25 +3017,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3077 | size); | 3017 | size); |
3078 | 3018 | ||
3079 | iocb->private = NULL; | 3019 | iocb->private = NULL; |
3080 | |||
3081 | /* if not aio dio with unwritten extents, just free io and return */ | ||
3082 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
3083 | ext4_free_io_end(io_end); | ||
3084 | out: | ||
3085 | inode_dio_done(inode); | ||
3086 | if (is_async) | ||
3087 | aio_complete(iocb, ret, 0); | ||
3088 | return; | ||
3089 | } | ||
3090 | |||
3091 | io_end->offset = offset; | 3020 | io_end->offset = offset; |
3092 | io_end->size = size; | 3021 | io_end->size = size; |
3093 | if (is_async) { | 3022 | if (is_async) { |
3094 | io_end->iocb = iocb; | 3023 | io_end->iocb = iocb; |
3095 | io_end->result = ret; | 3024 | io_end->result = ret; |
3096 | } | 3025 | } |
3097 | 3026 | ext4_put_io_end_defer(io_end); | |
3098 | ext4_add_complete_io(io_end); | ||
3099 | } | 3027 | } |
3100 | 3028 | ||
3101 | /* | 3029 | /* |
@@ -3129,6 +3057,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3129 | get_block_t *get_block_func = NULL; | 3057 | get_block_t *get_block_func = NULL; |
3130 | int dio_flags = 0; | 3058 | int dio_flags = 0; |
3131 | loff_t final_size = offset + count; | 3059 | loff_t final_size = offset + count; |
3060 | ext4_io_end_t *io_end = NULL; | ||
3132 | 3061 | ||
3133 | /* Use the old path for reads and writes beyond i_size. */ | 3062 | /* Use the old path for reads and writes beyond i_size. */ |
3134 | if (rw != WRITE || final_size > inode->i_size) | 3063 | if (rw != WRITE || final_size > inode->i_size) |
@@ -3136,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3136 | 3065 | ||
3137 | BUG_ON(iocb->private == NULL); | 3066 | BUG_ON(iocb->private == NULL); |
3138 | 3067 | ||
3068 | /* | ||
3069 | * Make all waiters for direct IO properly wait also for extent | ||
3070 | * conversion. This also prevents a race between truncate() and | ||
3071 | * overwrite DIO, as i_dio_count needs to be incremented under i_mutex. | ||
3072 | */ | ||
3073 | if (rw == WRITE) | ||
3074 | atomic_inc(&inode->i_dio_count); | ||
3075 | |||
3139 | /* If we do an overwrite dio, i_mutex locking can be released */ | 3076 | /* If we do an overwrite dio, i_mutex locking can be released */ |
3140 | overwrite = *((int *)iocb->private); | 3077 | overwrite = *((int *)iocb->private); |
3141 | 3078 | ||
3142 | if (overwrite) { | 3079 | if (overwrite) { |
3143 | atomic_inc(&inode->i_dio_count); | ||
3144 | down_read(&EXT4_I(inode)->i_data_sem); | 3080 | down_read(&EXT4_I(inode)->i_data_sem); |
3145 | mutex_unlock(&inode->i_mutex); | 3081 | mutex_unlock(&inode->i_mutex); |
3146 | } | 3082 | } |
@@ -3167,13 +3103,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3167 | iocb->private = NULL; | 3103 | iocb->private = NULL; |
3168 | ext4_inode_aio_set(inode, NULL); | 3104 | ext4_inode_aio_set(inode, NULL); |
3169 | if (!is_sync_kiocb(iocb)) { | 3105 | if (!is_sync_kiocb(iocb)) { |
3170 | ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); | 3106 | io_end = ext4_init_io_end(inode, GFP_NOFS); |
3171 | if (!io_end) { | 3107 | if (!io_end) { |
3172 | ret = -ENOMEM; | 3108 | ret = -ENOMEM; |
3173 | goto retake_lock; | 3109 | goto retake_lock; |
3174 | } | 3110 | } |
3175 | io_end->flag |= EXT4_IO_END_DIRECT; | 3111 | io_end->flag |= EXT4_IO_END_DIRECT; |
3176 | iocb->private = io_end; | 3112 | /* |
3113 | * Grab reference for DIO. Will be dropped in ext4_end_io_dio() | ||
3114 | */ | ||
3115 | iocb->private = ext4_get_io_end(io_end); | ||
3177 | /* | 3116 | /* |
3178 | * we save the io structure for current async direct | 3117 | * we save the io structure for current async direct |
3179 | * IO, so that later ext4_map_blocks() could flag the | 3118 | * IO, so that later ext4_map_blocks() could flag the |
@@ -3197,33 +3136,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3197 | NULL, | 3136 | NULL, |
3198 | dio_flags); | 3137 | dio_flags); |
3199 | 3138 | ||
3200 | if (iocb->private) | ||
3201 | ext4_inode_aio_set(inode, NULL); | ||
3202 | /* | 3139 | /* |
3203 | * The io_end structure takes a reference to the inode, that | 3140 | * Put our reference to io_end. This can free the io_end structure e.g. |
3204 | * structure needs to be destroyed and the reference to the | 3141 | * in sync IO case or in case of error. It can even perform extent |
3205 | * inode need to be dropped, when IO is complete, even with 0 | 3142 | * conversion if all bios we submitted finished before we got here. |
3206 | * byte write, or failed. | 3143 | * Note that in that case iocb->private can be already set to NULL |
3207 | * | 3144 | * here. |
3208 | * In the successful AIO DIO case, the io_end structure will | ||
3209 | * be destroyed and the reference to the inode will be dropped | ||
3210 | * after the end_io call back function is called. | ||
3211 | * | ||
3212 | * In the case there is 0 byte write, or error case, since VFS | ||
3213 | * direct IO won't invoke the end_io call back function, we | ||
3214 | * need to free the end_io structure here. | ||
3215 | */ | 3145 | */ |
3216 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | 3146 | if (io_end) { |
3217 | ext4_free_io_end(iocb->private); | 3147 | ext4_inode_aio_set(inode, NULL); |
3218 | iocb->private = NULL; | 3148 | ext4_put_io_end(io_end); |
3219 | } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | 3149 | /* |
3150 | * When no IO was submitted, ext4_end_io_dio() was not | ||
3151 | * called so we have to put iocb's reference. | ||
3152 | */ | ||
3153 | if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { | ||
3154 | WARN_ON(iocb->private != io_end); | ||
3155 | WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); | ||
3156 | WARN_ON(io_end->iocb); | ||
3157 | /* | ||
3158 | * Generic code already did inode_dio_done() so we | ||
3159 | * have to clear EXT4_IO_END_DIRECT to not do it for | ||
3160 | * the second time. | ||
3161 | */ | ||
3162 | io_end->flag = 0; | ||
3163 | ext4_put_io_end(io_end); | ||
3164 | iocb->private = NULL; | ||
3165 | } | ||
3166 | } | ||
3167 | if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | ||
3220 | EXT4_STATE_DIO_UNWRITTEN)) { | 3168 | EXT4_STATE_DIO_UNWRITTEN)) { |
3221 | int err; | 3169 | int err; |
3222 | /* | 3170 | /* |
3223 | * for non AIO case, since the IO is already | 3171 | * for non AIO case, since the IO is already |
3224 | * completed, we could do the conversion right here | 3172 | * completed, we could do the conversion right here |
3225 | */ | 3173 | */ |
3226 | err = ext4_convert_unwritten_extents(inode, | 3174 | err = ext4_convert_unwritten_extents(NULL, inode, |
3227 | offset, ret); | 3175 | offset, ret); |
3228 | if (err < 0) | 3176 | if (err < 0) |
3229 | ret = err; | 3177 | ret = err; |
@@ -3231,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3231 | } | 3179 | } |
3232 | 3180 | ||
3233 | retake_lock: | 3181 | retake_lock: |
3182 | if (rw == WRITE) | ||
3183 | inode_dio_done(inode); | ||
3234 | /* take i_mutex locking again if we do an overwrite dio */ | 3184 | /* take i_mutex locking again if we do an overwrite dio */ |
3235 | if (overwrite) { | 3185 | if (overwrite) { |
3236 | inode_dio_done(inode); | ||
3237 | up_read(&EXT4_I(inode)->i_data_sem); | 3186 | up_read(&EXT4_I(inode)->i_data_sem); |
3238 | mutex_lock(&inode->i_mutex); | 3187 | mutex_lock(&inode->i_mutex); |
3239 | } | 3188 | } |
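The direct IO rework above moves the io_end lifetime to reference counting: the submitter holds one reference, iocb->private takes another via ext4_get_io_end(), and whichever ext4_put_io_end() drops the last reference frees the structure (or completes the unwritten-extent conversion). A minimal userspace sketch of that pattern, with illustrative io_end_get()/io_end_put() names rather than the real helpers:

/* Reference-counting sketch; struct io_end here is a toy, not the ext4 type. */
#include <stdlib.h>

struct io_end {
    int refcount;
};

static struct io_end *io_end_alloc(void)
{
    struct io_end *io = calloc(1, sizeof(*io));

    if (io)
        io->refcount = 1;               /* reference held by the submitter */
    return io;
}

static struct io_end *io_end_get(struct io_end *io)
{
    io->refcount++;                     /* e.g. the reference kept in iocb->private */
    return io;
}

static void io_end_put(struct io_end *io)
{
    if (--io->refcount == 0)
        free(io);                       /* last put releases the structure */
}

int main(void)
{
    struct io_end *io = io_end_alloc();

    if (!io)
        return 1;
    io_end_get(io);                     /* DIO stashes a reference in the iocb */
    io_end_put(io);                     /* completion path drops its reference */
    io_end_put(io);                     /* submitter's put frees it */
    return 0;
}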
@@ -3292,6 +3241,7 @@ static const struct address_space_operations ext4_aops = { | |||
3292 | .readpage = ext4_readpage, | 3241 | .readpage = ext4_readpage, |
3293 | .readpages = ext4_readpages, | 3242 | .readpages = ext4_readpages, |
3294 | .writepage = ext4_writepage, | 3243 | .writepage = ext4_writepage, |
3244 | .writepages = ext4_writepages, | ||
3295 | .write_begin = ext4_write_begin, | 3245 | .write_begin = ext4_write_begin, |
3296 | .write_end = ext4_write_end, | 3246 | .write_end = ext4_write_end, |
3297 | .bmap = ext4_bmap, | 3247 | .bmap = ext4_bmap, |
@@ -3307,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
3307 | .readpage = ext4_readpage, | 3257 | .readpage = ext4_readpage, |
3308 | .readpages = ext4_readpages, | 3258 | .readpages = ext4_readpages, |
3309 | .writepage = ext4_writepage, | 3259 | .writepage = ext4_writepage, |
3260 | .writepages = ext4_writepages, | ||
3310 | .write_begin = ext4_write_begin, | 3261 | .write_begin = ext4_write_begin, |
3311 | .write_end = ext4_journalled_write_end, | 3262 | .write_end = ext4_journalled_write_end, |
3312 | .set_page_dirty = ext4_journalled_set_page_dirty, | 3263 | .set_page_dirty = ext4_journalled_set_page_dirty, |
@@ -3322,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = { | |||
3322 | .readpage = ext4_readpage, | 3273 | .readpage = ext4_readpage, |
3323 | .readpages = ext4_readpages, | 3274 | .readpages = ext4_readpages, |
3324 | .writepage = ext4_writepage, | 3275 | .writepage = ext4_writepage, |
3325 | .writepages = ext4_da_writepages, | 3276 | .writepages = ext4_writepages, |
3326 | .write_begin = ext4_da_write_begin, | 3277 | .write_begin = ext4_da_write_begin, |
3327 | .write_end = ext4_da_write_end, | 3278 | .write_end = ext4_da_write_end, |
3328 | .bmap = ext4_bmap, | 3279 | .bmap = ext4_bmap, |
@@ -3355,89 +3306,56 @@ void ext4_set_aops(struct inode *inode) | |||
3355 | inode->i_mapping->a_ops = &ext4_aops; | 3306 | inode->i_mapping->a_ops = &ext4_aops; |
3356 | } | 3307 | } |
3357 | 3308 | ||
3358 | |||
3359 | /* | 3309 | /* |
3360 | * ext4_discard_partial_page_buffers() | 3310 | * ext4_block_truncate_page() zeroes out a mapping from file offset `from' |
3361 | * Wrapper function for ext4_discard_partial_page_buffers_no_lock. | 3311 | * up to the end of the block which corresponds to `from'. |
3362 | * This function finds and locks the page containing the offset | 3312 | * This is required during truncate. We need to physically zero the tail end |
3363 | * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. | 3313 | * of that block so it doesn't yield old data if the file is later grown. |
3364 | * Calling functions that already have the page locked should call | ||
3365 | * ext4_discard_partial_page_buffers_no_lock directly. | ||
3366 | */ | 3314 | */ |
3367 | int ext4_discard_partial_page_buffers(handle_t *handle, | 3315 | int ext4_block_truncate_page(handle_t *handle, |
3368 | struct address_space *mapping, loff_t from, | 3316 | struct address_space *mapping, loff_t from) |
3369 | loff_t length, int flags) | ||
3370 | { | 3317 | { |
3318 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | ||
3319 | unsigned length; | ||
3320 | unsigned blocksize; | ||
3371 | struct inode *inode = mapping->host; | 3321 | struct inode *inode = mapping->host; |
3372 | struct page *page; | ||
3373 | int err = 0; | ||
3374 | 3322 | ||
3375 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, | 3323 | blocksize = inode->i_sb->s_blocksize; |
3376 | mapping_gfp_mask(mapping) & ~__GFP_FS); | 3324 | length = blocksize - (offset & (blocksize - 1)); |
3377 | if (!page) | ||
3378 | return -ENOMEM; | ||
3379 | |||
3380 | err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, | ||
3381 | from, length, flags); | ||
3382 | 3325 | ||
3383 | unlock_page(page); | 3326 | return ext4_block_zero_page_range(handle, mapping, from, length); |
3384 | page_cache_release(page); | ||
3385 | return err; | ||
3386 | } | 3327 | } |
3387 | 3328 | ||
3388 | /* | 3329 | /* |
3389 | * ext4_discard_partial_page_buffers_no_lock() | 3330 | * ext4_block_zero_page_range() zeros out a mapping of length 'length' |
3390 | * Zeros a page range of length 'length' starting from offset 'from'. | 3331 | * starting from file offset 'from'. The range to be zeroed must |
3391 | * Buffer heads that correspond to the block aligned regions of the | 3332 | * be contained within one block. If the specified range exceeds |
3392 | * zeroed range will be unmapped. Unblock aligned regions | 3333 | * the end of the block, it will be shortened to the end of the block |
3393 | * will have the corresponding buffer head mapped if needed so that | 3334 | * that corresponds to 'from'. |
3394 | * that region of the page can be updated with the partial zero out. | ||
3395 | * | ||
3396 | * This function assumes that the page has already been locked. The | ||
3397 | * The range to be discarded must be contained with in the given page. | ||
3398 | * If the specified range exceeds the end of the page it will be shortened | ||
3399 | * to the end of the page that corresponds to 'from'. This function is | ||
3400 | * appropriate for updating a page and it buffer heads to be unmapped and | ||
3401 | * zeroed for blocks that have been either released, or are going to be | ||
3402 | * released. | ||
3403 | * | ||
3404 | * handle: The journal handle | ||
3405 | * inode: The files inode | ||
3406 | * page: A locked page that contains the offset "from" | ||
3407 | * from: The starting byte offset (from the beginning of the file) | ||
3408 | * to begin discarding | ||
3409 | * len: The length of bytes to discard | ||
3410 | * flags: Optional flags that may be used: | ||
3411 | * | ||
3412 | * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED | ||
3413 | * Only zero the regions of the page whose buffer heads | ||
3414 | * have already been unmapped. This flag is appropriate | ||
3415 | * for updating the contents of a page whose blocks may | ||
3416 | * have already been released, and we only want to zero | ||
3417 | * out the regions that correspond to those released blocks. | ||
3418 | * | ||
3419 | * Returns zero on success or negative on failure. | ||
3420 | */ | 3335 | */ |
3421 | static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | 3336 | int ext4_block_zero_page_range(handle_t *handle, |
3422 | struct inode *inode, struct page *page, loff_t from, | 3337 | struct address_space *mapping, loff_t from, loff_t length) |
3423 | loff_t length, int flags) | ||
3424 | { | 3338 | { |
3425 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; | 3339 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; |
3426 | unsigned int offset = from & (PAGE_CACHE_SIZE-1); | 3340 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
3427 | unsigned int blocksize, max, pos; | 3341 | unsigned blocksize, max, pos; |
3428 | ext4_lblk_t iblock; | 3342 | ext4_lblk_t iblock; |
3343 | struct inode *inode = mapping->host; | ||
3429 | struct buffer_head *bh; | 3344 | struct buffer_head *bh; |
3345 | struct page *page; | ||
3430 | int err = 0; | 3346 | int err = 0; |
3431 | 3347 | ||
3432 | blocksize = inode->i_sb->s_blocksize; | 3348 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, |
3433 | max = PAGE_CACHE_SIZE - offset; | 3349 | mapping_gfp_mask(mapping) & ~__GFP_FS); |
3350 | if (!page) | ||
3351 | return -ENOMEM; | ||
3434 | 3352 | ||
3435 | if (index != page->index) | 3353 | blocksize = inode->i_sb->s_blocksize; |
3436 | return -EINVAL; | 3354 | max = blocksize - (offset & (blocksize - 1)); |
3437 | 3355 | ||
3438 | /* | 3356 | /* |
3439 | * correct length if it does not fall between | 3357 | * correct length if it does not fall between |
3440 | * 'from' and the end of the page | 3358 | * 'from' and the end of the block |
3441 | */ | 3359 | */ |
3442 | if (length > max || length < 0) | 3360 | if (length > max || length < 0) |
3443 | length = max; | 3361 | length = max; |
@@ -3455,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | |||
3455 | iblock++; | 3373 | iblock++; |
3456 | pos += blocksize; | 3374 | pos += blocksize; |
3457 | } | 3375 | } |
3458 | 3376 | if (buffer_freed(bh)) { | |
3459 | pos = offset; | 3377 | BUFFER_TRACE(bh, "freed: skip"); |
3460 | while (pos < offset + length) { | 3378 | goto unlock; |
3461 | unsigned int end_of_block, range_to_discard; | 3379 | } |
3462 | 3380 | if (!buffer_mapped(bh)) { | |
3463 | err = 0; | 3381 | BUFFER_TRACE(bh, "unmapped"); |
3464 | 3382 | ext4_get_block(inode, iblock, bh, 0); | |
3465 | /* The length of space left to zero and unmap */ | 3383 | /* unmapped? It's a hole - nothing to do */ |
3466 | range_to_discard = offset + length - pos; | ||
3467 | |||
3468 | /* The length of space until the end of the block */ | ||
3469 | end_of_block = blocksize - (pos & (blocksize-1)); | ||
3470 | |||
3471 | /* | ||
3472 | * Do not unmap or zero past end of block | ||
3473 | * for this buffer head | ||
3474 | */ | ||
3475 | if (range_to_discard > end_of_block) | ||
3476 | range_to_discard = end_of_block; | ||
3477 | |||
3478 | |||
3479 | /* | ||
3480 | * Skip this buffer head if we are only zeroing unmapped | ||
3481 | * regions of the page | ||
3482 | */ | ||
3483 | if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && | ||
3484 | buffer_mapped(bh)) | ||
3485 | goto next; | ||
3486 | |||
3487 | /* If the range is block aligned, unmap */ | ||
3488 | if (range_to_discard == blocksize) { | ||
3489 | clear_buffer_dirty(bh); | ||
3490 | bh->b_bdev = NULL; | ||
3491 | clear_buffer_mapped(bh); | ||
3492 | clear_buffer_req(bh); | ||
3493 | clear_buffer_new(bh); | ||
3494 | clear_buffer_delay(bh); | ||
3495 | clear_buffer_unwritten(bh); | ||
3496 | clear_buffer_uptodate(bh); | ||
3497 | zero_user(page, pos, range_to_discard); | ||
3498 | BUFFER_TRACE(bh, "Buffer discarded"); | ||
3499 | goto next; | ||
3500 | } | ||
3501 | |||
3502 | /* | ||
3503 | * If this block is not completely contained in the range | ||
3504 | * to be discarded, then it is not going to be released. Because | ||
3505 | * we need to keep this block, we need to make sure this part | ||
3506 | * of the page is uptodate before we modify it by writing | ||
3507 | * partial zeros on it. | ||
3508 | */ | ||
3509 | if (!buffer_mapped(bh)) { | 3384 | if (!buffer_mapped(bh)) { |
3510 | /* | 3385 | BUFFER_TRACE(bh, "still unmapped"); |
3511 | * Buffer head must be mapped before we can read | 3386 | goto unlock; |
3512 | * from the block | ||
3513 | */ | ||
3514 | BUFFER_TRACE(bh, "unmapped"); | ||
3515 | ext4_get_block(inode, iblock, bh, 0); | ||
3516 | /* unmapped? It's a hole - nothing to do */ | ||
3517 | if (!buffer_mapped(bh)) { | ||
3518 | BUFFER_TRACE(bh, "still unmapped"); | ||
3519 | goto next; | ||
3520 | } | ||
3521 | } | 3387 | } |
3388 | } | ||
3522 | 3389 | ||
3523 | /* Ok, it's mapped. Make sure it's up-to-date */ | 3390 | /* Ok, it's mapped. Make sure it's up-to-date */ |
3524 | if (PageUptodate(page)) | 3391 | if (PageUptodate(page)) |
3525 | set_buffer_uptodate(bh); | 3392 | set_buffer_uptodate(bh); |
3526 | 3393 | ||
3527 | if (!buffer_uptodate(bh)) { | 3394 | if (!buffer_uptodate(bh)) { |
3528 | err = -EIO; | 3395 | err = -EIO; |
3529 | ll_rw_block(READ, 1, &bh); | 3396 | ll_rw_block(READ, 1, &bh); |
3530 | wait_on_buffer(bh); | 3397 | wait_on_buffer(bh); |
3531 | /* Uhhuh. Read error. Complain and punt.*/ | 3398 | /* Uhhuh. Read error. Complain and punt. */ |
3532 | if (!buffer_uptodate(bh)) | 3399 | if (!buffer_uptodate(bh)) |
3533 | goto next; | 3400 | goto unlock; |
3534 | } | 3401 | } |
3402 | if (ext4_should_journal_data(inode)) { | ||
3403 | BUFFER_TRACE(bh, "get write access"); | ||
3404 | err = ext4_journal_get_write_access(handle, bh); | ||
3405 | if (err) | ||
3406 | goto unlock; | ||
3407 | } | ||
3408 | zero_user(page, offset, length); | ||
3409 | BUFFER_TRACE(bh, "zeroed end of block"); | ||
3535 | 3410 | ||
3536 | if (ext4_should_journal_data(inode)) { | 3411 | if (ext4_should_journal_data(inode)) { |
3537 | BUFFER_TRACE(bh, "get write access"); | 3412 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
3538 | err = ext4_journal_get_write_access(handle, bh); | 3413 | } else { |
3539 | if (err) | 3414 | err = 0; |
3540 | goto next; | 3415 | mark_buffer_dirty(bh); |
3541 | } | 3416 | if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) |
3417 | err = ext4_jbd2_file_inode(handle, inode); | ||
3418 | } | ||
3419 | |||
3420 | unlock: | ||
3421 | unlock_page(page); | ||
3422 | page_cache_release(page); | ||
3423 | return err; | ||
3424 | } | ||
3542 | 3425 | ||
3543 | zero_user(page, pos, range_to_discard); | 3426 | int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, |
3427 | loff_t lstart, loff_t length) | ||
3428 | { | ||
3429 | struct super_block *sb = inode->i_sb; | ||
3430 | struct address_space *mapping = inode->i_mapping; | ||
3431 | unsigned partial_start, partial_end; | ||
3432 | ext4_fsblk_t start, end; | ||
3433 | loff_t byte_end = (lstart + length - 1); | ||
3434 | int err = 0; | ||
3544 | 3435 | ||
3545 | err = 0; | 3436 | partial_start = lstart & (sb->s_blocksize - 1); |
3546 | if (ext4_should_journal_data(inode)) { | 3437 | partial_end = byte_end & (sb->s_blocksize - 1); |
3547 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
3548 | } else | ||
3549 | mark_buffer_dirty(bh); | ||
3550 | 3438 | ||
3551 | BUFFER_TRACE(bh, "Partial buffer zeroed"); | 3439 | start = lstart >> sb->s_blocksize_bits; |
3552 | next: | 3440 | end = byte_end >> sb->s_blocksize_bits; |
3553 | bh = bh->b_this_page; | ||
3554 | iblock++; | ||
3555 | pos += range_to_discard; | ||
3556 | } | ||
3557 | 3441 | ||
3442 | /* Handle partial zero within the single block */ | ||
3443 | if (start == end && | ||
3444 | (partial_start || (partial_end != sb->s_blocksize - 1))) { | ||
3445 | err = ext4_block_zero_page_range(handle, mapping, | ||
3446 | lstart, length); | ||
3447 | return err; | ||
3448 | } | ||
3449 | /* Handle partial zero out on the start of the range */ | ||
3450 | if (partial_start) { | ||
3451 | err = ext4_block_zero_page_range(handle, mapping, | ||
3452 | lstart, sb->s_blocksize); | ||
3453 | if (err) | ||
3454 | return err; | ||
3455 | } | ||
3456 | /* Handle partial zero out on the end of the range */ | ||
3457 | if (partial_end != sb->s_blocksize - 1) | ||
3458 | err = ext4_block_zero_page_range(handle, mapping, | ||
3459 | byte_end - partial_end, | ||
3460 | partial_end + 1); | ||
3558 | return err; | 3461 | return err; |
3559 | } | 3462 | } |
3560 | 3463 | ||
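ext4_zero_partial_blocks() above only zeroes the block-unaligned head and tail of the range; everything block-aligned in between is left for the caller to release. The following userspace sketch reproduces that arithmetic with an assumed 4096-byte block size and sample offsets:

/* Illustration of the partial_start/partial_end arithmetic; the block size
 * and offsets are assumptions for the demo. */
#include <stdio.h>

int main(void)
{
    unsigned long long blocksize = 4096, lstart = 5000, length = 10000;
    unsigned long long byte_end = lstart + length - 1;
    unsigned long long partial_start = lstart & (blocksize - 1);
    unsigned long long partial_end = byte_end & (blocksize - 1);
    unsigned long long start = lstart / blocksize, end = byte_end / blocksize;

    if (start == end) {
        /* whole range inside one block: zero it in one go */
        printf("zero %llu..%llu\n", lstart, byte_end);
    } else {
        if (partial_start)                      /* unaligned head */
            printf("zero head %llu..%llu\n", lstart,
                   lstart + (blocksize - partial_start) - 1);
        if (partial_end != blocksize - 1)       /* unaligned tail */
            printf("zero tail %llu..%llu\n",
                   byte_end - partial_end, byte_end);
    }
    return 0;
}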
@@ -3580,14 +3483,12 @@ int ext4_can_truncate(struct inode *inode) | |||
3580 | * Returns: 0 on success or negative on failure | 3483 | * Returns: 0 on success or negative on failure |
3581 | */ | 3484 | */ |
3582 | 3485 | ||
3583 | int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | 3486 | int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) |
3584 | { | 3487 | { |
3585 | struct inode *inode = file_inode(file); | ||
3586 | struct super_block *sb = inode->i_sb; | 3488 | struct super_block *sb = inode->i_sb; |
3587 | ext4_lblk_t first_block, stop_block; | 3489 | ext4_lblk_t first_block, stop_block; |
3588 | struct address_space *mapping = inode->i_mapping; | 3490 | struct address_space *mapping = inode->i_mapping; |
3589 | loff_t first_page, last_page, page_len; | 3491 | loff_t first_block_offset, last_block_offset; |
3590 | loff_t first_page_offset, last_page_offset; | ||
3591 | handle_t *handle; | 3492 | handle_t *handle; |
3592 | unsigned int credits; | 3493 | unsigned int credits; |
3593 | int ret = 0; | 3494 | int ret = 0; |
@@ -3638,23 +3539,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
3638 | offset; | 3539 | offset; |
3639 | } | 3540 | } |
3640 | 3541 | ||
3641 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 3542 | first_block_offset = round_up(offset, sb->s_blocksize); |
3642 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | 3543 | last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; |
3643 | 3544 | ||
3644 | first_page_offset = first_page << PAGE_CACHE_SHIFT; | 3545 | /* Now release the pages and zero block aligned part of pages*/ |
3645 | last_page_offset = last_page << PAGE_CACHE_SHIFT; | 3546 | if (last_block_offset > first_block_offset) |
3646 | 3547 | truncate_pagecache_range(inode, first_block_offset, | |
3647 | /* Now release the pages */ | 3548 | last_block_offset); |
3648 | if (last_page_offset > first_page_offset) { | ||
3649 | truncate_pagecache_range(inode, first_page_offset, | ||
3650 | last_page_offset - 1); | ||
3651 | } | ||
3652 | 3549 | ||
3653 | /* Wait all existing dio workers, newcomers will block on i_mutex */ | 3550 | /* Wait all existing dio workers, newcomers will block on i_mutex */ |
3654 | ext4_inode_block_unlocked_dio(inode); | 3551 | ext4_inode_block_unlocked_dio(inode); |
3655 | ret = ext4_flush_unwritten_io(inode); | ||
3656 | if (ret) | ||
3657 | goto out_dio; | ||
3658 | inode_dio_wait(inode); | 3552 | inode_dio_wait(inode); |
3659 | 3553 | ||
3660 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3554 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
@@ -3668,66 +3562,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
3668 | goto out_dio; | 3562 | goto out_dio; |
3669 | } | 3563 | } |
3670 | 3564 | ||
3671 | /* | 3565 | ret = ext4_zero_partial_blocks(handle, inode, offset, |
3672 | * Now we need to zero out the non-page-aligned data in the | 3566 | length); |
3673 | * pages at the start and tail of the hole, and unmap the | 3567 | if (ret) |
3674 | * buffer heads for the block aligned regions of the page that | 3568 | goto out_stop; |
3675 | * were completely zeroed. | ||
3676 | */ | ||
3677 | if (first_page > last_page) { | ||
3678 | /* | ||
3679 | * If the file space being truncated is contained | ||
3680 | * within a page just zero out and unmap the middle of | ||
3681 | * that page | ||
3682 | */ | ||
3683 | ret = ext4_discard_partial_page_buffers(handle, | ||
3684 | mapping, offset, length, 0); | ||
3685 | |||
3686 | if (ret) | ||
3687 | goto out_stop; | ||
3688 | } else { | ||
3689 | /* | ||
3690 | * zero out and unmap the partial page that contains | ||
3691 | * the start of the hole | ||
3692 | */ | ||
3693 | page_len = first_page_offset - offset; | ||
3694 | if (page_len > 0) { | ||
3695 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
3696 | offset, page_len, 0); | ||
3697 | if (ret) | ||
3698 | goto out_stop; | ||
3699 | } | ||
3700 | |||
3701 | /* | ||
3702 | * zero out and unmap the partial page that contains | ||
3703 | * the end of the hole | ||
3704 | */ | ||
3705 | page_len = offset + length - last_page_offset; | ||
3706 | if (page_len > 0) { | ||
3707 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
3708 | last_page_offset, page_len, 0); | ||
3709 | if (ret) | ||
3710 | goto out_stop; | ||
3711 | } | ||
3712 | } | ||
3713 | |||
3714 | /* | ||
3715 | * If i_size is contained in the last page, we need to | ||
3716 | * unmap and zero the partial page after i_size | ||
3717 | */ | ||
3718 | if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && | ||
3719 | inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
3720 | page_len = PAGE_CACHE_SIZE - | ||
3721 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
3722 | |||
3723 | if (page_len > 0) { | ||
3724 | ret = ext4_discard_partial_page_buffers(handle, | ||
3725 | mapping, inode->i_size, page_len, 0); | ||
3726 | |||
3727 | if (ret) | ||
3728 | goto out_stop; | ||
3729 | } | ||
3730 | } | ||
3731 | 3569 | ||
3732 | first_block = (offset + sb->s_blocksize - 1) >> | 3570 | first_block = (offset + sb->s_blocksize - 1) >> |
3733 | EXT4_BLOCK_SIZE_BITS(sb); | 3571 | EXT4_BLOCK_SIZE_BITS(sb); |
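Only blocks that lie entirely inside the punched range are actually freed; the partial blocks at either edge are merely zeroed by ext4_zero_partial_blocks() above. A small demo of how first_block/stop_block are derived, assuming a 4096-byte block size and sample offsets:

/* Illustrative arithmetic only; block size and offsets are assumptions. */
#include <stdio.h>

int main(void)
{
    unsigned long long blocksize = 4096, blkbits = 12;
    unsigned long long offset = 6000, length = 10000;

    unsigned long long first_block = (offset + blocksize - 1) >> blkbits;
    unsigned long long stop_block  = (offset + length) >> blkbits;

    if (first_block < stop_block)
        printf("free blocks %llu..%llu\n", first_block, stop_block - 1);
    else
        printf("hole smaller than one block: nothing to free\n");
    return 0;
}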
@@ -3803,7 +3641,6 @@ void ext4_truncate(struct inode *inode) | |||
3803 | unsigned int credits; | 3641 | unsigned int credits; |
3804 | handle_t *handle; | 3642 | handle_t *handle; |
3805 | struct address_space *mapping = inode->i_mapping; | 3643 | struct address_space *mapping = inode->i_mapping; |
3806 | loff_t page_len; | ||
3807 | 3644 | ||
3808 | /* | 3645 | /* |
3809 | * There is a possibility that we're either freeing the inode | 3646 | * There is a possibility that we're either freeing the inode |
@@ -3830,12 +3667,6 @@ void ext4_truncate(struct inode *inode) | |||
3830 | return; | 3667 | return; |
3831 | } | 3668 | } |
3832 | 3669 | ||
3833 | /* | ||
3834 | * finish any pending end_io work so we won't run the risk of | ||
3835 | * converting any truncated blocks to initialized later | ||
3836 | */ | ||
3837 | ext4_flush_unwritten_io(inode); | ||
3838 | |||
3839 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3670 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3840 | credits = ext4_writepage_trans_blocks(inode); | 3671 | credits = ext4_writepage_trans_blocks(inode); |
3841 | else | 3672 | else |
@@ -3847,14 +3678,8 @@ void ext4_truncate(struct inode *inode) | |||
3847 | return; | 3678 | return; |
3848 | } | 3679 | } |
3849 | 3680 | ||
3850 | if (inode->i_size % PAGE_CACHE_SIZE != 0) { | 3681 | if (inode->i_size & (inode->i_sb->s_blocksize - 1)) |
3851 | page_len = PAGE_CACHE_SIZE - | 3682 | ext4_block_truncate_page(handle, mapping, inode->i_size); |
3852 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
3853 | |||
3854 | if (ext4_discard_partial_page_buffers(handle, | ||
3855 | mapping, inode->i_size, page_len, 0)) | ||
3856 | goto out_stop; | ||
3857 | } | ||
3858 | 3683 | ||
3859 | /* | 3684 | /* |
3860 | * We add the inode to the orphan list, so that if this | 3685 | * We add the inode to the orphan list, so that if this |
@@ -4623,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) | |||
4623 | inode->i_size >> PAGE_CACHE_SHIFT); | 4448 | inode->i_size >> PAGE_CACHE_SHIFT); |
4624 | if (!page) | 4449 | if (!page) |
4625 | return; | 4450 | return; |
4626 | ret = __ext4_journalled_invalidatepage(page, offset); | 4451 | ret = __ext4_journalled_invalidatepage(page, offset, |
4452 | PAGE_CACHE_SIZE - offset); | ||
4627 | unlock_page(page); | 4453 | unlock_page(page); |
4628 | page_cache_release(page); | 4454 | page_cache_release(page); |
4629 | if (ret != -EBUSY) | 4455 | if (ret != -EBUSY) |
@@ -4805,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
4805 | struct kstat *stat) | 4631 | struct kstat *stat) |
4806 | { | 4632 | { |
4807 | struct inode *inode; | 4633 | struct inode *inode; |
4808 | unsigned long delalloc_blocks; | 4634 | unsigned long long delalloc_blocks; |
4809 | 4635 | ||
4810 | inode = dentry->d_inode; | 4636 | inode = dentry->d_inode; |
4811 | generic_fillattr(inode, stat); | 4637 | generic_fillattr(inode, stat); |
@@ -4823,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
4823 | delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), | 4649 | delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), |
4824 | EXT4_I(inode)->i_reserved_data_blocks); | 4650 | EXT4_I(inode)->i_reserved_data_blocks); |
4825 | 4651 | ||
4826 | stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; | 4652 | stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9); |
4827 | return 0; | 4653 | return 0; |
4828 | } | 4654 | } |
4829 | 4655 | ||
4830 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 4656 | static int ext4_index_trans_blocks(struct inode *inode, int lblocks, |
4657 | int pextents) | ||
4831 | { | 4658 | { |
4832 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 4659 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
4833 | return ext4_ind_trans_blocks(inode, nrblocks, chunk); | 4660 | return ext4_ind_trans_blocks(inode, lblocks); |
4834 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); | 4661 | return ext4_ext_index_trans_blocks(inode, pextents); |
4835 | } | 4662 | } |
4836 | 4663 | ||
4837 | /* | 4664 | /* |
@@ -4845,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
4845 | * | 4672 | * |
4846 | * Also account for superblock, inode, quota and xattr blocks | 4673 | * Also account for superblock, inode, quota and xattr blocks |
4847 | */ | 4674 | */ |
4848 | static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 4675 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, |
4676 | int pextents) | ||
4849 | { | 4677 | { |
4850 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); | 4678 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); |
4851 | int gdpblocks; | 4679 | int gdpblocks; |
@@ -4853,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
4853 | int ret = 0; | 4681 | int ret = 0; |
4854 | 4682 | ||
4855 | /* | 4683 | /* |
4856 | * How many index blocks need to touch to modify nrblocks? | 4684 | * How many index blocks need to touch to map @lblocks logical blocks |
4857 | * The "Chunk" flag indicating whether the nrblocks is | 4685 | * to @pextents physical extents? |
4858 | * physically contiguous on disk | ||
4859 | * | ||
4860 | * For Direct IO and fallocate, they calls get_block to allocate | ||
4861 | * one single extent at a time, so they could set the "Chunk" flag | ||
4862 | */ | 4686 | */ |
4863 | idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); | 4687 | idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); |
4864 | 4688 | ||
4865 | ret = idxblocks; | 4689 | ret = idxblocks; |
4866 | 4690 | ||
@@ -4868,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
4868 | * Now let's see how many group bitmaps and group descriptors need | 4692 | * Now let's see how many group bitmaps and group descriptors need |
4869 | * to account | 4693 | * to account |
4870 | */ | 4694 | */ |
4871 | groups = idxblocks; | 4695 | groups = idxblocks + pextents; |
4872 | if (chunk) | ||
4873 | groups += 1; | ||
4874 | else | ||
4875 | groups += nrblocks; | ||
4876 | |||
4877 | gdpblocks = groups; | 4696 | gdpblocks = groups; |
4878 | if (groups > ngroups) | 4697 | if (groups > ngroups) |
4879 | groups = ngroups; | 4698 | groups = ngroups; |
@@ -4904,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) | |||
4904 | int bpp = ext4_journal_blocks_per_page(inode); | 4723 | int bpp = ext4_journal_blocks_per_page(inode); |
4905 | int ret; | 4724 | int ret; |
4906 | 4725 | ||
4907 | ret = ext4_meta_trans_blocks(inode, bpp, 0); | 4726 | ret = ext4_meta_trans_blocks(inode, bpp, bpp); |
4908 | 4727 | ||
4909 | /* Account for data blocks for journalled mode */ | 4728 | /* Account for data blocks for journalled mode */ |
4910 | if (ext4_should_journal_data(inode)) | 4729 | if (ext4_should_journal_data(inode)) |
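With the (lblocks, pextents) interface above, the credit estimate charges one group bitmap and one group descriptor per index block or physical extent touched, capped by the number of block groups. A userspace sketch of that arithmetic (the cap on group-descriptor blocks and the constant superblock/inode/quota/xattr overhead of the real function are omitted):

/* Simplified credit estimate; inputs and simplifications are assumptions. */
#include <stdio.h>

static int meta_trans_blocks(int idxblocks, int pextents, int ngroups)
{
    int groups = idxblocks + pextents;
    int gdpblocks = groups;
    int ret = idxblocks;

    if (groups > ngroups)
        groups = ngroups;               /* at most one bitmap per block group */
    ret += groups + gdpblocks;          /* bitmaps + group descriptors */
    return ret;
}

int main(void)
{
    /* e.g. a 4-block page mapped as one extent in a 16-group filesystem */
    printf("~%d metadata credits\n", meta_trans_blocks(1, 1, 16));
    return 0;
}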