diff options
author | Jan Kara <jack@suse.cz> | 2008-07-11 19:27:31 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-07-11 19:27:31 -0400 |
commit | 678aaf481496b01473b778685eca231d6784098b (patch) | |
tree | 298fa039e4910a0ead3cdfb814af167f378391bc /fs | |
parent | c851ed540173736e60d48b53b91a16ea5c903896 (diff) |
ext4: Use new framework for data=ordered mode in JBD2
This patch makes ext4 use the inode-based implementation of data=ordered mode
in JBD2. It allows us to unify some data=ordered and data=writeback paths
(especially writepage, since we no longer have to start a transaction there)
and to remove some buffer walking.
Includes an updated fix from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
for a file system hang caused by corrupt jinode values.
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ext4/ext4_i.h | 1 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.h | 7 | ||||
-rw-r--r-- | fs/ext4/inode.c | 158 | ||||
-rw-r--r-- | fs/ext4/super.c | 5 |
4 files changed, 59 insertions, 112 deletions
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index abf2744164e0..c2903ef72159 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h | |||
@@ -150,6 +150,7 @@ struct ext4_inode_info { | |||
150 | */ | 150 | */ |
151 | struct rw_semaphore i_data_sem; | 151 | struct rw_semaphore i_data_sem; |
152 | struct inode vfs_inode; | 152 | struct inode vfs_inode; |
153 | struct jbd2_inode jinode; | ||
153 | 154 | ||
154 | unsigned long i_ext_generation; | 155 | unsigned long i_ext_generation; |
155 | struct ext4_ext_cache i_cached_extent; | 156 | struct ext4_ext_cache i_cached_extent; |
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index d0aa9ee20f88..eb8bc3afe6e9 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h | |||
@@ -154,8 +154,6 @@ int __ext4_journal_dirty_metadata(const char *where, | |||
154 | #define ext4_journal_forget(handle, bh) \ | 154 | #define ext4_journal_forget(handle, bh) \ |
155 | __ext4_journal_forget(__func__, (handle), (bh)) | 155 | __ext4_journal_forget(__func__, (handle), (bh)) |
156 | 156 | ||
157 | int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); | ||
158 | |||
159 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); | 157 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); |
160 | int __ext4_journal_stop(const char *where, handle_t *handle); | 158 | int __ext4_journal_stop(const char *where, handle_t *handle); |
161 | 159 | ||
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal) | |||
192 | return jbd2_journal_force_commit(journal); | 190 | return jbd2_journal_force_commit(journal); |
193 | } | 191 | } |
194 | 192 | ||
193 | static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) | ||
194 | { | ||
195 | return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); | ||
196 | } | ||
197 | |||
195 | /* super.c */ | 198 | /* super.c */ |
196 | int ext4_force_commit(struct super_block *sb); | 199 | int ext4_force_commit(struct super_block *sb); |
197 | 200 | ||
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 320acb6c35bf..7b9569179fdf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -39,6 +39,13 @@ | |||
39 | #include "xattr.h" | 39 | #include "xattr.h" |
40 | #include "acl.h" | 40 | #include "acl.h" |
41 | 41 | ||
42 | static inline int ext4_begin_ordered_truncate(struct inode *inode, | ||
43 | loff_t new_size) | ||
44 | { | ||
45 | return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, | ||
46 | new_size); | ||
47 | } | ||
48 | |||
42 | /* | 49 | /* |
43 | * Test whether an inode is a fast symlink. | 50 | * Test whether an inode is a fast symlink. |
44 | */ | 51 | */ |
@@ -181,6 +188,8 @@ void ext4_delete_inode (struct inode * inode) | |||
181 | { | 188 | { |
182 | handle_t *handle; | 189 | handle_t *handle; |
183 | 190 | ||
191 | if (ext4_should_order_data(inode)) | ||
192 | ext4_begin_ordered_truncate(inode, 0); | ||
184 | truncate_inode_pages(&inode->i_data, 0); | 193 | truncate_inode_pages(&inode->i_data, 0); |
185 | 194 | ||
186 | if (is_bad_inode(inode)) | 195 | if (is_bad_inode(inode)) |
@@ -1273,15 +1282,6 @@ out: | |||
1273 | return ret; | 1282 | return ret; |
1274 | } | 1283 | } |
1275 | 1284 | ||
1276 | int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) | ||
1277 | { | ||
1278 | int err = jbd2_journal_dirty_data(handle, bh); | ||
1279 | if (err) | ||
1280 | ext4_journal_abort_handle(__func__, __func__, | ||
1281 | bh, handle, err); | ||
1282 | return err; | ||
1283 | } | ||
1284 | |||
1285 | /* For write_end() in data=journal mode */ | 1285 | /* For write_end() in data=journal mode */ |
1286 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) | 1286 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) |
1287 | { | 1287 | { |
@@ -1311,8 +1311,7 @@ static int ext4_ordered_write_end(struct file *file, | |||
1311 | from = pos & (PAGE_CACHE_SIZE - 1); | 1311 | from = pos & (PAGE_CACHE_SIZE - 1); |
1312 | to = from + len; | 1312 | to = from + len; |
1313 | 1313 | ||
1314 | ret = walk_page_buffers(handle, page_buffers(page), | 1314 | ret = ext4_jbd2_file_inode(handle, inode); |
1315 | from, to, NULL, ext4_journal_dirty_data); | ||
1316 | 1315 | ||
1317 | if (ret == 0) { | 1316 | if (ret == 0) { |
1318 | /* | 1317 | /* |
@@ -1472,25 +1471,22 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) | |||
1472 | return 0; | 1471 | return 0; |
1473 | } | 1472 | } |
1474 | 1473 | ||
1475 | static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | ||
1476 | { | ||
1477 | if (buffer_mapped(bh)) | ||
1478 | return ext4_journal_dirty_data(handle, bh); | ||
1479 | return 0; | ||
1480 | } | ||
1481 | |||
1482 | static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | 1474 | static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) |
1483 | { | 1475 | { |
1484 | return !buffer_mapped(bh) || buffer_delay(bh); | 1476 | return !buffer_mapped(bh) || buffer_delay(bh); |
1485 | } | 1477 | } |
1486 | 1478 | ||
1487 | /* | 1479 | /* |
1488 | * Note that we don't need to start a transaction unless we're journaling | 1480 | * Note that we don't need to start a transaction unless we're journaling data |
1489 | * data because we should have holes filled from ext4_page_mkwrite(). If | 1481 | * because we should have holes filled from ext4_page_mkwrite(). We don't even |
1490 | * we are journaling data, we cannot start transaction directly because | 1482 | * need to file the inode to the transaction's list in ordered mode because if |
1491 | * transaction start ranks above page lock so we have to do some magic... | 1483 | * we are writing back data added by write(), the inode is already there and if |
1484 | * we are writing back data modified via mmap(), no one guarantees in which | ||
1485 | * transaction the data will hit the disk. In case we are journaling data, we | ||
1486 | * cannot start transaction directly because transaction start ranks above page | ||
1487 | * lock so we have to do some magic. | ||
1492 | * | 1488 | * |
1493 | * In all journalling modes block_write_full_page() will start the I/O. | 1489 | * In all journaling modes block_write_full_page() will start the I/O. |
1494 | * | 1490 | * |
1495 | * Problem: | 1491 | * Problem: |
1496 | * | 1492 | * |
@@ -1533,86 +1529,7 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | |||
1533 | * us. | 1529 | * us. |
1534 | * | 1530 | * |
1535 | */ | 1531 | */ |
1536 | static int __ext4_ordered_writepage(struct page *page, | 1532 | static int __ext4_normal_writepage(struct page *page, |
1537 | struct writeback_control *wbc) | ||
1538 | { | ||
1539 | struct inode *inode = page->mapping->host; | ||
1540 | struct buffer_head *page_bufs; | ||
1541 | handle_t *handle = NULL; | ||
1542 | int ret = 0; | ||
1543 | int err; | ||
1544 | |||
1545 | if (!page_has_buffers(page)) { | ||
1546 | create_empty_buffers(page, inode->i_sb->s_blocksize, | ||
1547 | (1 << BH_Dirty)|(1 << BH_Uptodate)); | ||
1548 | } | ||
1549 | page_bufs = page_buffers(page); | ||
1550 | walk_page_buffers(handle, page_bufs, 0, | ||
1551 | PAGE_CACHE_SIZE, NULL, bget_one); | ||
1552 | |||
1553 | ret = block_write_full_page(page, ext4_get_block, wbc); | ||
1554 | |||
1555 | /* | ||
1556 | * The page can become unlocked at any point now, and | ||
1557 | * truncate can then come in and change things. So we | ||
1558 | * can't touch *page from now on. But *page_bufs is | ||
1559 | * safe due to elevated refcount. | ||
1560 | */ | ||
1561 | |||
1562 | /* | ||
1563 | * And attach them to the current transaction. But only if | ||
1564 | * block_write_full_page() succeeded. Otherwise they are unmapped, | ||
1565 | * and generally junk. | ||
1566 | */ | ||
1567 | if (ret == 0) { | ||
1568 | handle = ext4_journal_start(inode, | ||
1569 | ext4_writepage_trans_blocks(inode)); | ||
1570 | if (IS_ERR(handle)) { | ||
1571 | ret = PTR_ERR(handle); | ||
1572 | goto out_put; | ||
1573 | } | ||
1574 | |||
1575 | ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, | ||
1576 | NULL, jbd2_journal_dirty_data_fn); | ||
1577 | err = ext4_journal_stop(handle); | ||
1578 | if (!ret) | ||
1579 | ret = err; | ||
1580 | } | ||
1581 | out_put: | ||
1582 | walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, | ||
1583 | bput_one); | ||
1584 | return ret; | ||
1585 | } | ||
1586 | |||
1587 | static int ext4_ordered_writepage(struct page *page, | ||
1588 | struct writeback_control *wbc) | ||
1589 | { | ||
1590 | struct inode *inode = page->mapping->host; | ||
1591 | loff_t size = i_size_read(inode); | ||
1592 | loff_t len; | ||
1593 | |||
1594 | J_ASSERT(PageLocked(page)); | ||
1595 | J_ASSERT(page_has_buffers(page)); | ||
1596 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
1597 | len = size & ~PAGE_CACHE_MASK; | ||
1598 | else | ||
1599 | len = PAGE_CACHE_SIZE; | ||
1600 | BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
1601 | ext4_bh_unmapped_or_delay)); | ||
1602 | |||
1603 | /* | ||
1604 | * We give up here if we're reentered, because it might be for a | ||
1605 | * different filesystem. | ||
1606 | */ | ||
1607 | if (!ext4_journal_current_handle()) | ||
1608 | return __ext4_ordered_writepage(page, wbc); | ||
1609 | |||
1610 | redirty_page_for_writepage(wbc, page); | ||
1611 | unlock_page(page); | ||
1612 | return 0; | ||
1613 | } | ||
1614 | |||
1615 | static int __ext4_writeback_writepage(struct page *page, | ||
1616 | struct writeback_control *wbc) | 1533 | struct writeback_control *wbc) |
1617 | { | 1534 | { |
1618 | struct inode *inode = page->mapping->host; | 1535 | struct inode *inode = page->mapping->host; |
@@ -1624,7 +1541,7 @@ static int __ext4_writeback_writepage(struct page *page, | |||
1624 | } | 1541 | } |
1625 | 1542 | ||
1626 | 1543 | ||
1627 | static int ext4_writeback_writepage(struct page *page, | 1544 | static int ext4_normal_writepage(struct page *page, |
1628 | struct writeback_control *wbc) | 1545 | struct writeback_control *wbc) |
1629 | { | 1546 | { |
1630 | struct inode *inode = page->mapping->host; | 1547 | struct inode *inode = page->mapping->host; |
@@ -1641,7 +1558,7 @@ static int ext4_writeback_writepage(struct page *page, | |||
1641 | ext4_bh_unmapped_or_delay)); | 1558 | ext4_bh_unmapped_or_delay)); |
1642 | 1559 | ||
1643 | if (!ext4_journal_current_handle()) | 1560 | if (!ext4_journal_current_handle()) |
1644 | return __ext4_writeback_writepage(page, wbc); | 1561 | return __ext4_normal_writepage(page, wbc); |
1645 | 1562 | ||
1646 | redirty_page_for_writepage(wbc, page); | 1563 | redirty_page_for_writepage(wbc, page); |
1647 | unlock_page(page); | 1564 | unlock_page(page); |
@@ -1877,7 +1794,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |||
1877 | static const struct address_space_operations ext4_ordered_aops = { | 1794 | static const struct address_space_operations ext4_ordered_aops = { |
1878 | .readpage = ext4_readpage, | 1795 | .readpage = ext4_readpage, |
1879 | .readpages = ext4_readpages, | 1796 | .readpages = ext4_readpages, |
1880 | .writepage = ext4_ordered_writepage, | 1797 | .writepage = ext4_normal_writepage, |
1881 | .sync_page = block_sync_page, | 1798 | .sync_page = block_sync_page, |
1882 | .write_begin = ext4_write_begin, | 1799 | .write_begin = ext4_write_begin, |
1883 | .write_end = ext4_ordered_write_end, | 1800 | .write_end = ext4_ordered_write_end, |
@@ -1891,7 +1808,7 @@ static const struct address_space_operations ext4_ordered_aops = { | |||
1891 | static const struct address_space_operations ext4_writeback_aops = { | 1808 | static const struct address_space_operations ext4_writeback_aops = { |
1892 | .readpage = ext4_readpage, | 1809 | .readpage = ext4_readpage, |
1893 | .readpages = ext4_readpages, | 1810 | .readpages = ext4_readpages, |
1894 | .writepage = ext4_writeback_writepage, | 1811 | .writepage = ext4_normal_writepage, |
1895 | .sync_page = block_sync_page, | 1812 | .sync_page = block_sync_page, |
1896 | .write_begin = ext4_write_begin, | 1813 | .write_begin = ext4_write_begin, |
1897 | .write_end = ext4_writeback_write_end, | 1814 | .write_end = ext4_writeback_write_end, |
@@ -2019,7 +1936,7 @@ int ext4_block_truncate_page(handle_t *handle, | |||
2019 | err = ext4_journal_dirty_metadata(handle, bh); | 1936 | err = ext4_journal_dirty_metadata(handle, bh); |
2020 | } else { | 1937 | } else { |
2021 | if (ext4_should_order_data(inode)) | 1938 | if (ext4_should_order_data(inode)) |
2022 | err = ext4_journal_dirty_data(handle, bh); | 1939 | err = ext4_jbd2_file_inode(handle, inode); |
2023 | mark_buffer_dirty(bh); | 1940 | mark_buffer_dirty(bh); |
2024 | } | 1941 | } |
2025 | 1942 | ||
@@ -3171,7 +3088,14 @@ int ext4_write_inode(struct inode *inode, int wait) | |||
3171 | * be freed, so we have a strong guarantee that no future commit will | 3088 | * be freed, so we have a strong guarantee that no future commit will |
3172 | * leave these blocks visible to the user.) | 3089 | * leave these blocks visible to the user.) |
3173 | * | 3090 | * |
3174 | * Called with inode->sem down. | 3091 | * Another thing we have to assure is that if we are in ordered mode |
3092 | * and inode is still attached to the committing transaction, we must | ||
3093 | * start writeout of all the dirty pages which are being truncated. | ||
3094 | * This way we are sure that all the data written in the previous | ||
3095 | * transaction are already on disk (truncate waits for pages under | ||
3096 | * writeback). | ||
3097 | * | ||
3098 | * Called with inode->i_mutex down. | ||
3175 | */ | 3099 | */ |
3176 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) | 3100 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) |
3177 | { | 3101 | { |
@@ -3237,6 +3161,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
3237 | if (!error) | 3161 | if (!error) |
3238 | error = rc; | 3162 | error = rc; |
3239 | ext4_journal_stop(handle); | 3163 | ext4_journal_stop(handle); |
3164 | |||
3165 | if (ext4_should_order_data(inode)) { | ||
3166 | error = ext4_begin_ordered_truncate(inode, | ||
3167 | attr->ia_size); | ||
3168 | if (error) { | ||
3169 | /* Do as much error cleanup as possible */ | ||
3170 | handle = ext4_journal_start(inode, 3); | ||
3171 | if (IS_ERR(handle)) { | ||
3172 | ext4_orphan_del(NULL, inode); | ||
3173 | goto err_out; | ||
3174 | } | ||
3175 | ext4_orphan_del(handle, inode); | ||
3176 | ext4_journal_stop(handle); | ||
3177 | goto err_out; | ||
3178 | } | ||
3179 | } | ||
3240 | } | 3180 | } |
3241 | 3181 | ||
3242 | rc = inode_setattr(inode, attr); | 3182 | rc = inode_setattr(inode, attr); |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1b330cd71ca8..629d0fa27e3a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -573,6 +573,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
573 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); | 573 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); |
574 | INIT_LIST_HEAD(&ei->i_prealloc_list); | 574 | INIT_LIST_HEAD(&ei->i_prealloc_list); |
575 | spin_lock_init(&ei->i_prealloc_lock); | 575 | spin_lock_init(&ei->i_prealloc_lock); |
576 | jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); | ||
576 | return &ei->vfs_inode; | 577 | return &ei->vfs_inode; |
577 | } | 578 | } |
578 | 579 | ||
@@ -637,6 +638,8 @@ static void ext4_clear_inode(struct inode *inode) | |||
637 | EXT4_I(inode)->i_block_alloc_info = NULL; | 638 | EXT4_I(inode)->i_block_alloc_info = NULL; |
638 | if (unlikely(rsv)) | 639 | if (unlikely(rsv)) |
639 | kfree(rsv); | 640 | kfree(rsv); |
641 | jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, | ||
642 | &EXT4_I(inode)->jinode); | ||
640 | } | 643 | } |
641 | 644 | ||
642 | static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) | 645 | static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) |
@@ -3378,7 +3381,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
3378 | err = ext4_journal_dirty_metadata(handle, bh); | 3381 | err = ext4_journal_dirty_metadata(handle, bh); |
3379 | else { | 3382 | else { |
3380 | /* Always do at least ordered writes for quotas */ | 3383 | /* Always do at least ordered writes for quotas */ |
3381 | err = ext4_journal_dirty_data(handle, bh); | 3384 | err = ext4_jbd2_file_inode(handle, inode); |
3382 | mark_buffer_dirty(bh); | 3385 | mark_buffer_dirty(bh); |
3383 | } | 3386 | } |
3384 | brelse(bh); | 3387 | brelse(bh); |