author     Tao Ma <boyu.mt@taobao.com>              2012-12-10 14:05:57 -0500
committer  Theodore Ts'o <tytso@mit.edu>            2012-12-10 14:05:57 -0500
commit     9c3569b50f12e47cc5e907b5e37e4a45c0c10b43 (patch)
tree       dfd2c1cf5ad98ca059ff2f1f330f34faed2f79eb /fs
parent     3fdcfb668fd78ec92d9bc2daddf1d41e2a8a30bb (diff)
ext4: add delalloc support for inline data
For delayed allocation mode, we write to the inline data if the file is small
enough. If a write goes to an offset beyond the inline size, the first page is
dirtied so that ext4_da_writepages can handle the conversion. Once the first
page has been initialized with blocks, the inline data is removed.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
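The decision the patch adds to the delalloc write path can be summarized as
follows. This is a minimal, standalone sketch in ordinary userspace C, not
kernel code; the helper name, the 60-byte capacity, and the scenarios are
illustrative assumptions, not values taken from the patch:

/*
 * Standalone model of the write-begin decision described above.
 * All names and sizes are illustrative.
 */
#include <stdio.h>

#define INLINE_CAPACITY 60      /* pretend max inline size for this inode */

enum action { WRITE_INLINE, CONVERT_TO_FIRST_PAGE, NORMAL_DELALLOC };

static enum action da_write_begin(int may_inline, long pos, unsigned len)
{
        if (!may_inline)
                return NORMAL_DELALLOC;         /* inode never had inline data */
        if (pos + len <= INLINE_CAPACITY)
                return WRITE_INLINE;            /* data still fits in the inode */
        /*
         * Too big for the inline area: read the inline bytes into page 0,
         * mark the page dirty, and let writeback allocate blocks and drop
         * the inline data later.
         */
        return CONVERT_TO_FIRST_PAGE;
}

int main(void)
{
        static const char *name[] = {
                "write into inline data",
                "dirty page 0, convert on writeback",
                "normal delayed allocation",
        };

        printf("small write : %s\n", name[da_write_begin(1, 0, 40)]);
        printf("large write : %s\n", name[da_write_begin(1, 0, 4096)]);
        printf("no inline   : %s\n", name[da_write_begin(0, 0, 40)]);
        return 0;
}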
Diffstat (limited to 'fs')
 -rw-r--r--  fs/ext4/ext4.h   |   4
 -rw-r--r--  fs/ext4/inline.c | 177
 -rw-r--r--  fs/ext4/inode.c  |  63
 -rw-r--r--  fs/ext4/xattr.h  |  27
 4 files changed, 262 insertions(+), 9 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9f4efc6c37ba..268636af7f5c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2022,6 +2022,8 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock,
                            struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
                    struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                           struct buffer_head *bh, int create);
 int ext4_walk_page_buffers(handle_t *handle,
                            struct buffer_head *head,
                            unsigned from,
@@ -2031,6 +2033,8 @@ int ext4_walk_page_buffers(handle_t *handle,
                                      struct buffer_head *bh));
 int do_journal_get_write_access(handle_t *handle,
                                 struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA      2
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int ext4_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 01274b1e7d40..65f7ffb5437f 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -771,6 +771,183 @@ ext4_journalled_write_inline_data(struct inode *inode,
        return iloc.bh;
 }
 
+/*
+ * Try to make the page cache and handle ready for the inline data case.
+ * We can call this function in two cases:
+ * 1. The inode is created and the first write exceeds the inline size. We
+ *    can clear the inode state safely.
+ * 2. The inode has inline data; then we need to read the data, make it
+ *    uptodate and dirty so that ext4_da_writepages can handle it. We don't
+ *    need to start the journal since the file's metadata isn't changed now.
+ */
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+                                                 struct inode *inode,
+                                                 unsigned flags,
+                                                 void **fsdata)
+{
+        int ret = 0, inline_size;
+        struct page *page;
+
+        page = grab_cache_page_write_begin(mapping, 0, flags);
+        if (!page)
+                return -ENOMEM;
+
+        down_read(&EXT4_I(inode)->xattr_sem);
+        if (!ext4_has_inline_data(inode)) {
+                ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+                goto out;
+        }
+
+        inline_size = ext4_get_inline_size(inode);
+
+        if (!PageUptodate(page)) {
+                ret = ext4_read_inline_page(inode, page);
+                if (ret < 0)
+                        goto out;
+        }
+
+        ret = __block_write_begin(page, 0, inline_size,
+                                  ext4_da_get_block_prep);
+        if (ret) {
+                ext4_truncate_failed_write(inode);
+                goto out;
+        }
+
+        SetPageDirty(page);
+        SetPageUptodate(page);
+        ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+        *fsdata = (void *)CONVERT_INLINE_DATA;
+
+out:
+        up_read(&EXT4_I(inode)->xattr_sem);
+        if (page) {
+                unlock_page(page);
+                page_cache_release(page);
+        }
+        return ret;
+}
+
+/*
+ * Prepare the write for the inline data.
+ * If the data can be written into the inode, we just read the page,
+ * make it uptodate, and start the journal.
+ * Otherwise read the page and mark it dirty so that it can be handled
+ * in writepages (the i_disksize update is left to the normal
+ * ext4_da_write_end).
+ */
+int ext4_da_write_inline_data_begin(struct address_space *mapping,
+                                    struct inode *inode,
+                                    loff_t pos, unsigned len,
+                                    unsigned flags,
+                                    struct page **pagep,
+                                    void **fsdata)
+{
+        int ret, inline_size;
+        handle_t *handle;
+        struct page *page;
+        struct ext4_iloc iloc;
+
+        ret = ext4_get_inode_loc(inode, &iloc);
+        if (ret)
+                return ret;
+
+        handle = ext4_journal_start(inode, 1);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                handle = NULL;
+                goto out;
+        }
+
+        inline_size = ext4_get_max_inline_size(inode);
+
+        ret = -ENOSPC;
+        if (inline_size >= pos + len) {
+                ret = ext4_prepare_inline_data(handle, inode, pos + len);
+                if (ret && ret != -ENOSPC)
+                        goto out;
+        }
+
+        if (ret == -ENOSPC) {
+                ret = ext4_da_convert_inline_data_to_extent(mapping,
+                                                            inode,
+                                                            flags,
+                                                            fsdata);
+                goto out;
+        }
+
+        /*
+         * We cannot recurse into the filesystem as the transaction
+         * is already started.
+         */
+        flags |= AOP_FLAG_NOFS;
+
+        page = grab_cache_page_write_begin(mapping, 0, flags);
+        if (!page) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        down_read(&EXT4_I(inode)->xattr_sem);
+        if (!ext4_has_inline_data(inode)) {
+                ret = 0;
+                goto out_release_page;
+        }
+
+        if (!PageUptodate(page)) {
+                ret = ext4_read_inline_page(inode, page);
+                if (ret < 0)
+                        goto out_release_page;
+        }
+
+        up_read(&EXT4_I(inode)->xattr_sem);
+        *pagep = page;
+        handle = NULL;
+        brelse(iloc.bh);
+        return 1;
+out_release_page:
+        up_read(&EXT4_I(inode)->xattr_sem);
+        unlock_page(page);
+        page_cache_release(page);
+out:
+        if (handle)
+                ext4_journal_stop(handle);
+        brelse(iloc.bh);
+        return ret;
+}
+
+int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+                                  unsigned len, unsigned copied,
+                                  struct page *page)
+{
+        int i_size_changed = 0;
+
+        copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+        /*
+         * No need to use i_size_read() here, the i_size
+         * cannot change under us because we hold i_mutex.
+         *
+         * But it's important to update i_size while still holding page lock:
+         * page writeout could otherwise come in and zero beyond i_size.
+         */
+        if (pos+copied > inode->i_size) {
+                i_size_write(inode, pos+copied);
+                i_size_changed = 1;
+        }
+        unlock_page(page);
+        page_cache_release(page);
+
+        /*
+         * Don't mark the inode dirty under page lock. First, it unnecessarily
+         * makes the holding time of page lock longer. Second, it forces lock
+         * ordering of page lock and transaction start for journaling
+         * filesystems.
+         */
+        if (i_size_changed)
+                mark_inode_dirty(inode);
+
+        return copied;
+}
 
 int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
 {
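The comment on ext4_da_write_inline_data_begin() above describes a three-way
outcome: 1 when the write has been set up against the inline data, 0 when the
caller should fall back to the normal block-based path (including the
converted case, signalled through *fsdata == CONVERT_INLINE_DATA), and a
negative errno on failure. The ext4_da_write_begin() hunk in the next file is
the real consumer; here is a minimal, standalone model of that return
convention, with the stub and scenarios as illustrative assumptions only:

/* Standalone model of the return convention; the stub is not the kernel
 * function. */
#include <stdio.h>

/* Pretend outcomes: 1 = inline write prepared, 0 = fall back to the
 * block path, negative = hard error. */
static int inline_write_begin_stub(int scenario)
{
        if (scenario == 0)
                return 1;       /* small write, fits in the inode */
        if (scenario == 1)
                return 0;       /* converted to (or never had) inline data */
        return -12;             /* e.g. -ENOMEM while grabbing page 0 */
}

int main(void)
{
        for (int s = 0; s < 3; s++) {
                int ret = inline_write_begin_stub(s);

                if (ret < 0)
                        printf("scenario %d: fail the write (%d)\n", s, ret);
                else if (ret == 1)
                        printf("scenario %d: write goes to inline data\n", s);
                else
                        printf("scenario %d: continue on the block path\n", s);
        }
        return 0;
}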
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5c91622cfe01..f16ae02599cd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1790,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
         * file system block.
          */
         down_read((&EXT4_I(inode)->i_data_sem));
-        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+        if (ext4_has_inline_data(inode)) {
+                /*
+                 * We will soon create blocks for this page, and let
+                 * us pretend as if the blocks aren't allocated yet.
+                 * In case of clusters, we have to handle the work
+                 * of mapping from cluster so that the reserved space
+                 * is calculated properly.
+                 */
+                if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+                    ext4_find_delalloc_cluster(inode, map->m_lblk))
+                        map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+                retval = 0;
+        } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
         else
                 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
@@ -1841,8 +1853,8 @@ out_unlock:
  * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
  * initialized properly.
  */
-static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
-                                  struct buffer_head *bh, int create)
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                           struct buffer_head *bh, int create)
 {
         struct ext4_map_blocks map;
         int ret = 0;
@@ -2119,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
  * mpage_da_map_and_submit to map a single contiguous memory region
  * and then write them.
  */
-static int write_cache_pages_da(struct address_space *mapping,
+static int write_cache_pages_da(handle_t *handle,
+                                struct address_space *mapping,
                                 struct writeback_control *wbc,
                                 struct mpage_da_data *mpd,
                                 pgoff_t *done_index)
@@ -2198,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping,
                 wait_on_page_writeback(page);
                 BUG_ON(PageWriteback(page));
 
+                /*
+                 * If we have inline data and arrive here, it means that
+                 * we will soon create the block for the 1st page, so
+                 * we'd better clear the inline data here.
+                 */
+                if (ext4_has_inline_data(inode)) {
+                        BUG_ON(ext4_test_inode_state(inode,
+                                EXT4_STATE_MAY_INLINE_DATA));
+                        ext4_destroy_inline_data(handle, inode);
+                }
+
                 if (mpd->next_page != page->index)
                         mpd->first_page = page->index;
                 mpd->next_page = page->index + 1;
@@ -2404,7 +2428,8 @@ retry:
                  * contiguous region of logical blocks that need
                  * blocks to be allocated by ext4 and submit them.
                  */
-                ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
+                ret = write_cache_pages_da(handle, mapping,
+                                           wbc, &mpd, &done_index);
                 /*
                  * If we have a contiguous extent of pages and we
                  * haven't done the I/O yet, map the blocks and submit
@@ -2468,7 +2493,6 @@ out_writepages:
         return ret;
 }
 
-#define FALL_BACK_TO_NONDELALLOC 1
 static int ext4_nonda_switch(struct super_block *sb)
 {
         s64 free_blocks, dirty_blocks;
@@ -2525,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
         }
         *fsdata = (void *)0;
         trace_ext4_da_write_begin(inode, pos, len, flags);
+
+        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+                ret = ext4_da_write_inline_data_begin(mapping, inode,
+                                                      pos, len, flags,
+                                                      pagep, fsdata);
+                if (ret < 0)
+                        goto out;
+                if (ret == 1) {
+                        ret = 0;
+                        goto out;
+                }
+        }
+
 retry:
         /*
          * With delayed allocation, we don't log the i_disksize update
@@ -2626,10 +2663,10 @@ static int ext4_da_write_end(struct file *file,
          * changes. So let's piggyback the i_disksize mark_inode_dirty
          * into that.
          */
-
         new_i_size = pos + copied;
         if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
-                if (ext4_da_should_update_i_disksize(page, end)) {
+                if (ext4_has_inline_data(inode) ||
+                    ext4_da_should_update_i_disksize(page, end)) {
                         down_write(&EXT4_I(inode)->i_data_sem);
                         if (new_i_size > EXT4_I(inode)->i_disksize)
                                 EXT4_I(inode)->i_disksize = new_i_size;
@@ -2641,8 +2678,16 @@ static int ext4_da_write_end(struct file *file,
                         ext4_mark_inode_dirty(handle, inode);
                 }
         }
-        ret2 = generic_write_end(file, mapping, pos, len, copied,
+
+        if (write_mode != CONVERT_INLINE_DATA &&
+            ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+            ext4_has_inline_data(inode))
+                ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+                                                     page);
+        else
+                ret2 = generic_write_end(file, mapping, pos, len, copied,
                                          page, fsdata);
+
         copied = ret2;
         if (ret2 < 0)
                 ret = ret2;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 7095ac13fbc2..37e66f867645 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -154,6 +154,15 @@ extern struct buffer_head *
 ext4_journalled_write_inline_data(struct inode *inode,
                                   unsigned len,
                                   struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+                                           struct inode *inode,
+                                           loff_t pos, unsigned len,
+                                           unsigned flags,
+                                           struct page **pagep,
+                                           void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+                                         unsigned len, unsigned copied,
+                                         struct page *page);
 # else  /* CONFIG_EXT4_FS_XATTR */
 
 static inline int
@@ -300,6 +309,24 @@ ext4_journalled_write_inline_data(struct inode *inode,
 {
         return NULL;
 }
+
+static inline int
+ext4_da_write_inline_data_begin(struct address_space *mapping,
+                                struct inode *inode,
+                                loff_t pos, unsigned len,
+                                unsigned flags,
+                                struct page **pagep,
+                                void **fsdata)
+{
+        return 0;
+}
+
+static inline int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+                                                unsigned len, unsigned copied,
+                                                struct page *page)
+{
+        return 0;
+}
 # endif  /* CONFIG_EXT4_FS_XATTR */
 
 #ifdef CONFIG_EXT4_FS_SECURITY