Diffstat (limited to 'fs/ext4/inode.c')

 fs/ext4/inode.c | 680 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 579 insertions(+), 101 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..5c5bc5dafff8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/workqueue.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -192,11 +193,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  * so before we call here everything must be consistently dirtied against
  * this transaction.
  */
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+				int nblocks)
 {
+	int ret;
+
+	/*
+	 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+	 * moment, get_block can be called only for blocks inside i_size since
+	 * page cache has been already dropped and writes are blocked by
+	 * i_mutex. So we can safely drop the i_data_sem here.
+	 */
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
-	return ext4_journal_restart(handle, blocks_for_truncate(inode));
+	up_write(&EXT4_I(inode)->i_data_sem);
+	ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+	down_write(&EXT4_I(inode)->i_data_sem);
+
+	return ret;
 }
 
 /*
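The renamed helper exists because truncate can hold i_data_sem across a handle that runs out of credits. A minimal sketch of the calling pattern it supports (hypothetical caller for illustration only; the real call sites are the ext4_clear_blocks()/ext4_free_branches() hunks later in this patch):

	/*
	 * Hypothetical truncate step, assuming the helper above.
	 * ext4_journal_restart() may wait for the running transaction to
	 * commit, while another task holding a handle on that transaction
	 * waits for i_data_sem inside ext4_get_blocks() -- keeping the
	 * semaphore here would be an AB-BA deadlock, hence drop/retake.
	 */
	static void truncate_step(handle_t *handle, struct inode *inode)
	{
		down_write(&EXT4_I(inode)->i_data_sem);
		/* ... unmap and free some blocks ... */
		if (!ext4_handle_has_enough_credits(handle, 3))
			ext4_truncate_restart_trans(handle, inode,
					blocks_for_truncate(inode));
		/* i_data_sem is held again on return from the helper */
		up_write(&EXT4_I(inode)->i_data_sem);
	}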
@@ -341,9 +355,7 @@ static int ext4_block_to_path(struct inode *inode,
 	int n = 0;
 	int final = 0;
 
-	if (i_block < 0) {
-		ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
-	} else if (i_block < direct_blocks) {
+	if (i_block < direct_blocks) {
 		offsets[n++] = i_block;
 		final = direct_blocks;
 	} else if ((i_block -= direct_blocks) < indirect_blocks) {
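The dropped `i_block < 0` branch was dead code: ext4_lblk_t is unsigned, so the comparison could never be true. For reference, the offset arithmetic this function performs, as a self-contained sketch (assumes a 4 KiB block size, i.e. 12 direct slots and 1024 pointers per indirect block; those values are assumptions of the example, not part of this hunk):

	/* Mirror of ext4_block_to_path()'s math for a 4 KiB block size. */
	int block_to_path(unsigned int i_block, unsigned int offsets[4])
	{
		const unsigned int direct = 12, ptrs = 1024, ptrs_bits = 10;
		int n = 0;

		if (i_block < direct) {				/* direct */
			offsets[n++] = i_block;
		} else if ((i_block -= direct) < ptrs) {	/* indirect */
			offsets[n++] = 12;			/* EXT4_IND_BLOCK */
			offsets[n++] = i_block;
		} else if ((i_block -= ptrs) < ptrs * ptrs) {	/* double ind. */
			offsets[n++] = 13;			/* EXT4_DIND_BLOCK */
			offsets[n++] = i_block >> ptrs_bits;
			offsets[n++] = i_block & (ptrs - 1);
		} else {					/* triple ind. */
			i_block -= ptrs * ptrs;
			offsets[n++] = 14;			/* EXT4_TIND_BLOCK */
			offsets[n++] = i_block >> (ptrs_bits * 2);
			offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
			offsets[n++] = i_block & (ptrs - 1);
		}
		return n;
	}

	/* e.g. logical block 100000: 100000 - 12 - 1024 = 98964, giving the
	 * double-indirect path {13, 98964 >> 10 = 96, 98964 & 1023 = 660}. */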
@@ -551,15 +563,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
  *
  * Normally this function find the preferred place for block allocation,
  * returns it.
+ * Because this is only used for non-extent files, we limit the block nr
+ * to 32 bits.
  */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 				   Indirect *partial)
 {
+	ext4_fsblk_t goal;
+
 	/*
 	 * XXX need to get goal block from mballoc's data structures
 	 */
 
-	return ext4_find_near(inode, partial);
+	goal = ext4_find_near(inode, partial);
+	goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+	return goal;
 }
 
 /**
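The mask works because indirect-map files keep every block pointer in a 32-bit on-disk slot, so a goal above 2^32 - 1 is unreachable for them. For reference (the constant lives on the ext4.h side of this series, outside this file's diff; the value shown is an assumption of this note):

	#define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
	/* ext4_find_goal() now never suggests a physical block that
	 * cannot be stored in a non-extent file's __le32 pointers. */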
@@ -640,6 +658,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 	if (*err)
 		goto failed_out;
 
+	BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+
 	target -= count;
 	/* allocate blocks for indirect blocks */
 	while (index < indirect_blks && count) {
@@ -674,6 +694,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		ar.flags = EXT4_MB_HINT_DATA;
 
 	current_block = ext4_mb_new_blocks(handle, &ar, err);
+	BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
 
 	if (*err && (target == blks)) {
 		/*
@@ -762,8 +783,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		BUFFER_TRACE(bh, "call get_create_access");
 		err = ext4_journal_get_create_access(handle, bh);
 		if (err) {
+			/* Don't brelse(bh) here; it's done in
+			 * ext4_journal_forget() below */
 			unlock_buffer(bh);
-			brelse(bh);
 			goto failed;
 		}
 
@@ -1109,22 +1131,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 	ext4_discard_preallocations(inode);
 }
 
-static int check_block_validity(struct inode *inode, sector_t logical,
-				sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *msg,
+				sector_t logical, sector_t phys, int len)
 {
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
-		ext4_error(inode->i_sb, "check_block_validity",
+		ext4_error(inode->i_sb, msg,
 			   "inode #%lu logical block %llu mapped to %llu "
 			   "(size %d)", inode->i_ino,
 			   (unsigned long long) logical,
 			   (unsigned long long) phys, len);
-		WARN_ON(1);
 		return -EIO;
 	}
 	return 0;
 }
 
 /*
+ * Return the number of contiguous dirty pages in a given inode
+ * starting at page frame idx.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+				    unsigned int max_pages)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t	index;
+	struct pagevec pvec;
+	pgoff_t num = 0;
+	int i, nr_pages, done = 0;
+
+	if (max_pages == 0)
+		return 0;
+	pagevec_init(&pvec, 0);
+	while (!done) {
+		index = idx;
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					      PAGECACHE_TAG_DIRTY,
+					      (pgoff_t)PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			struct buffer_head *bh, *head;
+
+			lock_page(page);
+			if (unlikely(page->mapping != mapping) ||
+			    !PageDirty(page) ||
+			    PageWriteback(page) ||
+			    page->index != idx) {
+				done = 1;
+				unlock_page(page);
+				break;
+			}
+			if (page_has_buffers(page)) {
+				bh = head = page_buffers(page);
+				do {
+					if (!buffer_delay(bh) &&
+					    !buffer_unwritten(bh))
+						done = 1;
+					bh = bh->b_this_page;
+				} while (!done && (bh != head));
+			}
+			unlock_page(page);
+			if (done)
+				break;
+			idx++;
+			num++;
+			if (num >= max_pages)
+				break;
+		}
+		pagevec_release(&pvec);
+	}
+	return num;
+}
+
+/*
  * The ext4_get_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
@@ -1155,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	clear_buffer_mapped(bh);
 	clear_buffer_unwritten(bh);
 
+	ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, flags, max_blocks,
+		  (unsigned long)block);
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -1170,8 +1252,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, block,
-					       bh->b_blocknr, retval);
+		int ret = check_block_validity(inode, "file system corruption",
+					       block, bh->b_blocknr, retval);
 		if (ret != 0)
 			return ret;
 	}
@@ -1235,8 +1317,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 			 * i_data's format changing. Force the migrate
 			 * to fail by clearing migrate flags
 			 */
-			EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
-							~EXT4_EXT_MIGRATE;
+			EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
 		}
 	}
 
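The point of the last hunk is where the migrate bit lives: EXT4_I(inode)->i_flags is copied into the on-disk inode, while i_state is purely in-core. Moving the bit means it can no longer leak to disk, which is what lets a later hunk of this same patch drop the write-out masking:

	/* from the ext4_do_update_inode() hunk further down */
	-	/* clear the migrate flag in the raw_inode */
	-	raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
	+	raw_inode->i_flags = cpu_to_le32(ei->i_flags);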
@@ -1252,8 +1333,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, block,
-					       bh->b_blocknr, retval);
+		int ret = check_block_validity(inode, "file system "
+					       "corruption after allocation",
+					       block, bh->b_blocknr, retval);
 		if (ret != 0)
 			return ret;
 	}
@@ -1776,11 +1858,11 @@ repeat:
 
 	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		vfs_dq_release_reservation_block(inode, total);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
 		}
-		vfs_dq_release_reservation_block(inode, total);
 		return -ENOSPC;
 	}
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1863,18 +1945,6 @@ static void ext4_da_page_release_reservation(struct page *page,
  * Delayed allocation stuff
  */
 
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
 /*
  * mpage_da_submit_io - walks through extent of pages and try to write
  *                      them with writepage() call back
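struct mpage_da_data is removed here but not retired: the trace_ext4_da_write_pages() call added further down takes an &mpd argument, so the definition presumably moves to fs/ext4/ext4.h in the same series (an assumption based on the tracepoint usage; the header is outside this diff). For reference, the fields it carries:

	struct mpage_da_data {		/* presumably now in fs/ext4/ext4.h */
		struct inode *inode;
		sector_t b_blocknr;		/* start block number of extent */
		size_t b_size;			/* size of extent */
		unsigned long b_state;		/* state of the extent */
		unsigned long first_page, next_page;	/* extent of pages */
		struct writeback_control *wbc;
		int io_done;
		int pages_written;
		int retval;
	};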
@@ -2084,18 +2154,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 static void ext4_print_free_blocks(struct inode *inode)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	printk(KERN_EMERG "Total free blocks count %lld\n",
+	printk(KERN_CRIT "Total free blocks count %lld\n",
 	       ext4_count_free_blocks(inode->i_sb));
-	printk(KERN_EMERG "Free/Dirty block details\n");
-	printk(KERN_EMERG "free_blocks=%lld\n",
-	       (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
-	printk(KERN_EMERG "dirty_blocks=%lld\n",
-	       (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
-	printk(KERN_EMERG "Block reservation details\n");
-	printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
+	printk(KERN_CRIT "Free/Dirty block details\n");
+	printk(KERN_CRIT "free_blocks=%lld\n",
+	       (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
+	printk(KERN_CRIT "dirty_blocks=%lld\n",
+	       (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+	printk(KERN_CRIT "Block reservation details\n");
+	printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
 	       EXT4_I(inode)->i_reserved_data_blocks);
-	printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
+	printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
 	       EXT4_I(inode)->i_reserved_meta_blocks);
 	return;
 }
 
@@ -2181,14 +2251,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		 * writepage and writepages will again try to write
 		 * the same.
 		 */
-		printk(KERN_EMERG "%s block allocation failed for inode %lu "
-				  "at logical offset %llu with max blocks "
-				  "%zd with error %d\n",
-				  __func__, mpd->inode->i_ino,
-				  (unsigned long long)next,
+		ext4_msg(mpd->inode->i_sb, KERN_CRIT,
+			 "delayed block allocation failed for inode %lu at "
+			 "logical offset %llu with max blocks %zd with "
+			 "error %d\n", mpd->inode->i_ino,
+			 (unsigned long long) next,
 			 mpd->b_size >> mpd->inode->i_blkbits, err);
-		printk(KERN_EMERG "This should not happen.!! "
+		printk(KERN_CRIT "This should not happen!! "
 		       "Data will be lost\n");
 		if (err == -ENOSPC) {
 			ext4_print_free_blocks(mpd->inode);
 		}
@@ -2329,7 +2399,7 @@ static int __mpage_da_writepage(struct page *page,
 			/*
 			 * Rest of the page in the page_vec
 			 * redirty then and skip then. We will
-			 * try to to write them again after
+			 * try to write them again after
 			 * starting a new transaction
 			 */
 			redirty_page_for_writepage(wbc, page);
@@ -2735,8 +2805,11 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int no_nrwrite_index_update;
 	int pages_written = 0;
 	long pages_skipped;
+	unsigned int max_pages;
 	int range_cyclic, cycled = 1, io_done = 0;
-	int needed_blocks, ret = 0, nr_to_writebump = 0;
+	int needed_blocks, ret = 0;
+	long desired_nr_to_write, nr_to_writebump = 0;
+	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
 	trace_ext4_da_writepages(inode, wbc);
@@ -2762,16 +2835,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return -EROFS;
 
-	/*
-	 * Make sure nr_to_write is >= sbi->s_mb_stream_request
-	 * This make sure small files blocks are allocated in
-	 * single attempt. This ensure that small files
-	 * get less fragmented.
-	 */
-	if (wbc->nr_to_write < sbi->s_mb_stream_request) {
-		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
-		wbc->nr_to_write = sbi->s_mb_stream_request;
-	}
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 
@@ -2786,6 +2849,36 @@ static int ext4_da_writepages(struct address_space *mapping,
 	} else
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
+	/*
+	 * This works around two forms of stupidity.  The first is in
+	 * the writeback code, which caps the maximum number of pages
+	 * written to be 1024 pages.  This is wrong on multiple
+	 * levels; different architectues have a different page size,
+	 * which changes the maximum amount of data which gets
+	 * written.  Secondly, 4 megabytes is way too small.  XFS
+	 * forces this value to be 16 megabytes by multiplying
+	 * nr_to_write parameter by four, and then relies on its
+	 * allocator to allocate larger extents to make them
+	 * contiguous.  Unfortunately this brings us to the second
+	 * stupidity, which is that ext4's mballoc code only allocates
+	 * at most 2048 blocks.  So we force contiguous writes up to
+	 * the number of dirty blocks in the inode, or
+	 * sbi->max_writeback_mb_bump whichever is smaller.
+	 */
+	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+	if (!range_cyclic && range_whole)
+		desired_nr_to_write = wbc->nr_to_write * 8;
+	else
+		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+							   max_pages);
+	if (desired_nr_to_write > max_pages)
+		desired_nr_to_write = max_pages;
+
+	if (wbc->nr_to_write < desired_nr_to_write) {
+		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+		wbc->nr_to_write = desired_nr_to_write;
+	}
+
 	mpd.wbc = wbc;
 	mpd.inode = mapping->host;
 
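Worked numbers for the new cap, assuming 4 KiB pages (PAGE_CACHE_SHIFT = 12) and s_max_writeback_mb_bump left at 128, which this note assumes is the shipped default of the new tunable:

	max_pages = 128 << (20 - 12) = 32768 pages	/* = 128 MiB */

	/* whole-file sync:  desired_nr_to_write = min(8 * nr_to_write, 32768)
	 * cyclic writeback: desired_nr_to_write = min(contiguous dirty pages
	 *                       found by ext4_num_dirty_pages(), 32768)
	 * nr_to_writebump records the inflation so that out_writepages can
	 * subtract it again before nr_to_write is handed back to the VM. */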
@@ -2813,10 +2906,9 @@ retry:
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
-			printk(KERN_CRIT "%s: jbd2_start: "
+			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
 			       "%ld pages, ino %lu; err %d\n", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
-			dump_stack();
 			goto out_writepages;
 		}
 
@@ -2850,6 +2942,7 @@ retry:
 			mpd.io_done = 1;
 			ret = MPAGE_DA_EXTENT_TAIL;
 		}
+		trace_ext4_da_write_pages(inode, &mpd);
 		wbc->nr_to_write -= mpd.pages_written;
 
 		ext4_journal_stop(handle);
@@ -2887,9 +2980,10 @@ retry:
 		goto retry;
 	}
 	if (pages_skipped != wbc->pages_skipped)
-		printk(KERN_EMERG "This should not happen leaving %s "
-				"with nr_to_write = %ld ret = %d\n",
-				__func__, wbc->nr_to_write, ret);
+		ext4_msg(inode->i_sb, KERN_CRIT,
+			 "This should not happen leaving %s "
+			 "with nr_to_write = %ld ret = %d\n",
+			 __func__, wbc->nr_to_write, ret);
 
 	/* Update index */
 	index += pages_written;
@@ -2904,7 +2998,9 @@ retry:
 out_writepages:
 	if (!no_nrwrite_index_update)
 		wbc->no_nrwrite_index_update = 0;
-	wbc->nr_to_write -= nr_to_writebump;
+	if (wbc->nr_to_write > nr_to_writebump)
+		wbc->nr_to_write -= nr_to_writebump;
+	wbc->range_start = range_start;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
 	return ret;
 }
@@ -3117,6 +3213,8 @@ out:
  */
 int ext4_alloc_da_blocks(struct inode *inode)
 {
+	trace_ext4_alloc_da_blocks(inode);
+
 	if (!EXT4_I(inode)->i_reserved_data_blocks &&
 	    !EXT4_I(inode)->i_reserved_meta_blocks)
 		return 0;
@@ -3259,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 }
 
 /*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
  * If the O_DIRECT write will extend the file then add this inode to the
  * orphan list. So recovery will truncate it back to the original size
  * if the machine crashes during the write.
@@ -3267,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 * crashes then stale disk data _may_ be exposed inside the file. But current
 * VFS code falls back into buffered path in that case so we are safe.
 */
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 			      const struct iovec *iov, loff_t offset,
 			      unsigned long nr_segs)
 {
@@ -3278,6 +3378,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	ssize_t ret;
 	int orphan = 0;
 	size_t count = iov_length(iov, nr_segs);
+	int retries = 0;
 
 	if (rw == WRITE) {
 		loff_t final_size = offset + count;
@@ -3300,9 +3401,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 		}
 	}
 
+retry:
 	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
 
 	if (orphan) {
 		int err;
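ENOSPC from a direct write can be transient on ext4: freed and delayed-allocation blocks only become reusable once the journal commits. The existing ext4_should_retry_alloc() helper forces a commit and bounds the retries (a small fixed count, assumed here to be about 3), so the new loop cannot spin forever. Its shape, restated as a sketch:

	int retries = 0;
	ssize_t ret;

	do {	/* equivalent form of the retry: label added above */
		ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
					 iov, offset, nr_segs,
					 ext4_get_block, NULL);
	} while (ret == -ENOSPC &&
		 ext4_should_retry_alloc(inode->i_sb, &retries));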
@@ -3341,6 +3445,359 @@ out:
 	return ret;
 }
 
+/* Maximum number of blocks we map for direct IO at once. */
+
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create)
+{
+	handle_t *handle = NULL;
+	int ret = 0;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int dio_credits;
+
+	ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+		   inode->i_ino, create);
+	/*
+	 * DIO VFS code passes create = 0 flag for write to
+	 * the middle of file. It does this to avoid block
+	 * allocation for holes, to prevent expose stale data
+	 * out when there is parallel buffered read (which does
+	 * not hold the i_mutex lock) while direct IO write has
+	 * not completed. DIO request on holes finally falls back
+	 * to buffered IO for this reason.
+	 *
+	 * For ext4 extent based file, since we support fallocate,
+	 * new allocated extent as uninitialized, for holes, we
+	 * could fallocate blocks for holes, thus parallel
+	 * buffered IO read will zero out the page when read on
+	 * a hole while parallel DIO write to the hole has not completed.
+	 *
+	 * when we come here, we know it's a direct IO write to
+	 * to the middle of file (<i_size)
+	 * so it's safe to override the create flag from VFS.
+	 */
+	create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+	if (max_blocks > DIO_MAX_BLOCKS)
+		max_blocks = DIO_MAX_BLOCKS;
+	dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	handle = ext4_journal_start(inode, dio_credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+			      create);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
+	}
+	ext4_journal_stop(handle);
+out:
+	return ret;
+}
+
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+	BUG_ON(!io);
+	iput(io->inode);
+	kfree(io);
+}
+static void dump_aio_dio_list(struct inode *inode)
+{
+#ifdef	EXT4_DEBUG
+	struct list_head *cur, *before, *after;
+	ext4_io_end_t *io, *io0, *io1;
+
+	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) {
+		ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+		return;
+	}
+
+	ext4_debug("Dump inode %lu aio_dio_completed_IO list\n", inode->i_ino);
+	list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list) {
+		cur = &io->list;
+		before = cur->prev;
+		io0 = container_of(before, ext4_io_end_t, list);
+		after = cur->next;
+		io1 = container_of(after, ext4_io_end_t, list);
+
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+			   io, inode->i_ino, io0, io1);
+	}
+#endif
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ */
+static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+{
+	struct inode *inode = io->inode;
+	loff_t offset = io->offset;
+	size_t size = io->size;
+	int ret = 0;
+
+	ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+		   "list->prev 0x%p\n",
+		   io, inode->i_ino, io->list.next, io->list.prev);
+
+	if (list_empty(&io->list))
+		return ret;
+
+	if (io->flag != DIO_AIO_UNWRITTEN)
+		return ret;
+
+	if (offset + size <= i_size_read(inode))
+		ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+	if (ret < 0) {
+		printk(KERN_EMERG "%s: failed to convert unwritten"
+			"extents to written extents, error is %d"
+			" io is still on inode %lu aio dio list\n",
+			__func__, ret, inode->i_ino);
+		return ret;
+	}
+
+	/* clear the DIO AIO unwritten flag */
+	io->flag = 0;
+	return ret;
+}
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_aio_dio_work(struct work_struct *work)
+{
+	ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+	struct inode *inode = io->inode;
+	int ret = 0;
+
+	mutex_lock(&inode->i_mutex);
+	ret = ext4_end_aio_dio_nolock(io);
+	if (ret >= 0) {
+		if (!list_empty(&io->list))
+			list_del_init(&io->list);
+		ext4_free_io_end(io);
+	}
+	mutex_unlock(&inode->i_mutex);
+}
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When AIO DIO IO is completed, the work to convert unwritten
+ * extents to written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of completed AIO from DIO path
+ * that might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents to written.
+ */
+int flush_aio_dio_completed_IO(struct inode *inode)
+{
+	ext4_io_end_t *io;
+	int ret = 0;
+	int ret2 = 0;
+
+	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+		return ret;
+
+	dump_aio_dio_list(inode);
+	while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) {
+		io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+				ext4_io_end_t, list);
+		/*
+		 * Calling ext4_end_aio_dio_nolock() to convert completed
+		 * IO to written.
+		 *
+		 * When ext4_sync_file() is called, run_queue() may already
+		 * about to flush the work corresponding to this io structure.
+		 * It will be upset if it founds the io structure related
+		 * to the work-to-be schedule is freed.
+		 *
+		 * Thus we need to keep the io structure still valid here after
+		 * convertion finished. The io structure has a flag to
+		 * avoid double converting from both fsync and background work
+		 * queue work.
+		 */
+		ret = ext4_end_aio_dio_nolock(io);
+		if (ret < 0)
+			ret2 = ret;
+		else
+			list_del_init(&io->list);
+	}
+	return (ret2 < 0) ? ret2 : 0;
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+{
+	ext4_io_end_t *io = NULL;
+
+	io = kmalloc(sizeof(*io), GFP_NOFS);
+
+	if (io) {
+		igrab(inode);
+		io->inode = inode;
+		io->flag = 0;
+		io->offset = 0;
+		io->size = 0;
+		io->error = 0;
+		INIT_WORK(&io->work, ext4_end_aio_dio_work);
+		INIT_LIST_HEAD(&io->list);
+	}
+
+	return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+			    ssize_t size, void *private)
+{
+	ext4_io_end_t *io_end = iocb->private;
+	struct workqueue_struct *wq;
+
+	ext_debug("ext4_end_io_dio(): io_end 0x%p"
+		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+		  iocb->private, io_end->inode->i_ino, iocb, offset,
+		  size);
+	/* if not async direct IO or dio with 0 bytes write, just return */
+	if (!io_end || !size)
+		return;
+
+	/* if not aio dio with unwritten extents, just free io and return */
+	if (io_end->flag != DIO_AIO_UNWRITTEN) {
+		ext4_free_io_end(io_end);
+		iocb->private = NULL;
+		return;
+	}
+
+	io_end->offset = offset;
+	io_end->size = size;
+	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+
+	/* Add the io_end to per-inode completed aio dio list*/
+	list_add_tail(&io_end->list,
+		      &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+	iocb->private = NULL;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * The unwrritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the convertion
+ * when async direct IO completed.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list.  So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+	size_t count = iov_length(iov, nr_segs);
+
+	loff_t final_size = offset + count;
+	if (rw == WRITE && final_size <= inode->i_size) {
+		/*
+		 * We could direct write to holes and fallocate.
+		 *
+		 * Allocated blocks to fill the hole are marked as uninitialized
+		 * to prevent paralel buffered read to expose the stale data
+		 * before DIO complete the data IO.
+		 *
+		 * As to previously fallocated extents, ext4 get_block
+		 * will just simply mark the buffer mapped but still
+		 * keep the extents uninitialized.
+		 *
+		 * for non AIO case, we will convert those unwritten extents
+		 * to written after return back from blockdev_direct_IO.
+		 *
+		 * for async DIO, the conversion needs to be defered when
+		 * the IO is completed. The ext4 end_io callback function
+		 * will be called to take care of the conversion work.
+		 * Here for async case, we allocate an io_end structure to
+		 * hook to the iocb.
+		 */
+		iocb->private = NULL;
+		EXT4_I(inode)->cur_aio_dio = NULL;
+		if (!is_sync_kiocb(iocb)) {
+			iocb->private = ext4_init_io_end(inode);
+			if (!iocb->private)
+				return -ENOMEM;
+			/*
+			 * we save the io structure for current async
+			 * direct IO, so that later ext4_get_blocks()
+			 * could flag the io structure whether there
+			 * is a unwritten extents needs to be converted
+			 * when IO is completed.
+			 */
+			EXT4_I(inode)->cur_aio_dio = iocb->private;
+		}
+
+		ret = blockdev_direct_IO(rw, iocb, inode,
+					 inode->i_sb->s_bdev, iov,
+					 offset, nr_segs,
+					 ext4_get_block_dio_write,
+					 ext4_end_io_dio);
+		if (iocb->private)
+			EXT4_I(inode)->cur_aio_dio = NULL;
+		/*
+		 * The io_end structure takes a reference to the inode,
+		 * that structure needs to be destroyed and the
+		 * reference to the inode need to be dropped, when IO is
+		 * complete, even with 0 byte write, or failed.
+		 *
+		 * In the successful AIO DIO case, the io_end structure will be
+		 * desctroyed and the reference to the inode will be dropped
+		 * after the end_io call back function is called.
+		 *
+		 * In the case there is 0 byte write, or error case, since
+		 * VFS direct IO won't invoke the end_io call back function,
+		 * we need to free the end_io structure here.
+		 */
+		if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+			ext4_free_io_end(iocb->private);
+			iocb->private = NULL;
+		} else if (ret > 0)
+			/*
+			 * for non AIO case, since the IO is already
+			 * completed, we could do the convertion right here
+			 */
+			ret = ext4_convert_unwritten_extents(inode,
+							     offset, ret);
+		return ret;
+	}
+
+	/* for write the the end of file case, we fall back to old way */
+	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+		return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
  * activity. By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
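A summary of the machinery added above, as a sketch of one async direct write's life cycle (all names come from this hunk; dio_unwritten_wq and i_aio_dio_complete_list are presumably set up by the ext4.h/super.c side of this series, which is outside this file's diff):

	/* Life cycle of one async direct write into a hole below i_size:
	 *
	 * ext4_ext_direct_IO()
	 *   iocb->private = ext4_init_io_end(inode);        io->flag = 0
	 *   blockdev_direct_IO(..., ext4_get_block_dio_write, ext4_end_io_dio)
	 *     -> ext4_get_blocks() flags the io_end as DIO_AIO_UNWRITTEN via
	 *        EXT4_I(inode)->cur_aio_dio when it maps unwritten extents
	 *
	 * ext4_end_io_dio()                                 IO completion
	 *   queue_work(dio_unwritten_wq, &io_end->work);
	 *   list_add_tail(&io_end->list, &...i_aio_dio_complete_list);
	 *
	 * ext4_end_aio_dio_work()                           worker thread
	 *   ext4_end_aio_dio_nolock(io)
	 *     -> ext4_convert_unwritten_extents(inode, offset, size);
	 *   ext4_free_io_end(io);                           drops inode ref
	 *
	 * flush_aio_dio_completed_IO()                      fsync path
	 *   converts anything still queued, so fsync cannot return before
	 *   the unwritten->written conversion has been done
	 */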
@@ -3373,6 +3830,7 @@ static const struct address_space_operations ext4_ordered_aops = {
 	.direct_IO		= ext4_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 static const struct address_space_operations ext4_writeback_aops = {
@@ -3388,6 +3846,7 @@ static const struct address_space_operations ext4_writeback_aops = {
 	.direct_IO		= ext4_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 static const struct address_space_operations ext4_journalled_aops = {
@@ -3402,6 +3861,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 static const struct address_space_operations ext4_da_aops = {
@@ -3418,6 +3878,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.direct_IO		= ext4_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 void ext4_set_aops(struct inode *inode)
@@ -3659,7 +4120,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			ext4_handle_dirty_metadata(handle, inode, bh);
 	}
 	ext4_mark_inode_dirty(handle, inode);
-	ext4_journal_test_restart(handle, inode);
+	ext4_truncate_restart_trans(handle, inode,
+				    blocks_for_truncate(inode));
 	if (bh) {
 		BUFFER_TRACE(bh, "retaking write access");
 		ext4_journal_get_write_access(handle, bh);
@@ -3870,7 +4332,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			return;
 		if (try_to_extend_transaction(handle, inode)) {
 			ext4_mark_inode_dirty(handle, inode);
-			ext4_journal_test_restart(handle, inode);
+			ext4_truncate_restart_trans(handle, inode,
+				    blocks_for_truncate(inode));
 		}
 
 		ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3958,8 +4421,7 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
-	if (ei->i_disksize && inode->i_size == 0 &&
-	    !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
 
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4581,8 +5043,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	if (ext4_inode_blocks_set(handle, raw_inode, ei))
 		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-	/* clear the migrate flag in the raw_inode */
-	raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
+	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD))
 		raw_inode->i_file_acl_high =
@@ -4684,19 +5145,40 @@ out_brelse:
  */
 int ext4_write_inode(struct inode *inode, int wait)
 {
+	int err;
+
 	if (current->flags & PF_MEMALLOC)
 		return 0;
 
-	if (ext4_journal_current_handle()) {
-		jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
-		dump_stack();
-		return -EIO;
-	}
+	if (EXT4_SB(inode->i_sb)->s_journal) {
+		if (ext4_journal_current_handle()) {
+			jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+			dump_stack();
+			return -EIO;
+		}
 
-	if (!wait)
-		return 0;
+		if (!wait)
+			return 0;
 
-	return ext4_force_commit(inode->i_sb);
+		err = ext4_force_commit(inode->i_sb);
+	} else {
+		struct ext4_iloc iloc;
+
+		err = ext4_get_inode_loc(inode, &iloc);
+		if (err)
+			return err;
+		if (wait)
+			sync_dirty_buffer(iloc.bh);
+		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+			ext4_error(inode->i_sb, __func__,
+				   "IO error syncing inode, "
+				   "inode=%lu, block=%llu",
+				   inode->i_ino,
+				   (unsigned long long)iloc.bh->b_blocknr);
+			err = -EIO;
+		}
+	}
+	return err;
 }
 
 /*
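The new else branch makes ext4_write_inode() usable on no-journal mounts. The error-detection idiom deserves a gloss; the same pattern in isolation (bh stands in for iloc.bh):

	struct buffer_head *bh = iloc.bh;

	sync_dirty_buffer(bh);		/* submit the write and wait on it */
	if (buffer_req(bh) && !buffer_uptodate(bh))
		/* BH_Req records that a request was actually issued, and
		 * the block layer clears BH_Uptodate when that write fails */
		err = -EIO;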
@@ -5134,27 +5616,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  */
 void ext4_dirty_inode(struct inode *inode)
 {
-	handle_t *current_handle = ext4_journal_current_handle();
 	handle_t *handle;
 
-	if (!ext4_handle_valid(current_handle)) {
-		ext4_mark_inode_dirty(current_handle, inode);
-		return;
-	}
-
 	handle = ext4_journal_start(inode, 2);
 	if (IS_ERR(handle))
 		goto out;
-	if (current_handle &&
-		current_handle->h_transaction != handle->h_transaction) {
-		/* This task has a transaction open against a different fs */
-		printk(KERN_EMERG "%s: transactions do not match!\n",
-		       __func__);
-	} else {
-		jbd_debug(5, "marking dirty.  outer handle=%p\n",
-			  current_handle);
-		ext4_mark_inode_dirty(handle, inode);
-	}
+
+	ext4_mark_inode_dirty(handle, inode);
+
 	ext4_journal_stop(handle);
 out:
 	return;
@@ -5281,12 +5750,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	else
 		len = PAGE_CACHE_SIZE;
 
+	lock_page(page);
+	/*
+	 * return if we have all the buffers mapped. This avoid
+	 * the need to call write_begin/write_end which does a
+	 * journal_start/journal_stop which can block and take
+	 * long time
+	 */
 	if (page_has_buffers(page)) {
-		/* return if we have all the buffers mapped */
 		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-					ext4_bh_unmapped))
+				       ext4_bh_unmapped)) {
+			unlock_page(page);
 			goto out_unlock;
+		}
 	}
+	unlock_page(page);
 	/*
 	 * OK, we need to fill the hole... Do write_begin write_end
 	 * to do block allocation/reservation.We are not holding
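The fast path above depends on the walk_page_buffers() callback reporting unmapped buffers; it reuses the ext4_bh_unmapped() helper already present in this file, which (to the best of this note's knowledge) is simply:

	static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
	{
		return !buffer_mapped(bh);
	}

	/* With the page locked, "every buffer mapped" is a stable condition,
	 * so no journal handle needs to be started just to re-dirty it. */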