aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c1655
1 files changed, 1392 insertions, 263 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..9843b046c235 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
32#include <linux/string.h> 32#include <linux/string.h>
33#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <linux/pagevec.h>
35#include <linux/mpage.h> 36#include <linux/mpage.h>
36#include <linux/uio.h> 37#include <linux/uio.h>
37#include <linux/bio.h> 38#include <linux/bio.h>
38#include "ext4_jbd2.h" 39#include "ext4_jbd2.h"
39#include "xattr.h" 40#include "xattr.h"
40#include "acl.h" 41#include "acl.h"
42#include "ext4_extents.h"
43
44static inline int ext4_begin_ordered_truncate(struct inode *inode,
45 loff_t new_size)
46{
47 return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
48 new_size);
49}
50
51static void ext4_invalidatepage(struct page *page, unsigned long offset);
41 52
42/* 53/*
43 * Test whether an inode is a fast symlink. 54 * Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
181{ 192{
182 handle_t *handle; 193 handle_t *handle;
183 194
195 if (ext4_should_order_data(inode))
196 ext4_begin_ordered_truncate(inode, 0);
184 truncate_inode_pages(&inode->i_data, 0); 197 truncate_inode_pages(&inode->i_data, 0);
185 198
186 if (is_bad_inode(inode)) 199 if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
508 * direct blocks 521 * direct blocks
509 */ 522 */
510static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 523static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
511 ext4_fsblk_t goal, int indirect_blks, int blks, 524 ext4_lblk_t iblock, ext4_fsblk_t goal,
512 ext4_fsblk_t new_blocks[4], int *err) 525 int indirect_blks, int blks,
526 ext4_fsblk_t new_blocks[4], int *err)
513{ 527{
514 int target, i; 528 int target, i;
515 unsigned long count = 0; 529 unsigned long count = 0, blk_allocated = 0;
516 int index = 0; 530 int index = 0;
517 ext4_fsblk_t current_block = 0; 531 ext4_fsblk_t current_block = 0;
518 int ret = 0; 532 int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
525 * the first direct block of this branch. That's the 539 * the first direct block of this branch. That's the
526 * minimum number of blocks need to allocate(required) 540 * minimum number of blocks need to allocate(required)
527 */ 541 */
528 target = blks + indirect_blks; 542 /* first we try to allocate the indirect blocks */
529 543 target = indirect_blks;
530 while (1) { 544 while (target > 0) {
531 count = target; 545 count = target;
532 /* allocating blocks for indirect blocks and direct blocks */ 546 /* allocating blocks for indirect blocks and direct blocks */
533 current_block = ext4_new_blocks(handle,inode,goal,&count,err); 547 current_block = ext4_new_meta_blocks(handle, inode,
548 goal, &count, err);
534 if (*err) 549 if (*err)
535 goto failed_out; 550 goto failed_out;
536 551
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
540 new_blocks[index++] = current_block++; 555 new_blocks[index++] = current_block++;
541 count--; 556 count--;
542 } 557 }
543 558 if (count > 0) {
544 if (count > 0) 559 /*
560 * save the new block number
561 * for the first direct block
562 */
563 new_blocks[index] = current_block;
564 printk(KERN_INFO "%s returned more blocks than "
565 "requested\n", __func__);
566 WARN_ON(1);
545 break; 567 break;
568 }
546 } 569 }
547 570
548 /* save the new block number for the first direct block */ 571 target = blks - count ;
549 new_blocks[index] = current_block; 572 blk_allocated = count;
550 573 if (!target)
574 goto allocated;
575 /* Now allocate data blocks */
576 count = target;
577 /* allocating blocks for data blocks */
578 current_block = ext4_new_blocks(handle, inode, iblock,
579 goal, &count, err);
580 if (*err && (target == blks)) {
581 /*
582 * if the allocation failed and we didn't allocate
583 * any blocks before
584 */
585 goto failed_out;
586 }
587 if (!*err) {
588 if (target == blks) {
589 /*
590 * save the new block number
591 * for the first direct block
592 */
593 new_blocks[index] = current_block;
594 }
595 blk_allocated += count;
596 }
597allocated:
551 /* total number of blocks allocated for direct blocks */ 598 /* total number of blocks allocated for direct blocks */
552 ret = count; 599 ret = blk_allocated;
553 *err = 0; 600 *err = 0;
554 return ret; 601 return ret;
555failed_out: 602failed_out:
@@ -584,8 +631,9 @@ failed_out:
584 * as described above and return 0. 631 * as described above and return 0.
585 */ 632 */
586static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 633static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
587 int indirect_blks, int *blks, ext4_fsblk_t goal, 634 ext4_lblk_t iblock, int indirect_blks,
588 ext4_lblk_t *offsets, Indirect *branch) 635 int *blks, ext4_fsblk_t goal,
636 ext4_lblk_t *offsets, Indirect *branch)
589{ 637{
590 int blocksize = inode->i_sb->s_blocksize; 638 int blocksize = inode->i_sb->s_blocksize;
591 int i, n = 0; 639 int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
595 ext4_fsblk_t new_blocks[4]; 643 ext4_fsblk_t new_blocks[4];
596 ext4_fsblk_t current_block; 644 ext4_fsblk_t current_block;
597 645
598 num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, 646 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
599 *blks, new_blocks, &err); 647 *blks, new_blocks, &err);
600 if (err) 648 if (err)
601 return err; 649 return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
799 struct ext4_inode_info *ei = EXT4_I(inode); 847 struct ext4_inode_info *ei = EXT4_I(inode);
800 int count = 0; 848 int count = 0;
801 ext4_fsblk_t first_block = 0; 849 ext4_fsblk_t first_block = 0;
850 loff_t disksize;
802 851
803 852
804 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 853 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
855 /* 904 /*
856 * Block out ext4_truncate while we alter the tree 905 * Block out ext4_truncate while we alter the tree
857 */ 906 */
858 err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, 907 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
859 offsets + (partial - chain), partial); 908 &count, goal,
909 offsets + (partial - chain), partial);
860 910
861 /* 911 /*
862 * The ext4_splice_branch call will free and forget any buffers 912 * The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
873 * protect it if you're about to implement concurrent 923 * protect it if you're about to implement concurrent
874 * ext4_get_block() -bzzz 924 * ext4_get_block() -bzzz
875 */ 925 */
876 if (!err && extend_disksize && inode->i_size > ei->i_disksize) 926 if (!err && extend_disksize) {
877 ei->i_disksize = inode->i_size; 927 disksize = ((loff_t) iblock + count) << inode->i_blkbits;
928 if (disksize > i_size_read(inode))
929 disksize = i_size_read(inode);
930 if (disksize > ei->i_disksize)
931 ei->i_disksize = disksize;
932 }
878 if (err) 933 if (err)
879 goto cleanup; 934 goto cleanup;
880 935
@@ -934,7 +989,7 @@ out:
934 */ 989 */
935int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 990int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
936 unsigned long max_blocks, struct buffer_head *bh, 991 unsigned long max_blocks, struct buffer_head *bh,
937 int create, int extend_disksize) 992 int create, int extend_disksize, int flag)
938{ 993{
939 int retval; 994 int retval;
940 995
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
975 * with create == 1 flag. 1030 * with create == 1 flag.
976 */ 1031 */
977 down_write((&EXT4_I(inode)->i_data_sem)); 1032 down_write((&EXT4_I(inode)->i_data_sem));
1033
1034 /*
1035 * if the caller is from delayed allocation writeout path
1036 * we have already reserved fs blocks for allocation
1037 * let the underlying get_block() function know to
1038 * avoid double accounting
1039 */
1040 if (flag)
1041 EXT4_I(inode)->i_delalloc_reserved_flag = 1;
978 /* 1042 /*
979 * We need to check for EXT4 here because migrate 1043 * We need to check for EXT4 here because migrate
980 * could have changed the inode type in between 1044 * could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
996 ~EXT4_EXT_MIGRATE; 1060 ~EXT4_EXT_MIGRATE;
997 } 1061 }
998 } 1062 }
1063
1064 if (flag) {
1065 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1066 /*
1067 * Update reserved blocks/metadata blocks
1068 * after successful block allocation
1069 * which were deferred till now
1070 */
1071 if ((retval > 0) && buffer_delay(bh))
1072 ext4_da_release_space(inode, retval, 0);
1073 }
1074
999 up_write((&EXT4_I(inode)->i_data_sem)); 1075 up_write((&EXT4_I(inode)->i_data_sem));
1000 return retval; 1076 return retval;
1001} 1077}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
1021 } 1097 }
1022 1098
1023 ret = ext4_get_blocks_wrap(handle, inode, iblock, 1099 ret = ext4_get_blocks_wrap(handle, inode, iblock,
1024 max_blocks, bh_result, create, 0); 1100 max_blocks, bh_result, create, 0, 0);
1025 if (ret > 0) { 1101 if (ret > 0) {
1026 bh_result->b_size = (ret << inode->i_blkbits); 1102 bh_result->b_size = (ret << inode->i_blkbits);
1027 ret = 0; 1103 ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1047 dummy.b_blocknr = -1000; 1123 dummy.b_blocknr = -1000;
1048 buffer_trace_init(&dummy.b_history); 1124 buffer_trace_init(&dummy.b_history);
1049 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1125 err = ext4_get_blocks_wrap(handle, inode, block, 1,
1050 &dummy, create, 1); 1126 &dummy, create, 1, 0);
1051 /* 1127 /*
1052 * ext4_get_blocks_handle() returns number of blocks 1128 * ext4_get_blocks_handle() returns number of blocks
1053 * mapped. 0 in case of a HOLE. 1129 * mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1203 to = from + len; 1279 to = from + len;
1204 1280
1205retry: 1281retry:
1206 page = __grab_cache_page(mapping, index);
1207 if (!page)
1208 return -ENOMEM;
1209 *pagep = page;
1210
1211 handle = ext4_journal_start(inode, needed_blocks); 1282 handle = ext4_journal_start(inode, needed_blocks);
1212 if (IS_ERR(handle)) { 1283 if (IS_ERR(handle)) {
1213 unlock_page(page);
1214 page_cache_release(page);
1215 ret = PTR_ERR(handle); 1284 ret = PTR_ERR(handle);
1216 goto out; 1285 goto out;
1217 } 1286 }
1218 1287
1288 page = __grab_cache_page(mapping, index);
1289 if (!page) {
1290 ext4_journal_stop(handle);
1291 ret = -ENOMEM;
1292 goto out;
1293 }
1294 *pagep = page;
1295
1219 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1296 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1220 ext4_get_block); 1297 ext4_get_block);
1221 1298
@@ -1225,8 +1302,8 @@ retry:
1225 } 1302 }
1226 1303
1227 if (ret) { 1304 if (ret) {
1228 ext4_journal_stop(handle);
1229 unlock_page(page); 1305 unlock_page(page);
1306 ext4_journal_stop(handle);
1230 page_cache_release(page); 1307 page_cache_release(page);
1231 } 1308 }
1232 1309
@@ -1236,15 +1313,6 @@ out:
1236 return ret; 1313 return ret;
1237} 1314}
1238 1315
1239int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1240{
1241 int err = jbd2_journal_dirty_data(handle, bh);
1242 if (err)
1243 ext4_journal_abort_handle(__func__, __func__,
1244 bh, handle, err);
1245 return err;
1246}
1247
1248/* For write_end() in data=journal mode */ 1316/* For write_end() in data=journal mode */
1249static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1317static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1250{ 1318{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1255} 1323}
1256 1324
1257/* 1325/*
1258 * Generic write_end handler for ordered and writeback ext4 journal modes.
1259 * We can't use generic_write_end, because that unlocks the page and we need to
1260 * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
1261 * after block_write_end.
1262 */
1263static int ext4_generic_write_end(struct file *file,
1264 struct address_space *mapping,
1265 loff_t pos, unsigned len, unsigned copied,
1266 struct page *page, void *fsdata)
1267{
1268 struct inode *inode = file->f_mapping->host;
1269
1270 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1271
1272 if (pos+copied > inode->i_size) {
1273 i_size_write(inode, pos+copied);
1274 mark_inode_dirty(inode);
1275 }
1276
1277 return copied;
1278}
1279
1280/*
1281 * We need to pick up the new inode size which generic_commit_write gave us 1326 * We need to pick up the new inode size which generic_commit_write gave us
1282 * `file' can be NULL - eg, when called from page_symlink(). 1327 * `file' can be NULL - eg, when called from page_symlink().
1283 * 1328 *
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
1290 struct page *page, void *fsdata) 1335 struct page *page, void *fsdata)
1291{ 1336{
1292 handle_t *handle = ext4_journal_current_handle(); 1337 handle_t *handle = ext4_journal_current_handle();
1293 struct inode *inode = file->f_mapping->host; 1338 struct inode *inode = mapping->host;
1294 unsigned from, to; 1339 unsigned from, to;
1295 int ret = 0, ret2; 1340 int ret = 0, ret2;
1296 1341
1297 from = pos & (PAGE_CACHE_SIZE - 1); 1342 from = pos & (PAGE_CACHE_SIZE - 1);
1298 to = from + len; 1343 to = from + len;
1299 1344
1300 ret = walk_page_buffers(handle, page_buffers(page), 1345 ret = ext4_jbd2_file_inode(handle, inode);
1301 from, to, NULL, ext4_journal_dirty_data);
1302 1346
1303 if (ret == 0) { 1347 if (ret == 0) {
1304 /* 1348 /*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
1311 new_i_size = pos + copied; 1355 new_i_size = pos + copied;
1312 if (new_i_size > EXT4_I(inode)->i_disksize) 1356 if (new_i_size > EXT4_I(inode)->i_disksize)
1313 EXT4_I(inode)->i_disksize = new_i_size; 1357 EXT4_I(inode)->i_disksize = new_i_size;
1314 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1358 ret2 = generic_write_end(file, mapping, pos, len, copied,
1315 page, fsdata); 1359 page, fsdata);
1316 copied = ret2; 1360 copied = ret2;
1317 if (ret2 < 0) 1361 if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
1320 ret2 = ext4_journal_stop(handle); 1364 ret2 = ext4_journal_stop(handle);
1321 if (!ret) 1365 if (!ret)
1322 ret = ret2; 1366 ret = ret2;
1323 unlock_page(page);
1324 page_cache_release(page);
1325 1367
1326 return ret ? ret : copied; 1368 return ret ? ret : copied;
1327} 1369}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
1332 struct page *page, void *fsdata) 1374 struct page *page, void *fsdata)
1333{ 1375{
1334 handle_t *handle = ext4_journal_current_handle(); 1376 handle_t *handle = ext4_journal_current_handle();
1335 struct inode *inode = file->f_mapping->host; 1377 struct inode *inode = mapping->host;
1336 int ret = 0, ret2; 1378 int ret = 0, ret2;
1337 loff_t new_i_size; 1379 loff_t new_i_size;
1338 1380
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
1340 if (new_i_size > EXT4_I(inode)->i_disksize) 1382 if (new_i_size > EXT4_I(inode)->i_disksize)
1341 EXT4_I(inode)->i_disksize = new_i_size; 1383 EXT4_I(inode)->i_disksize = new_i_size;
1342 1384
1343 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1385 ret2 = generic_write_end(file, mapping, pos, len, copied,
1344 page, fsdata); 1386 page, fsdata);
1345 copied = ret2; 1387 copied = ret2;
1346 if (ret2 < 0) 1388 if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
1349 ret2 = ext4_journal_stop(handle); 1391 ret2 = ext4_journal_stop(handle);
1350 if (!ret) 1392 if (!ret)
1351 ret = ret2; 1393 ret = ret2;
1352 unlock_page(page);
1353 page_cache_release(page);
1354 1394
1355 return ret ? ret : copied; 1395 return ret ? ret : copied;
1356} 1396}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
1389 ret = ret2; 1429 ret = ret2;
1390 } 1430 }
1391 1431
1432 unlock_page(page);
1392 ret2 = ext4_journal_stop(handle); 1433 ret2 = ext4_journal_stop(handle);
1393 if (!ret) 1434 if (!ret)
1394 ret = ret2; 1435 ret = ret2;
1395 unlock_page(page);
1396 page_cache_release(page); 1436 page_cache_release(page);
1397 1437
1398 return ret ? ret : copied; 1438 return ret ? ret : copied;
1399} 1439}
1440/*
1441 * Calculate the number of metadata blocks need to reserve
1442 * to allocate @blocks for non extent file based file
1443 */
1444static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
1445{
1446 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1447 int ind_blks, dind_blks, tind_blks;
1448
1449 /* number of new indirect blocks needed */
1450 ind_blks = (blocks + icap - 1) / icap;
1451
1452 dind_blks = (ind_blks + icap - 1) / icap;
1453
1454 tind_blks = 1;
1455
1456 return ind_blks + dind_blks + tind_blks;
1457}
1458
1459/*
1460 * Calculate the number of metadata blocks need to reserve
1461 * to allocate given number of blocks
1462 */
1463static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
1464{
1465 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1466 return ext4_ext_calc_metadata_amount(inode, blocks);
1467
1468 return ext4_indirect_calc_metadata_amount(inode, blocks);
1469}
1470
1471static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1472{
1473 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1474 unsigned long md_needed, mdblocks, total = 0;
1475
1476 /*
1477 * recalculate the amount of metadata blocks to reserve
1478 * in order to allocate nrblocks
1479 * worse case is one extent per block
1480 */
1481 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1482 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1483 mdblocks = ext4_calc_metadata_amount(inode, total);
1484 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
1485
1486 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1487 total = md_needed + nrblocks;
1488
1489 if (ext4_has_free_blocks(sbi, total) < total) {
1490 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1491 return -ENOSPC;
1492 }
1493
1494 /* reduce fs free blocks counter */
1495 percpu_counter_sub(&sbi->s_freeblocks_counter, total);
1496
1497 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1498 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1499
1500 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1501 return 0; /* success */
1502}
1503
1504void ext4_da_release_space(struct inode *inode, int used, int to_free)
1505{
1506 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1507 int total, mdb, mdb_free, release;
1508
1509 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1510 /* recalculate the number of metablocks still need to be reserved */
1511 total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
1512 mdb = ext4_calc_metadata_amount(inode, total);
1513
1514 /* figure out how many metablocks to release */
1515 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1516 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1517
1518 /* Account for allocated meta_blocks */
1519 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1520
1521 release = to_free + mdb_free;
1522
1523 /* update fs free blocks counter for truncate case */
1524 percpu_counter_add(&sbi->s_freeblocks_counter, release);
1525
1526 /* update per-inode reservations */
1527 BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
1528 EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
1529
1530 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1531 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1532 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1533 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1534}
1535
1536static void ext4_da_page_release_reservation(struct page *page,
1537 unsigned long offset)
1538{
1539 int to_release = 0;
1540 struct buffer_head *head, *bh;
1541 unsigned int curr_off = 0;
1542
1543 head = page_buffers(page);
1544 bh = head;
1545 do {
1546 unsigned int next_off = curr_off + bh->b_size;
1547
1548 if ((offset <= curr_off) && (buffer_delay(bh))) {
1549 to_release++;
1550 clear_buffer_delay(bh);
1551 }
1552 curr_off = next_off;
1553 } while ((bh = bh->b_this_page) != head);
1554 ext4_da_release_space(page->mapping->host, 0, to_release);
1555}
1556
1557/*
1558 * Delayed allocation stuff
1559 */
1560
1561struct mpage_da_data {
1562 struct inode *inode;
1563 struct buffer_head lbh; /* extent of blocks */
1564 unsigned long first_page, next_page; /* extent of pages */
1565 get_block_t *get_block;
1566 struct writeback_control *wbc;
1567};
1568
1569/*
1570 * mpage_da_submit_io - walks through extent of pages and try to write
1571 * them with __mpage_writepage()
1572 *
1573 * @mpd->inode: inode
1574 * @mpd->first_page: first page of the extent
1575 * @mpd->next_page: page after the last page of the extent
1576 * @mpd->get_block: the filesystem's block mapper function
1577 *
1578 * By the time mpage_da_submit_io() is called we expect all blocks
1579 * to be allocated. this may be wrong if allocation failed.
1580 *
1581 * As pages are already locked by write_cache_pages(), we can't use it
1582 */
1583static int mpage_da_submit_io(struct mpage_da_data *mpd)
1584{
1585 struct address_space *mapping = mpd->inode->i_mapping;
1586 struct mpage_data mpd_pp = {
1587 .bio = NULL,
1588 .last_block_in_bio = 0,
1589 .get_block = mpd->get_block,
1590 .use_writepage = 1,
1591 };
1592 int ret = 0, err, nr_pages, i;
1593 unsigned long index, end;
1594 struct pagevec pvec;
1595
1596 BUG_ON(mpd->next_page <= mpd->first_page);
1597
1598 pagevec_init(&pvec, 0);
1599 index = mpd->first_page;
1600 end = mpd->next_page - 1;
1601
1602 while (index <= end) {
1603 /* XXX: optimize tail */
1604 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1605 if (nr_pages == 0)
1606 break;
1607 for (i = 0; i < nr_pages; i++) {
1608 struct page *page = pvec.pages[i];
1609
1610 index = page->index;
1611 if (index > end)
1612 break;
1613 index++;
1614
1615 err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
1616
1617 /*
1618 * In error case, we have to continue because
1619 * remaining pages are still locked
1620 * XXX: unlock and re-dirty them?
1621 */
1622 if (ret == 0)
1623 ret = err;
1624 }
1625 pagevec_release(&pvec);
1626 }
1627 if (mpd_pp.bio)
1628 mpage_bio_submit(WRITE, mpd_pp.bio);
1629
1630 return ret;
1631}
1632
1633/*
1634 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
1635 *
1636 * @mpd->inode - inode to walk through
1637 * @exbh->b_blocknr - first block on a disk
1638 * @exbh->b_size - amount of space in bytes
1639 * @logical - first logical block to start assignment with
1640 *
1641 * the function goes through all passed space and put actual disk
1642 * block numbers into buffer heads, dropping BH_Delay
1643 */
1644static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1645 struct buffer_head *exbh)
1646{
1647 struct inode *inode = mpd->inode;
1648 struct address_space *mapping = inode->i_mapping;
1649 int blocks = exbh->b_size >> inode->i_blkbits;
1650 sector_t pblock = exbh->b_blocknr, cur_logical;
1651 struct buffer_head *head, *bh;
1652 unsigned long index, end;
1653 struct pagevec pvec;
1654 int nr_pages, i;
1655
1656 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1657 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1658 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1659
1660 pagevec_init(&pvec, 0);
1661
1662 while (index <= end) {
1663 /* XXX: optimize tail */
1664 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1665 if (nr_pages == 0)
1666 break;
1667 for (i = 0; i < nr_pages; i++) {
1668 struct page *page = pvec.pages[i];
1669
1670 index = page->index;
1671 if (index > end)
1672 break;
1673 index++;
1674
1675 BUG_ON(!PageLocked(page));
1676 BUG_ON(PageWriteback(page));
1677 BUG_ON(!page_has_buffers(page));
1678
1679 bh = page_buffers(page);
1680 head = bh;
1681
1682 /* skip blocks out of the range */
1683 do {
1684 if (cur_logical >= logical)
1685 break;
1686 cur_logical++;
1687 } while ((bh = bh->b_this_page) != head);
1688
1689 do {
1690 if (cur_logical >= logical + blocks)
1691 break;
1692 if (buffer_delay(bh)) {
1693 bh->b_blocknr = pblock;
1694 clear_buffer_delay(bh);
1695 } else if (buffer_mapped(bh))
1696 BUG_ON(bh->b_blocknr != pblock);
1697
1698 cur_logical++;
1699 pblock++;
1700 } while ((bh = bh->b_this_page) != head);
1701 }
1702 pagevec_release(&pvec);
1703 }
1704}
1705
1706
1707/*
1708 * __unmap_underlying_blocks - just a helper function to unmap
1709 * set of blocks described by @bh
1710 */
1711static inline void __unmap_underlying_blocks(struct inode *inode,
1712 struct buffer_head *bh)
1713{
1714 struct block_device *bdev = inode->i_sb->s_bdev;
1715 int blocks, i;
1716
1717 blocks = bh->b_size >> inode->i_blkbits;
1718 for (i = 0; i < blocks; i++)
1719 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1720}
1721
1722/*
1723 * mpage_da_map_blocks - go through given space
1724 *
1725 * @mpd->lbh - bh describing space
1726 * @mpd->get_block - the filesystem's block mapper function
1727 *
1728 * The function skips space we know is already mapped to disk blocks.
1729 *
1730 * The function ignores errors ->get_block() returns, thus real
1731 * error handling is postponed to __mpage_writepage()
1732 */
1733static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1734{
1735 struct buffer_head *lbh = &mpd->lbh;
1736 int err = 0, remain = lbh->b_size;
1737 sector_t next = lbh->b_blocknr;
1738 struct buffer_head new;
1739
1740 /*
1741 * We consider only non-mapped and non-allocated blocks
1742 */
1743 if (buffer_mapped(lbh) && !buffer_delay(lbh))
1744 return;
1745
1746 while (remain) {
1747 new.b_state = lbh->b_state;
1748 new.b_blocknr = 0;
1749 new.b_size = remain;
1750 err = mpd->get_block(mpd->inode, next, &new, 1);
1751 if (err) {
1752 /*
1753 * Rather than implement own error handling
1754 * here, we just leave remaining blocks
1755 * unallocated and try again with ->writepage()
1756 */
1757 break;
1758 }
1759 BUG_ON(new.b_size == 0);
1760
1761 if (buffer_new(&new))
1762 __unmap_underlying_blocks(mpd->inode, &new);
1763
1764 /*
1765 * If blocks are delayed marked, we need to
1766 * put actual blocknr and drop delayed bit
1767 */
1768 if (buffer_delay(lbh))
1769 mpage_put_bnr_to_bhs(mpd, next, &new);
1770
1771 /* go for the remaining blocks */
1772 next += new.b_size >> mpd->inode->i_blkbits;
1773 remain -= new.b_size;
1774 }
1775}
1776
1777#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
1778
1779/*
1780 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1781 *
1782 * @mpd->lbh - extent of blocks
1783 * @logical - logical number of the block in the file
1784 * @bh - bh of the block (used to access block's state)
1785 *
1786 * the function is used to collect contig. blocks in same state
1787 */
1788static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1789 sector_t logical, struct buffer_head *bh)
1790{
1791 struct buffer_head *lbh = &mpd->lbh;
1792 sector_t next;
1793
1794 next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
1795
1796 /*
1797 * First block in the extent
1798 */
1799 if (lbh->b_size == 0) {
1800 lbh->b_blocknr = logical;
1801 lbh->b_size = bh->b_size;
1802 lbh->b_state = bh->b_state & BH_FLAGS;
1803 return;
1804 }
1805
1806 /*
1807 * Can we merge the block to our big extent?
1808 */
1809 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
1810 lbh->b_size += bh->b_size;
1811 return;
1812 }
1813
1814 /*
1815 * We couldn't merge the block to our extent, so we
1816 * need to flush current extent and start new one
1817 */
1818 mpage_da_map_blocks(mpd);
1819
1820 /*
1821 * Now start a new extent
1822 */
1823 lbh->b_size = bh->b_size;
1824 lbh->b_state = bh->b_state & BH_FLAGS;
1825 lbh->b_blocknr = logical;
1826}
1827
1828/*
1829 * __mpage_da_writepage - finds extent of pages and blocks
1830 *
1831 * @page: page to consider
1832 * @wbc: not used, we just follow rules
1833 * @data: context
1834 *
1835 * The function finds extents of pages and scan them for all blocks.
1836 */
1837static int __mpage_da_writepage(struct page *page,
1838 struct writeback_control *wbc, void *data)
1839{
1840 struct mpage_da_data *mpd = data;
1841 struct inode *inode = mpd->inode;
1842 struct buffer_head *bh, *head, fake;
1843 sector_t logical;
1844
1845 /*
1846 * Can we merge this page to current extent?
1847 */
1848 if (mpd->next_page != page->index) {
1849 /*
1850 * Nope, we can't. So, we map non-allocated blocks
1851 * and start IO on them using __mpage_writepage()
1852 */
1853 if (mpd->next_page != mpd->first_page) {
1854 mpage_da_map_blocks(mpd);
1855 mpage_da_submit_io(mpd);
1856 }
1857
1858 /*
1859 * Start next extent of pages ...
1860 */
1861 mpd->first_page = page->index;
1862
1863 /*
1864 * ... and blocks
1865 */
1866 mpd->lbh.b_size = 0;
1867 mpd->lbh.b_state = 0;
1868 mpd->lbh.b_blocknr = 0;
1869 }
1870
1871 mpd->next_page = page->index + 1;
1872 logical = (sector_t) page->index <<
1873 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1874
1875 if (!page_has_buffers(page)) {
1876 /*
1877 * There is no attached buffer heads yet (mmap?)
1878 * we treat the page asfull of dirty blocks
1879 */
1880 bh = &fake;
1881 bh->b_size = PAGE_CACHE_SIZE;
1882 bh->b_state = 0;
1883 set_buffer_dirty(bh);
1884 set_buffer_uptodate(bh);
1885 mpage_add_bh_to_extent(mpd, logical, bh);
1886 } else {
1887 /*
1888 * Page with regular buffer heads, just add all dirty ones
1889 */
1890 head = page_buffers(page);
1891 bh = head;
1892 do {
1893 BUG_ON(buffer_locked(bh));
1894 if (buffer_dirty(bh))
1895 mpage_add_bh_to_extent(mpd, logical, bh);
1896 logical++;
1897 } while ((bh = bh->b_this_page) != head);
1898 }
1899
1900 return 0;
1901}
1902
1903/*
1904 * mpage_da_writepages - walk the list of dirty pages of the given
1905 * address space, allocates non-allocated blocks, maps newly-allocated
1906 * blocks to existing bhs and issue IO them
1907 *
1908 * @mapping: address space structure to write
1909 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1910 * @get_block: the filesystem's block mapper function.
1911 *
1912 * This is a library function, which implements the writepages()
1913 * address_space_operation.
1914 *
1915 * In order to avoid duplication of logic that deals with partial pages,
1916 * multiple bio per page, etc, we find non-allocated blocks, allocate
1917 * them with minimal calls to ->get_block() and re-use __mpage_writepage()
1918 *
1919 * It's important that we call __mpage_writepage() only once for each
1920 * involved page, otherwise we'd have to implement more complicated logic
1921 * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
1922 *
1923 * See comments to mpage_writepages()
1924 */
1925static int mpage_da_writepages(struct address_space *mapping,
1926 struct writeback_control *wbc,
1927 get_block_t get_block)
1928{
1929 struct mpage_da_data mpd;
1930 int ret;
1931
1932 if (!get_block)
1933 return generic_writepages(mapping, wbc);
1934
1935 mpd.wbc = wbc;
1936 mpd.inode = mapping->host;
1937 mpd.lbh.b_size = 0;
1938 mpd.lbh.b_state = 0;
1939 mpd.lbh.b_blocknr = 0;
1940 mpd.first_page = 0;
1941 mpd.next_page = 0;
1942 mpd.get_block = get_block;
1943
1944 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
1945
1946 /*
1947 * Handle last extent of pages
1948 */
1949 if (mpd.next_page != mpd.first_page) {
1950 mpage_da_map_blocks(&mpd);
1951 mpage_da_submit_io(&mpd);
1952 }
1953
1954 return ret;
1955}
1956
1957/*
1958 * this is a special callback for ->write_begin() only
1959 * it's intention is to return mapped block or reserve space
1960 */
1961static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1962 struct buffer_head *bh_result, int create)
1963{
1964 int ret = 0;
1965
1966 BUG_ON(create == 0);
1967 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
1968
1969 /*
1970 * first, we need to know whether the block is allocated already
1971 * preallocated blocks are unmapped but should treated
1972 * the same as allocated blocks.
1973 */
1974 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
1975 if ((ret == 0) && !buffer_delay(bh_result)) {
1976 /* the block isn't (pre)allocated yet, let's reserve space */
1977 /*
1978 * XXX: __block_prepare_write() unmaps passed block,
1979 * is it OK?
1980 */
1981 ret = ext4_da_reserve_space(inode, 1);
1982 if (ret)
1983 /* not enough space to reserve */
1984 return ret;
1985
1986 map_bh(bh_result, inode->i_sb, 0);
1987 set_buffer_new(bh_result);
1988 set_buffer_delay(bh_result);
1989 } else if (ret > 0) {
1990 bh_result->b_size = (ret << inode->i_blkbits);
1991 ret = 0;
1992 }
1993
1994 return ret;
1995}
1996#define EXT4_DELALLOC_RSVED 1
1997static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1998 struct buffer_head *bh_result, int create)
1999{
2000 int ret;
2001 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2002 loff_t disksize = EXT4_I(inode)->i_disksize;
2003 handle_t *handle = NULL;
2004
2005 handle = ext4_journal_current_handle();
2006 if (!handle) {
2007 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2008 bh_result, 0, 0, 0);
2009 BUG_ON(!ret);
2010 } else {
2011 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2012 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2013 }
2014
2015 if (ret > 0) {
2016 bh_result->b_size = (ret << inode->i_blkbits);
2017
2018 /*
2019 * Update on-disk size along with block allocation
2020 * we don't use 'extend_disksize' as size may change
2021 * within already allocated block -bzzz
2022 */
2023 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2024 if (disksize > i_size_read(inode))
2025 disksize = i_size_read(inode);
2026 if (disksize > EXT4_I(inode)->i_disksize) {
2027 /*
2028 * XXX: replace with spinlock if seen contended -bzzz
2029 */
2030 down_write(&EXT4_I(inode)->i_data_sem);
2031 if (disksize > EXT4_I(inode)->i_disksize)
2032 EXT4_I(inode)->i_disksize = disksize;
2033 up_write(&EXT4_I(inode)->i_data_sem);
2034
2035 if (EXT4_I(inode)->i_disksize == disksize) {
2036 ret = ext4_mark_inode_dirty(handle, inode);
2037 return ret;
2038 }
2039 }
2040 ret = 0;
2041 }
2042 return ret;
2043}
2044
2045static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2046{
2047 /*
2048 * unmapped buffer is possible for holes.
2049 * delay buffer is possible with delayed allocation
2050 */
2051 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
2052}
2053
2054static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
2055 struct buffer_head *bh_result, int create)
2056{
2057 int ret = 0;
2058 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2059
2060 /*
2061 * we don't want to do block allocation in writepage
2062 * so call get_block_wrap with create = 0
2063 */
2064 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
2065 bh_result, 0, 0, 0);
2066 if (ret > 0) {
2067 bh_result->b_size = (ret << inode->i_blkbits);
2068 ret = 0;
2069 }
2070 return ret;
2071}
2072
2073/*
2074 * get called vi ext4_da_writepages after taking page lock (have journal handle)
2075 * get called via journal_submit_inode_data_buffers (no journal handle)
2076 * get called via shrink_page_list via pdflush (no journal handle)
2077 * or grab_page_cache when doing write_begin (have journal handle)
2078 */
2079static int ext4_da_writepage(struct page *page,
2080 struct writeback_control *wbc)
2081{
2082 int ret = 0;
2083 loff_t size;
2084 unsigned long len;
2085 struct buffer_head *page_bufs;
2086 struct inode *inode = page->mapping->host;
2087
2088 size = i_size_read(inode);
2089 if (page->index == size >> PAGE_CACHE_SHIFT)
2090 len = size & ~PAGE_CACHE_MASK;
2091 else
2092 len = PAGE_CACHE_SIZE;
2093
2094 if (page_has_buffers(page)) {
2095 page_bufs = page_buffers(page);
2096 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2097 ext4_bh_unmapped_or_delay)) {
2098 /*
2099 * We don't want to do block allocation
2100 * So redirty the page and return
2101 * We may reach here when we do a journal commit
2102 * via journal_submit_inode_data_buffers.
2103 * If we don't have mapping block we just ignore
2104 * them. We can also reach here via shrink_page_list
2105 */
2106 redirty_page_for_writepage(wbc, page);
2107 unlock_page(page);
2108 return 0;
2109 }
2110 } else {
2111 /*
2112 * The test for page_has_buffers() is subtle:
2113 * We know the page is dirty but it lost buffers. That means
2114 * that at some moment in time after write_begin()/write_end()
2115 * has been called all buffers have been clean and thus they
2116 * must have been written at least once. So they are all
2117 * mapped and we can happily proceed with mapping them
2118 * and writing the page.
2119 *
2120 * Try to initialize the buffer_heads and check whether
2121 * all are mapped and non delay. We don't want to
2122 * do block allocation here.
2123 */
2124 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
2125 ext4_normal_get_block_write);
2126 if (!ret) {
2127 page_bufs = page_buffers(page);
2128 /* check whether all are mapped and non delay */
2129 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2130 ext4_bh_unmapped_or_delay)) {
2131 redirty_page_for_writepage(wbc, page);
2132 unlock_page(page);
2133 return 0;
2134 }
2135 } else {
2136 /*
2137 * We can't do block allocation here
2138 * so just redity the page and unlock
2139 * and return
2140 */
2141 redirty_page_for_writepage(wbc, page);
2142 unlock_page(page);
2143 return 0;
2144 }
2145 }
2146
2147 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2148 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
2149 else
2150 ret = block_write_full_page(page,
2151 ext4_normal_get_block_write,
2152 wbc);
2153
2154 return ret;
2155}
2156
2157/*
2158 * For now just follow the DIO way to estimate the max credits
2159 * needed to write out EXT4_MAX_WRITEBACK_PAGES.
2160 * todo: need to calculate the max credits need for
2161 * extent based files, currently the DIO credits is based on
2162 * indirect-blocks mapping way.
2163 *
2164 * Probably should have a generic way to calculate credits
2165 * for DIO, writepages, and truncate
2166 */
2167#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
2168#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
2169
2170static int ext4_da_writepages(struct address_space *mapping,
2171 struct writeback_control *wbc)
2172{
2173 struct inode *inode = mapping->host;
2174 handle_t *handle = NULL;
2175 int needed_blocks;
2176 int ret = 0;
2177 long to_write;
2178 loff_t range_start = 0;
2179
2180 /*
2181 * No pages to write? This is mainly a kludge to avoid starting
2182 * a transaction for special inodes like journal inode on last iput()
2183 * because that could violate lock ordering on umount
2184 */
2185 if (!mapping->nrpages)
2186 return 0;
2187
2188 /*
2189 * Estimate the worse case needed credits to write out
2190 * EXT4_MAX_BUF_BLOCKS pages
2191 */
2192 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
2193
2194 to_write = wbc->nr_to_write;
2195 if (!wbc->range_cyclic) {
2196 /*
2197 * If range_cyclic is not set force range_cont
2198 * and save the old writeback_index
2199 */
2200 wbc->range_cont = 1;
2201 range_start = wbc->range_start;
2202 }
2203
2204 while (!ret && to_write) {
2205 /* start a new transaction*/
2206 handle = ext4_journal_start(inode, needed_blocks);
2207 if (IS_ERR(handle)) {
2208 ret = PTR_ERR(handle);
2209 goto out_writepages;
2210 }
2211 if (ext4_should_order_data(inode)) {
2212 /*
2213 * With ordered mode we need to add
2214 * the inode to the journal handle
2215 * when we do block allocation.
2216 */
2217 ret = ext4_jbd2_file_inode(handle, inode);
2218 if (ret) {
2219 ext4_journal_stop(handle);
2220 goto out_writepages;
2221 }
2222
2223 }
2224 /*
2225 * set the max dirty pages could be write at a time
2226 * to fit into the reserved transaction credits
2227 */
2228 if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
2229 wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
2230
2231 to_write -= wbc->nr_to_write;
2232 ret = mpage_da_writepages(mapping, wbc,
2233 ext4_da_get_block_write);
2234 ext4_journal_stop(handle);
2235 if (wbc->nr_to_write) {
2236 /*
2237 * There is no more writeout needed
2238 * or we requested for a noblocking writeout
2239 * and we found the device congested
2240 */
2241 to_write += wbc->nr_to_write;
2242 break;
2243 }
2244 wbc->nr_to_write = to_write;
2245 }
2246
2247out_writepages:
2248 wbc->nr_to_write = to_write;
2249 if (range_start)
2250 wbc->range_start = range_start;
2251 return ret;
2252}
2253
2254static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2255 loff_t pos, unsigned len, unsigned flags,
2256 struct page **pagep, void **fsdata)
2257{
2258 int ret, retries = 0;
2259 struct page *page;
2260 pgoff_t index;
2261 unsigned from, to;
2262 struct inode *inode = mapping->host;
2263 handle_t *handle;
2264
2265 index = pos >> PAGE_CACHE_SHIFT;
2266 from = pos & (PAGE_CACHE_SIZE - 1);
2267 to = from + len;
2268
2269retry:
2270 /*
2271 * With delayed allocation, we don't log the i_disksize update
2272 * if there is delayed block allocation. But we still need
2273 * to journalling the i_disksize update if writes to the end
2274 * of file which has an already mapped buffer.
2275 */
2276 handle = ext4_journal_start(inode, 1);
2277 if (IS_ERR(handle)) {
2278 ret = PTR_ERR(handle);
2279 goto out;
2280 }
2281
2282 page = __grab_cache_page(mapping, index);
2283 if (!page)
2284 return -ENOMEM;
2285 *pagep = page;
2286
2287 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2288 ext4_da_get_block_prep);
2289 if (ret < 0) {
2290 unlock_page(page);
2291 ext4_journal_stop(handle);
2292 page_cache_release(page);
2293 }
2294
2295 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2296 goto retry;
2297out:
2298 return ret;
2299}
2300
2301/*
2302 * Check if we should update i_disksize
2303 * when write to the end of file but not require block allocation
2304 */
2305static int ext4_da_should_update_i_disksize(struct page *page,
2306 unsigned long offset)
2307{
2308 struct buffer_head *bh;
2309 struct inode *inode = page->mapping->host;
2310 unsigned int idx;
2311 int i;
2312
2313 bh = page_buffers(page);
2314 idx = offset >> inode->i_blkbits;
2315
2316 for (i=0; i < idx; i++)
2317 bh = bh->b_this_page;
2318
2319 if (!buffer_mapped(bh) || (buffer_delay(bh)))
2320 return 0;
2321 return 1;
2322}
2323
2324static int ext4_da_write_end(struct file *file,
2325 struct address_space *mapping,
2326 loff_t pos, unsigned len, unsigned copied,
2327 struct page *page, void *fsdata)
2328{
2329 struct inode *inode = mapping->host;
2330 int ret = 0, ret2;
2331 handle_t *handle = ext4_journal_current_handle();
2332 loff_t new_i_size;
2333 unsigned long start, end;
2334
2335 start = pos & (PAGE_CACHE_SIZE - 1);
2336 end = start + copied -1;
2337
2338 /*
2339 * generic_write_end() will run mark_inode_dirty() if i_size
2340 * changes. So let's piggyback the i_disksize mark_inode_dirty
2341 * into that.
2342 */
2343
2344 new_i_size = pos + copied;
2345 if (new_i_size > EXT4_I(inode)->i_disksize) {
2346 if (ext4_da_should_update_i_disksize(page, end)) {
2347 down_write(&EXT4_I(inode)->i_data_sem);
2348 if (new_i_size > EXT4_I(inode)->i_disksize) {
2349 /*
2350 * Updating i_disksize when extending file
2351 * without needing block allocation
2352 */
2353 if (ext4_should_order_data(inode))
2354 ret = ext4_jbd2_file_inode(handle,
2355 inode);
2356
2357 EXT4_I(inode)->i_disksize = new_i_size;
2358 }
2359 up_write(&EXT4_I(inode)->i_data_sem);
2360 }
2361 }
2362 ret2 = generic_write_end(file, mapping, pos, len, copied,
2363 page, fsdata);
2364 copied = ret2;
2365 if (ret2 < 0)
2366 ret = ret2;
2367 ret2 = ext4_journal_stop(handle);
2368 if (!ret)
2369 ret = ret2;
2370
2371 return ret ? ret : copied;
2372}
2373
2374static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2375{
2376 /*
2377 * Drop reserved blocks
2378 */
2379 BUG_ON(!PageLocked(page));
2380 if (!page_has_buffers(page))
2381 goto out;
2382
2383 ext4_da_page_release_reservation(page, offset);
2384
2385out:
2386 ext4_invalidatepage(page, offset);
2387
2388 return;
2389}
2390
1400 2391
1401/* 2392/*
1402 * bmap() is special. It gets used by applications such as lilo and by 2393 * bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
1418 journal_t *journal; 2409 journal_t *journal;
1419 int err; 2410 int err;
1420 2411
2412 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2413 test_opt(inode->i_sb, DELALLOC)) {
2414 /*
2415 * With delalloc we want to sync the file
2416 * so that we can make sure we allocate
2417 * blocks for file
2418 */
2419 filemap_write_and_wait(mapping);
2420 }
2421
1421 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2422 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
1422 /* 2423 /*
1423 * This is a REALLY heavyweight approach, but the use of 2424 * This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
1462 return 0; 2463 return 0;
1463} 2464}
1464 2465
1465static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1466{
1467 if (buffer_mapped(bh))
1468 return ext4_journal_dirty_data(handle, bh);
1469 return 0;
1470}
1471
1472/* 2466/*
1473 * Note that we always start a transaction even if we're not journalling 2467 * Note that we don't need to start a transaction unless we're journaling data
1474 * data. This is to preserve ordering: any hole instantiation within 2468 * because we should have holes filled from ext4_page_mkwrite(). We even don't
1475 * __block_write_full_page -> ext4_get_block() should be journalled 2469 * need to file the inode to the transaction's list in ordered mode because if
1476 * along with the data so we don't crash and then get metadata which 2470 * we are writing back data added by write(), the inode is already there and if
1477 * refers to old data. 2471 * we are writing back data modified via mmap(), noone guarantees in which
2472 * transaction the data will hit the disk. In case we are journaling data, we
2473 * cannot start transaction directly because transaction start ranks above page
2474 * lock so we have to do some magic.
1478 * 2475 *
1479 * In all journalling modes block_write_full_page() will start the I/O. 2476 * In all journaling modes block_write_full_page() will start the I/O.
1480 * 2477 *
1481 * Problem: 2478 * Problem:
1482 * 2479 *
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1518 * disastrous. Any write() or metadata operation will sync the fs for 2515 * disastrous. Any write() or metadata operation will sync the fs for
1519 * us. 2516 * us.
1520 * 2517 *
1521 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1522 * we don't need to open a transaction here.
1523 */ 2518 */
1524static int ext4_ordered_writepage(struct page *page, 2519static int __ext4_normal_writepage(struct page *page,
1525 struct writeback_control *wbc) 2520 struct writeback_control *wbc)
1526{ 2521{
1527 struct inode *inode = page->mapping->host; 2522 struct inode *inode = page->mapping->host;
1528 struct buffer_head *page_bufs;
1529 handle_t *handle = NULL;
1530 int ret = 0;
1531 int err;
1532
1533 J_ASSERT(PageLocked(page));
1534 2523
1535 /* 2524 if (test_opt(inode->i_sb, NOBH))
1536 * We give up here if we're reentered, because it might be for a 2525 return nobh_writepage(page,
1537 * different filesystem. 2526 ext4_normal_get_block_write, wbc);
1538 */ 2527 else
1539 if (ext4_journal_current_handle()) 2528 return block_write_full_page(page,
1540 goto out_fail; 2529 ext4_normal_get_block_write,
1541 2530 wbc);
1542 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2531}
1543 2532
1544 if (IS_ERR(handle)) { 2533static int ext4_normal_writepage(struct page *page,
1545 ret = PTR_ERR(handle); 2534 struct writeback_control *wbc)
1546 goto out_fail; 2535{
1547 } 2536 struct inode *inode = page->mapping->host;
2537 loff_t size = i_size_read(inode);
2538 loff_t len;
1548 2539
1549 if (!page_has_buffers(page)) { 2540 J_ASSERT(PageLocked(page));
1550 create_empty_buffers(page, inode->i_sb->s_blocksize, 2541 if (page->index == size >> PAGE_CACHE_SHIFT)
1551 (1 << BH_Dirty)|(1 << BH_Uptodate)); 2542 len = size & ~PAGE_CACHE_MASK;
2543 else
2544 len = PAGE_CACHE_SIZE;
2545
2546 if (page_has_buffers(page)) {
2547 /* if page has buffers it should all be mapped
2548 * and allocated. If there are not buffers attached
2549 * to the page we know the page is dirty but it lost
2550 * buffers. That means that at some moment in time
2551 * after write_begin() / write_end() has been called
2552 * all buffers have been clean and thus they must have been
2553 * written at least once. So they are all mapped and we can
2554 * happily proceed with mapping them and writing the page.
2555 */
2556 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2557 ext4_bh_unmapped_or_delay));
1552 } 2558 }
1553 page_bufs = page_buffers(page);
1554 walk_page_buffers(handle, page_bufs, 0,
1555 PAGE_CACHE_SIZE, NULL, bget_one);
1556
1557 ret = block_write_full_page(page, ext4_get_block, wbc);
1558
1559 /*
1560 * The page can become unlocked at any point now, and
1561 * truncate can then come in and change things. So we
1562 * can't touch *page from now on. But *page_bufs is
1563 * safe due to elevated refcount.
1564 */
1565 2559
1566 /* 2560 if (!ext4_journal_current_handle())
1567 * And attach them to the current transaction. But only if 2561 return __ext4_normal_writepage(page, wbc);
1568 * block_write_full_page() succeeded. Otherwise they are unmapped,
1569 * and generally junk.
1570 */
1571 if (ret == 0) {
1572 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1573 NULL, jbd2_journal_dirty_data_fn);
1574 if (!ret)
1575 ret = err;
1576 }
1577 walk_page_buffers(handle, page_bufs, 0,
1578 PAGE_CACHE_SIZE, NULL, bput_one);
1579 err = ext4_journal_stop(handle);
1580 if (!ret)
1581 ret = err;
1582 return ret;
1583 2562
1584out_fail:
1585 redirty_page_for_writepage(wbc, page); 2563 redirty_page_for_writepage(wbc, page);
1586 unlock_page(page); 2564 unlock_page(page);
1587 return ret; 2565 return 0;
1588} 2566}
1589 2567
1590static int ext4_writeback_writepage(struct page *page, 2568static int __ext4_journalled_writepage(struct page *page,
1591 struct writeback_control *wbc) 2569 struct writeback_control *wbc)
1592{ 2570{
1593 struct inode *inode = page->mapping->host; 2571 struct address_space *mapping = page->mapping;
2572 struct inode *inode = mapping->host;
2573 struct buffer_head *page_bufs;
1594 handle_t *handle = NULL; 2574 handle_t *handle = NULL;
1595 int ret = 0; 2575 int ret = 0;
1596 int err; 2576 int err;
1597 2577
1598 if (ext4_journal_current_handle()) 2578 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1599 goto out_fail; 2579 ext4_normal_get_block_write);
2580 if (ret != 0)
2581 goto out_unlock;
2582
2583 page_bufs = page_buffers(page);
2584 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
2585 bget_one);
2586 /* As soon as we unlock the page, it can go away, but we have
2587 * references to buffers so we are safe */
2588 unlock_page(page);
1600 2589
1601 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2590 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1602 if (IS_ERR(handle)) { 2591 if (IS_ERR(handle)) {
1603 ret = PTR_ERR(handle); 2592 ret = PTR_ERR(handle);
1604 goto out_fail; 2593 goto out;
1605 } 2594 }
1606 2595
1607 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2596 ret = walk_page_buffers(handle, page_bufs, 0,
1608 ret = nobh_writepage(page, ext4_get_block, wbc); 2597 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1609 else
1610 ret = block_write_full_page(page, ext4_get_block, wbc);
1611 2598
2599 err = walk_page_buffers(handle, page_bufs, 0,
2600 PAGE_CACHE_SIZE, NULL, write_end_fn);
2601 if (ret == 0)
2602 ret = err;
1612 err = ext4_journal_stop(handle); 2603 err = ext4_journal_stop(handle);
1613 if (!ret) 2604 if (!ret)
1614 ret = err; 2605 ret = err;
1615 return ret;
1616 2606
1617out_fail: 2607 walk_page_buffers(handle, page_bufs, 0,
1618 redirty_page_for_writepage(wbc, page); 2608 PAGE_CACHE_SIZE, NULL, bput_one);
2609 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
2610 goto out;
2611
2612out_unlock:
1619 unlock_page(page); 2613 unlock_page(page);
2614out:
1620 return ret; 2615 return ret;
1621} 2616}
1622 2617
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
1624 struct writeback_control *wbc) 2619 struct writeback_control *wbc)
1625{ 2620{
1626 struct inode *inode = page->mapping->host; 2621 struct inode *inode = page->mapping->host;
1627 handle_t *handle = NULL; 2622 loff_t size = i_size_read(inode);
1628 int ret = 0; 2623 loff_t len;
1629 int err;
1630 2624
1631 if (ext4_journal_current_handle()) 2625 J_ASSERT(PageLocked(page));
1632 goto no_write; 2626 if (page->index == size >> PAGE_CACHE_SHIFT)
2627 len = size & ~PAGE_CACHE_MASK;
2628 else
2629 len = PAGE_CACHE_SIZE;
2630
2631 if (page_has_buffers(page)) {
2632 /* if page has buffers it should all be mapped
2633 * and allocated. If there are not buffers attached
2634 * to the page we know the page is dirty but it lost
2635 * buffers. That means that at some moment in time
2636 * after write_begin() / write_end() has been called
2637 * all buffers have been clean and thus they must have been
2638 * written at least once. So they are all mapped and we can
2639 * happily proceed with mapping them and writing the page.
2640 */
2641 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2642 ext4_bh_unmapped_or_delay));
2643 }
1633 2644
1634 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2645 if (ext4_journal_current_handle())
1635 if (IS_ERR(handle)) {
1636 ret = PTR_ERR(handle);
1637 goto no_write; 2646 goto no_write;
1638 }
1639 2647
1640 if (!page_has_buffers(page) || PageChecked(page)) { 2648 if (PageChecked(page)) {
1641 /* 2649 /*
1642 * It's mmapped pagecache. Add buffers and journal it. There 2650 * It's mmapped pagecache. Add buffers and journal it. There
1643 * doesn't seem much point in redirtying the page here. 2651 * doesn't seem much point in redirtying the page here.
1644 */ 2652 */
1645 ClearPageChecked(page); 2653 ClearPageChecked(page);
1646 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2654 return __ext4_journalled_writepage(page, wbc);
1647 ext4_get_block);
1648 if (ret != 0) {
1649 ext4_journal_stop(handle);
1650 goto out_unlock;
1651 }
1652 ret = walk_page_buffers(handle, page_buffers(page), 0,
1653 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1654
1655 err = walk_page_buffers(handle, page_buffers(page), 0,
1656 PAGE_CACHE_SIZE, NULL, write_end_fn);
1657 if (ret == 0)
1658 ret = err;
1659 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1660 unlock_page(page);
1661 } else { 2655 } else {
1662 /* 2656 /*
1663 * It may be a page full of checkpoint-mode buffers. We don't 2657 * It may be a page full of checkpoint-mode buffers. We don't
1664 * really know unless we go poke around in the buffer_heads. 2658 * really know unless we go poke around in the buffer_heads.
1665 * But block_write_full_page will do the right thing. 2659 * But block_write_full_page will do the right thing.
1666 */ 2660 */
1667 ret = block_write_full_page(page, ext4_get_block, wbc); 2661 return block_write_full_page(page,
2662 ext4_normal_get_block_write,
2663 wbc);
1668 } 2664 }
1669 err = ext4_journal_stop(handle);
1670 if (!ret)
1671 ret = err;
1672out:
1673 return ret;
1674
1675no_write: 2665no_write:
1676 redirty_page_for_writepage(wbc, page); 2666 redirty_page_for_writepage(wbc, page);
1677out_unlock:
1678 unlock_page(page); 2667 unlock_page(page);
1679 goto out; 2668 return 0;
1680} 2669}
1681 2670
1682static int ext4_readpage(struct file *file, struct page *page) 2671static int ext4_readpage(struct file *file, struct page *page)
@@ -1817,50 +2806,75 @@ static int ext4_journalled_set_page_dirty(struct page *page)
1817} 2806}
1818 2807
1819static const struct address_space_operations ext4_ordered_aops = { 2808static const struct address_space_operations ext4_ordered_aops = {
1820 .readpage = ext4_readpage, 2809 .readpage = ext4_readpage,
1821 .readpages = ext4_readpages, 2810 .readpages = ext4_readpages,
1822 .writepage = ext4_ordered_writepage, 2811 .writepage = ext4_normal_writepage,
1823 .sync_page = block_sync_page, 2812 .sync_page = block_sync_page,
1824 .write_begin = ext4_write_begin, 2813 .write_begin = ext4_write_begin,
1825 .write_end = ext4_ordered_write_end, 2814 .write_end = ext4_ordered_write_end,
1826 .bmap = ext4_bmap, 2815 .bmap = ext4_bmap,
1827 .invalidatepage = ext4_invalidatepage, 2816 .invalidatepage = ext4_invalidatepage,
1828 .releasepage = ext4_releasepage, 2817 .releasepage = ext4_releasepage,
1829 .direct_IO = ext4_direct_IO, 2818 .direct_IO = ext4_direct_IO,
1830 .migratepage = buffer_migrate_page, 2819 .migratepage = buffer_migrate_page,
2820 .is_partially_uptodate = block_is_partially_uptodate,
1831}; 2821};
1832 2822
1833static const struct address_space_operations ext4_writeback_aops = { 2823static const struct address_space_operations ext4_writeback_aops = {
1834 .readpage = ext4_readpage, 2824 .readpage = ext4_readpage,
1835 .readpages = ext4_readpages, 2825 .readpages = ext4_readpages,
1836 .writepage = ext4_writeback_writepage, 2826 .writepage = ext4_normal_writepage,
1837 .sync_page = block_sync_page, 2827 .sync_page = block_sync_page,
1838 .write_begin = ext4_write_begin, 2828 .write_begin = ext4_write_begin,
1839 .write_end = ext4_writeback_write_end, 2829 .write_end = ext4_writeback_write_end,
1840 .bmap = ext4_bmap, 2830 .bmap = ext4_bmap,
1841 .invalidatepage = ext4_invalidatepage, 2831 .invalidatepage = ext4_invalidatepage,
1842 .releasepage = ext4_releasepage, 2832 .releasepage = ext4_releasepage,
1843 .direct_IO = ext4_direct_IO, 2833 .direct_IO = ext4_direct_IO,
1844 .migratepage = buffer_migrate_page, 2834 .migratepage = buffer_migrate_page,
2835 .is_partially_uptodate = block_is_partially_uptodate,
1845}; 2836};
1846 2837
1847static const struct address_space_operations ext4_journalled_aops = { 2838static const struct address_space_operations ext4_journalled_aops = {
1848 .readpage = ext4_readpage, 2839 .readpage = ext4_readpage,
1849 .readpages = ext4_readpages, 2840 .readpages = ext4_readpages,
1850 .writepage = ext4_journalled_writepage, 2841 .writepage = ext4_journalled_writepage,
1851 .sync_page = block_sync_page, 2842 .sync_page = block_sync_page,
1852 .write_begin = ext4_write_begin, 2843 .write_begin = ext4_write_begin,
1853 .write_end = ext4_journalled_write_end, 2844 .write_end = ext4_journalled_write_end,
1854 .set_page_dirty = ext4_journalled_set_page_dirty, 2845 .set_page_dirty = ext4_journalled_set_page_dirty,
1855 .bmap = ext4_bmap, 2846 .bmap = ext4_bmap,
1856 .invalidatepage = ext4_invalidatepage, 2847 .invalidatepage = ext4_invalidatepage,
1857 .releasepage = ext4_releasepage, 2848 .releasepage = ext4_releasepage,
2849 .is_partially_uptodate = block_is_partially_uptodate,
2850};
2851
2852static const struct address_space_operations ext4_da_aops = {
2853 .readpage = ext4_readpage,
2854 .readpages = ext4_readpages,
2855 .writepage = ext4_da_writepage,
2856 .writepages = ext4_da_writepages,
2857 .sync_page = block_sync_page,
2858 .write_begin = ext4_da_write_begin,
2859 .write_end = ext4_da_write_end,
2860 .bmap = ext4_bmap,
2861 .invalidatepage = ext4_da_invalidatepage,
2862 .releasepage = ext4_releasepage,
2863 .direct_IO = ext4_direct_IO,
2864 .migratepage = buffer_migrate_page,
2865 .is_partially_uptodate = block_is_partially_uptodate,
1858}; 2866};
1859 2867
1860void ext4_set_aops(struct inode *inode) 2868void ext4_set_aops(struct inode *inode)
1861{ 2869{
1862 if (ext4_should_order_data(inode)) 2870 if (ext4_should_order_data(inode) &&
2871 test_opt(inode->i_sb, DELALLOC))
2872 inode->i_mapping->a_ops = &ext4_da_aops;
2873 else if (ext4_should_order_data(inode))
1863 inode->i_mapping->a_ops = &ext4_ordered_aops; 2874 inode->i_mapping->a_ops = &ext4_ordered_aops;
2875 else if (ext4_should_writeback_data(inode) &&
2876 test_opt(inode->i_sb, DELALLOC))
2877 inode->i_mapping->a_ops = &ext4_da_aops;
1864 else if (ext4_should_writeback_data(inode)) 2878 else if (ext4_should_writeback_data(inode))
1865 inode->i_mapping->a_ops = &ext4_writeback_aops; 2879 inode->i_mapping->a_ops = &ext4_writeback_aops;
1866 else 2880 else
@@ -1873,7 +2887,7 @@ void ext4_set_aops(struct inode *inode)
1873 * This required during truncate. We need to physically zero the tail end 2887 * This required during truncate. We need to physically zero the tail end
1874 * of that block so it doesn't yield old data if the file is later grown. 2888 * of that block so it doesn't yield old data if the file is later grown.
1875 */ 2889 */
1876int ext4_block_truncate_page(handle_t *handle, struct page *page, 2890int ext4_block_truncate_page(handle_t *handle,
1877 struct address_space *mapping, loff_t from) 2891 struct address_space *mapping, loff_t from)
1878{ 2892{
1879 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 2893 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2896,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1882 ext4_lblk_t iblock; 2896 ext4_lblk_t iblock;
1883 struct inode *inode = mapping->host; 2897 struct inode *inode = mapping->host;
1884 struct buffer_head *bh; 2898 struct buffer_head *bh;
2899 struct page *page;
1885 int err = 0; 2900 int err = 0;
1886 2901
2902 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
2903 if (!page)
2904 return -EINVAL;
2905
1887 blocksize = inode->i_sb->s_blocksize; 2906 blocksize = inode->i_sb->s_blocksize;
1888 length = blocksize - (offset & (blocksize - 1)); 2907 length = blocksize - (offset & (blocksize - 1));
1889 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 2908 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2975,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1956 err = ext4_journal_dirty_metadata(handle, bh); 2975 err = ext4_journal_dirty_metadata(handle, bh);
1957 } else { 2976 } else {
1958 if (ext4_should_order_data(inode)) 2977 if (ext4_should_order_data(inode))
1959 err = ext4_journal_dirty_data(handle, bh); 2978 err = ext4_jbd2_file_inode(handle, inode);
1960 mark_buffer_dirty(bh); 2979 mark_buffer_dirty(bh);
1961 } 2980 }
1962 2981
@@ -2179,7 +3198,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
2179 3198
2180 if (this_bh) { 3199 if (this_bh) {
2181 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3200 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
2182 ext4_journal_dirty_metadata(handle, this_bh); 3201
3202 /*
3203 * The buffer head should have an attached journal head at this
3204 * point. However, if the data is corrupted and an indirect
3205 * block pointed to itself, it would have been detached when
3206 * the block was cleared. Check for this instead of OOPSing.
3207 */
3208 if (bh2jh(this_bh))
3209 ext4_journal_dirty_metadata(handle, this_bh);
3210 else
3211 ext4_error(inode->i_sb, __func__,
3212 "circular indirect block detected, "
3213 "inode=%lu, block=%llu",
3214 inode->i_ino,
3215 (unsigned long long) this_bh->b_blocknr);
2183 } 3216 }
2184} 3217}
2185 3218
@@ -2305,6 +3338,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
2305 } 3338 }
2306} 3339}
2307 3340
3341int ext4_can_truncate(struct inode *inode)
3342{
3343 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3344 return 0;
3345 if (S_ISREG(inode->i_mode))
3346 return 1;
3347 if (S_ISDIR(inode->i_mode))
3348 return 1;
3349 if (S_ISLNK(inode->i_mode))
3350 return !ext4_inode_is_fast_symlink(inode);
3351 return 0;
3352}
3353
2308/* 3354/*
2309 * ext4_truncate() 3355 * ext4_truncate()
2310 * 3356 *
@@ -2347,51 +3393,25 @@ void ext4_truncate(struct inode *inode)
2347 int n; 3393 int n;
2348 ext4_lblk_t last_block; 3394 ext4_lblk_t last_block;
2349 unsigned blocksize = inode->i_sb->s_blocksize; 3395 unsigned blocksize = inode->i_sb->s_blocksize;
2350 struct page *page;
2351 3396
2352 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 3397 if (!ext4_can_truncate(inode))
2353 S_ISLNK(inode->i_mode)))
2354 return;
2355 if (ext4_inode_is_fast_symlink(inode))
2356 return;
2357 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2358 return; 3398 return;
2359 3399
2360 /*
2361 * We have to lock the EOF page here, because lock_page() nests
2362 * outside jbd2_journal_start().
2363 */
2364 if ((inode->i_size & (blocksize - 1)) == 0) {
2365 /* Block boundary? Nothing to do */
2366 page = NULL;
2367 } else {
2368 page = grab_cache_page(mapping,
2369 inode->i_size >> PAGE_CACHE_SHIFT);
2370 if (!page)
2371 return;
2372 }
2373
2374 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3400 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
2375 ext4_ext_truncate(inode, page); 3401 ext4_ext_truncate(inode);
2376 return; 3402 return;
2377 } 3403 }
2378 3404
2379 handle = start_transaction(inode); 3405 handle = start_transaction(inode);
2380 if (IS_ERR(handle)) { 3406 if (IS_ERR(handle))
2381 if (page) {
2382 clear_highpage(page);
2383 flush_dcache_page(page);
2384 unlock_page(page);
2385 page_cache_release(page);
2386 }
2387 return; /* AKPM: return what? */ 3407 return; /* AKPM: return what? */
2388 }
2389 3408
2390 last_block = (inode->i_size + blocksize-1) 3409 last_block = (inode->i_size + blocksize-1)
2391 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3410 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
2392 3411
2393 if (page) 3412 if (inode->i_size & (blocksize - 1))
2394 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 3413 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
3414 goto out_stop;
2395 3415
2396 n = ext4_block_to_path(inode, last_block, offsets, NULL); 3416 n = ext4_block_to_path(inode, last_block, offsets, NULL);
2397 if (n == 0) 3417 if (n == 0)
@@ -2410,6 +3430,11 @@ void ext4_truncate(struct inode *inode)
2410 goto out_stop; 3430 goto out_stop;
2411 3431
2412 /* 3432 /*
3433 * From here we block out all ext4_get_block() callers who want to
3434 * modify the block allocation tree.
3435 */
3436 down_write(&ei->i_data_sem);
3437 /*
2413 * The orphan list entry will now protect us from any crash which 3438 * The orphan list entry will now protect us from any crash which
2414 * occurs before the truncate completes, so it is now safe to propagate 3439 * occurs before the truncate completes, so it is now safe to propagate
2415 * the new, shorter inode size (held for now in i_size) into the 3440 * the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3443,6 @@ void ext4_truncate(struct inode *inode)
2418 */ 3443 */
2419 ei->i_disksize = inode->i_size; 3444 ei->i_disksize = inode->i_size;
2420 3445
2421 /*
2422 * From here we block out all ext4_get_block() callers who want to
2423 * modify the block allocation tree.
2424 */
2425 down_write(&ei->i_data_sem);
2426
2427 if (n == 1) { /* direct blocks */ 3446 if (n == 1) { /* direct blocks */
2428 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 3447 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
2429 i_data + EXT4_NDIR_BLOCKS); 3448 i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4126,14 @@ int ext4_write_inode(struct inode *inode, int wait)
3107 * be freed, so we have a strong guarantee that no future commit will 4126 * be freed, so we have a strong guarantee that no future commit will
3108 * leave these blocks visible to the user.) 4127 * leave these blocks visible to the user.)
3109 * 4128 *
3110 * Called with inode->sem down. 4129 * Another thing we have to assure is that if we are in ordered mode
4130 * and inode is still attached to the committing transaction, we must
4131 * we start writeout of all the dirty pages which are being truncated.
4132 * This way we are sure that all the data written in the previous
4133 * transaction are already on disk (truncate waits for pages under
4134 * writeback).
4135 *
4136 * Called with inode->i_mutex down.
3111 */ 4137 */
3112int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4138int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3113{ 4139{
@@ -3173,6 +4199,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3173 if (!error) 4199 if (!error)
3174 error = rc; 4200 error = rc;
3175 ext4_journal_stop(handle); 4201 ext4_journal_stop(handle);
4202
4203 if (ext4_should_order_data(inode)) {
4204 error = ext4_begin_ordered_truncate(inode,
4205 attr->ia_size);
4206 if (error) {
4207 /* Do as much error cleanup as possible */
4208 handle = ext4_journal_start(inode, 3);
4209 if (IS_ERR(handle)) {
4210 ext4_orphan_del(NULL, inode);
4211 goto err_out;
4212 }
4213 ext4_orphan_del(handle, inode);
4214 ext4_journal_stop(handle);
4215 goto err_out;
4216 }
4217 }
3176 } 4218 }
3177 4219
3178 rc = inode_setattr(inode, attr); 4220 rc = inode_setattr(inode, attr);
@@ -3193,6 +4235,32 @@ err_out:
3193 return error; 4235 return error;
3194} 4236}
3195 4237
4238int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4239 struct kstat *stat)
4240{
4241 struct inode *inode;
4242 unsigned long delalloc_blocks;
4243
4244 inode = dentry->d_inode;
4245 generic_fillattr(inode, stat);
4246
4247 /*
4248 * We can't update i_blocks if the block allocation is delayed
4249 * otherwise in the case of system crash before the real block
4250 * allocation is done, we will have i_blocks inconsistent with
4251 * on-disk file blocks.
4252 * We always keep i_blocks updated together with real
4253 * allocation. But to not confuse with user, stat
4254 * will return the blocks that include the delayed allocation
4255 * blocks for this file.
4256 */
4257 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
4258 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4259 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
4260
4261 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4262 return 0;
4263}
3196 4264
3197/* 4265/*
3198 * How many blocks doth make a writepage()? 4266 * How many blocks doth make a writepage()?
@@ -3506,3 +4574,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
3506 4574
3507 return err; 4575 return err;
3508} 4576}
4577
4578static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4579{
4580 return !buffer_mapped(bh);
4581}
4582
4583int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4584{
4585 loff_t size;
4586 unsigned long len;
4587 int ret = -EINVAL;
4588 struct file *file = vma->vm_file;
4589 struct inode *inode = file->f_path.dentry->d_inode;
4590 struct address_space *mapping = inode->i_mapping;
4591
4592 /*
4593 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
4594 * get i_mutex because we are already holding mmap_sem.
4595 */
4596 down_read(&inode->i_alloc_sem);
4597 size = i_size_read(inode);
4598 if (page->mapping != mapping || size <= page_offset(page)
4599 || !PageUptodate(page)) {
4600 /* page got truncated from under us? */
4601 goto out_unlock;
4602 }
4603 ret = 0;
4604 if (PageMappedToDisk(page))
4605 goto out_unlock;
4606
4607 if (page->index == size >> PAGE_CACHE_SHIFT)
4608 len = size & ~PAGE_CACHE_MASK;
4609 else
4610 len = PAGE_CACHE_SIZE;
4611
4612 if (page_has_buffers(page)) {
4613 /* return if we have all the buffers mapped */
4614 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4615 ext4_bh_unmapped))
4616 goto out_unlock;
4617 }
4618 /*
4619 * OK, we need to fill the hole... Do write_begin write_end
4620 * to do block allocation/reservation.We are not holding
4621 * inode.i__mutex here. That allow * parallel write_begin,
4622 * write_end call. lock_page prevent this from happening
4623 * on the same page though
4624 */
4625 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4626 len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
4627 if (ret < 0)
4628 goto out_unlock;
4629 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4630 len, len, page, NULL);
4631 if (ret < 0)
4632 goto out_unlock;
4633 ret = 0;
4634out_unlock:
4635 up_read(&inode->i_alloc_sem);
4636 return ret;
4637}