author	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-02 12:39:34 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-02 12:39:34 -0400
commit	9e239bb93914e1c832d54161c7f8f398d0c914ab (patch)
tree	0fe11e8e717152660ad77d77e66bf0f1695d7ed1
parent	63580e51bb3e7ec459501165884e5f815a7a9322 (diff)
parent	6ae06ff51eab5dcbbf959b05ce0f11003a305ba5 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 update from Ted Ts'o:
 "Lots of bug fixes, cleanups and optimizations.

  In the bug fixes category, of note is a fix for on-line resizing file
  systems where the block size is smaller than the page size (i.e., file
  systems with 1k blocks on x86, or more interestingly file systems with
  4k blocks on Power or ia64 systems).

  In the cleanup category, ext4's punch hole implementation was
  significantly improved by Lukas Czerner, and now supports bigalloc
  file systems.  In addition, Jan Kara significantly cleaned up the
  write submission code path.  We also improved error checking and
  added a few sanity checks.

  In the optimizations category, two major optimizations deserve
  mention.  The first is that ext4_writepages() is now used for
  nodelalloc and ext3 compatibility mode.  This allows writes to be
  submitted much more efficiently as a single bio request, instead of
  being sent as individual 4k writes into the block layer (which then
  relied on the elevator code to coalesce the requests in the block
  queue).  Secondly, the extent cache shrink mechanism, which was
  introduced in 3.9, no longer has a scalability bottleneck caused by
  the i_es_lru spinlock.  Other optimizations include some changes to
  reduce CPU usage and to avoid issuing empty commits unnecessarily."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (86 commits)
  ext4: optimize starting extent in ext4_ext_rm_leaf()
  jbd2: invalidate handle if jbd2_journal_restart() fails
  ext4: translate flag bits to strings in tracepoints
  ext4: fix up error handling for mpage_map_and_submit_extent()
  jbd2: fix theoretical race in jbd2__journal_restart
  ext4: only zero partial blocks in ext4_zero_partial_blocks()
  ext4: check error return from ext4_write_inline_data_end()
  ext4: delete unnecessary C statements
  ext3,ext4: don't mess with dir_file->f_pos in htree_dirblock_to_tree()
  jbd2: move superblock checksum calculation to jbd2_write_superblock()
  ext4: pass inode pointer instead of file pointer to punch hole
  ext4: improve free space calculation for inline_data
  ext4: reduce object size when !CONFIG_PRINTK
  ext4: improve extent cache shrink mechanism to avoid to burn CPU time
  ext4: implement error handling of ext4_mb_new_preallocation()
  ext4: fix corruption when online resizing a fs with 1K block size
  ext4: delete unused variables
  ext4: return FIEMAP_EXTENT_UNKNOWN for delalloc extents
  jbd2: remove debug dependency on debug_fs and update Kconfig help text
  jbd2: use a single printk for jbd_debug()
  ...
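Nearly all of the cross-filesystem churn in the diffstat below comes from one
interface change: the address_space operation ->invalidatepage() now takes a
byte length in addition to the starting offset, so that punch hole can
invalidate a sub-range of a page instead of always invalidating from the
offset through to the end of the page.  As a rough sketch of the conversion
each filesystem below performs (not taken from this diff; "myfs" and its
helper are hypothetical):

	/* old prototype: everything from offset to the end of the page goes */
	static void myfs_invalidatepage(struct page *page, unsigned long offset);

	/* new prototype: only the range [offset, offset + length) goes */
	static void myfs_invalidatepage(struct page *page, unsigned int offset,
					unsigned int length)
	{
		/* "whole page" is now offset == 0 && length == PAGE_CACHE_SIZE */
		if (offset == 0 && length == PAGE_CACHE_SIZE)
			myfs_release_private_state(page);	/* hypothetical helper */
	}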
-rw-r--r--	Documentation/filesystems/Locking	6
-rw-r--r--	Documentation/filesystems/vfs.txt	20
-rw-r--r--	fs/9p/vfs_addr.c	5
-rw-r--r--	fs/afs/file.c	10
-rw-r--r--	fs/btrfs/disk-io.c	3
-rw-r--r--	fs/btrfs/extent_io.c	2
-rw-r--r--	fs/btrfs/inode.c	3
-rw-r--r--	fs/buffer.c	21
-rw-r--r--	fs/ceph/addr.c	15
-rw-r--r--	fs/cifs/file.c	5
-rw-r--r--	fs/exofs/inode.c	6
-rw-r--r--	fs/ext3/inode.c	9
-rw-r--r--	fs/ext3/namei.c	7
-rw-r--r--	fs/ext4/balloc.c	14
-rw-r--r--	fs/ext4/ext4.h	187
-rw-r--r--	fs/ext4/ext4_jbd2.c	58
-rw-r--r--	fs/ext4/ext4_jbd2.h	29
-rw-r--r--	fs/ext4/extents.c	193
-rw-r--r--	fs/ext4/extents_status.c	75
-rw-r--r--	fs/ext4/extents_status.h	5
-rw-r--r--	fs/ext4/file.c	14
-rw-r--r--	fs/ext4/fsync.c	52
-rw-r--r--	fs/ext4/ialloc.c	3
-rw-r--r--	fs/ext4/indirect.c	40
-rw-r--r--	fs/ext4/inline.c	4
-rw-r--r--	fs/ext4/inode.c	1751
-rw-r--r--	fs/ext4/mballoc.c	21
-rw-r--r--	fs/ext4/move_extent.c	3
-rw-r--r--	fs/ext4/namei.c	7
-rw-r--r--	fs/ext4/page-io.c	325
-rw-r--r--	fs/ext4/resize.c	24
-rw-r--r--	fs/ext4/super.c	155
-rw-r--r--	fs/f2fs/data.c	3
-rw-r--r--	fs/f2fs/node.c	3
-rw-r--r--	fs/gfs2/aops.c	17
-rw-r--r--	fs/jbd/transaction.c	19
-rw-r--r--	fs/jbd2/Kconfig	6
-rw-r--r--	fs/jbd2/checkpoint.c	22
-rw-r--r--	fs/jbd2/commit.c	184
-rw-r--r--	fs/jbd2/journal.c	166
-rw-r--r--	fs/jbd2/recovery.c	11
-rw-r--r--	fs/jbd2/revoke.c	49
-rw-r--r--	fs/jbd2/transaction.c	526
-rw-r--r--	fs/jfs/jfs_metapage.c	5
-rw-r--r--	fs/logfs/file.c	3
-rw-r--r--	fs/logfs/segment.c	3
-rw-r--r--	fs/nfs/file.c	8
-rw-r--r--	fs/ntfs/aops.c	2
-rw-r--r--	fs/ocfs2/aops.c	5
-rw-r--r--	fs/reiserfs/inode.c	12
-rw-r--r--	fs/ubifs/file.c	5
-rw-r--r--	fs/xfs/xfs_aops.c	14
-rw-r--r--	fs/xfs/xfs_trace.h	15
-rw-r--r--	include/linux/buffer_head.h	3
-rw-r--r--	include/linux/fs.h	2
-rw-r--r--	include/linux/jbd.h	28
-rw-r--r--	include/linux/jbd2.h	175
-rw-r--r--	include/linux/jbd_common.h	26
-rw-r--r--	include/linux/mm.h	3
-rw-r--r--	include/trace/events/ext3.h	12
-rw-r--r--	include/trace/events/ext4.h	304
-rw-r--r--	mm/readahead.c	2
-rw-r--r--	mm/truncate.c	117
63 files changed, 2652 insertions, 2170 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index bdd82b2339d9..9858f337529c 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -189,7 +189,7 @@ prototypes:
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
-	int (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
@@ -310,8 +310,8 @@ filesystems and by the swapper. The latter will eventually go away. Please,
 keep it that way and don't breed new callers.
 
 	->invalidatepage() is called when the filesystem must attempt to drop
 some or all of the buffers from the page when it is being truncated. It
 returns zero on success. If ->invalidatepage is zero, the kernel uses
 block_invalidatepage() instead.
 
 	->releasepage() is called when the kernel is about to try to drop the
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 4a35f6614a66..e6bd1ffd821e 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -549,7 +549,7 @@ struct address_space_operations
 -------------------------------
 
 This describes how the VFS can manipulate mapping of a file to page cache in
-your filesystem. As of kernel 2.6.22, the following members are defined:
+your filesystem. The following members are defined:
 
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -566,7 +566,7 @@ struct address_space_operations {
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
-	int (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
@@ -685,14 +685,14 @@ struct address_space_operations {
   invalidatepage: If a page has PagePrivate set, then invalidatepage
 	will be called when part or all of the page is to be removed
 	from the address space. This generally corresponds to either a
-	truncation or a complete invalidation of the address space
-	(in the latter case 'offset' will always be 0).
-	Any private data associated with the page should be updated
-	to reflect this truncation. If offset is 0, then
-	the private data should be released, because the page
-	must be able to be completely discarded. This may be done by
-	calling the ->releasepage function, but in this case the
+	truncation, punch hole or a complete invalidation of the address
+	space (in the latter case 'offset' will always be 0 and 'length'
+	will be PAGE_CACHE_SIZE). Any private data associated with the page
+	should be updated to reflect this truncation. If offset is 0 and
+	length is PAGE_CACHE_SIZE, then the private data should be released,
+	because the page must be able to be completely discarded. This may
+	be done by calling the ->releasepage function, but in this case the
 	release MUST succeed.
 
   releasepage: releasepage is called on PagePrivate pages to indicate
 	that the page should be freed if possible. ->releasepage
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055562c580b4..9ff073f4090a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
  * @offset: offset in the page
  */
 
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	/*
 	 * If called with zero offset, we should release
 	 * the private state assocated with the page
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		v9fs_fscache_invalidate_page(page);
 }
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d565..66d50fe2ee45 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
  * - release a page and clean up its private data if offset is 0 (indicating
  *   the entire page)
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length)
 {
 	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter("{%lu},%u,%u", page->index, offset, length);
 
 	BUG_ON(!PageLocked(page));
 
 	/* we clean up only if the entire page is being invalidated */
-	if (offset == 0) {
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
 #ifdef CONFIG_AFS_FSCACHE
 		if (PageFsCache(page)) {
 			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8b60b660c8f..b0292b3ead54 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	return try_release_extent_buffer(page);
 }
 
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e7e7afb4a872..6bca9472f313 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2957,7 +2957,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 		if (page->index > end_index ||
 		   (page->index == end_index && !pg_offset)) {
-			page->mapping->a_ops->invalidatepage(page, 0);
+			page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 			unlock_page(page);
 			return 0;
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a46b656d08de..4f9d16b70d3d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7493,7 +7493,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 
-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *tree;
diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..f93392e2df12 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1454,7 +1454,8 @@ static void discard_buffer(struct buffer_head * bh)
  * block_invalidatepage - invalidate part or all of a buffer-backed page
  *
  * @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
  *
  * block_invalidatepage() is called when all or part of the page has become
  * invalidated by a truncate operation.
@@ -1465,15 +1466,22 @@ static void discard_buffer(struct buffer_head * bh)
  * point.  Because the caller is about to free (and possibly reuse) those
  * blocks on-disk.
  */
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	unsigned int curr_off = 0;
+	unsigned int stop = length + offset;
 
 	BUG_ON(!PageLocked(page));
 	if (!page_has_buffers(page))
 		goto out;
 
+	/*
+	 * Check for overflow
+	 */
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
@@ -1481,6 +1489,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
 		next = bh->b_this_page;
 
 		/*
+		 * Are we still fully in range ?
+		 */
+		if (next_off > stop)
+			goto out;
+
+		/*
 		 * is this block fully invalidated?
 		 */
 		if (offset <= curr_off)
@@ -1501,6 +1515,7 @@ out:
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
+
 /*
  * We attach and possibly dirty the buffers atomically wrt
  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
@@ -2841,7 +2856,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 		 * they may have been added in ext3_writepage().  Make them
 		 * freeable here, so the page does not leak.
 		 */
-		do_invalidatepage(page, 0);
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		return 0; /* don't care */
 	}
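The new logic in block_invalidatepage() clamps the walk over the page's
buffer_heads to the invalidated byte range.  The same arithmetic, pulled out
into a stand-alone user-space sketch (illustrative only; the kernel walks
buffer_heads rather than printing offsets):

	#include <stdio.h>

	int main(void)
	{
		unsigned int block_size = 1024, page_size = 4096;
		unsigned int offset = 1024, length = 2048;	/* punch bytes 1024..3071 */
		unsigned int curr_off = 0, stop = offset + length;

		while (curr_off < page_size) {
			unsigned int next_off = curr_off + block_size;

			if (next_off > stop)	/* block sticks out past the range: stop */
				break;
			if (offset <= curr_off)	/* block lies fully inside: discard it */
				printf("discard block at byte %u\n", curr_off);
			curr_off = next_off;
		}
		return 0;
	}

With 1k blocks this discards only the buffers at bytes 1024 and 2048 and
leaves the first and last blocks of the page intact, which is exactly the
partial-page case punch hole needs on a blocksize < pagesize filesystem.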
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac101040..38b5c1bc6776 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
  * dirty page counters appropriately.  Only called if there is private
  * data on the page.
  */
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
 	if (!PageDirty(page))
 		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
 	ci = ceph_inode(inode);
-	if (offset == 0) {
-		dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
-		     inode, page, page->index, offset);
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
+		dout("%p invalidatepage %p idx %lu full dirty page\n",
+		     inode, page, page->index);
 		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 		ceph_put_snap_context(snapc);
 		page->private = 0;
 		ClearPagePrivate(page);
 	} else {
-		dout("%p invalidatepage %p idx %lu partial dirty page\n",
-		     inode, page, page->index);
+		dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+		     inode, page, page->index, offset, length);
 	}
 }
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 48b29d24c9f4..4d8ba8d491e5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3546,11 +3546,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
 	return cifs_fscache_release_page(page, gfp);
 }
 
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd8828..2ec8eb1ab269 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
 	return 0;
 }
 
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
-	EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+		     page->index, offset, length);
 	WARN_ON(1);
 }
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c712825640..f67668f724ba 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
-	trace_ext3_invalidatepage(page, offset);
+	trace_ext3_invalidatepage(page, offset, length);
 
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
-	journal_invalidatepage(journal, page, offset);
+	journal_invalidatepage(journal, page, offset, length);
 }
 
 static int ext3_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 692de13e3596..cea8ecf3e76e 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
 				(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
 					+((char *)de - bh->b_data))) {
-			/* On error, skip the f_pos to the next block. */
-			dir_file->f_pos = (dir_file->f_pos |
-					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse (bh);
-			return count;
+			/* silently ignore the rest of the block */
+			break;
 		}
 		ext3fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..58339393fa6e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 
 static inline int test_root(ext4_group_t a, int b)
 {
-	int num = b;
-
-	while (a > num)
-		num *= b;
-	return num == a;
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
 }
 
 static int ext4_group_sparse(ext4_group_t group)
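The rewritten test_root() decides whether group a is an exact power of b
without the overflow risk of the old version, which multiplied num upward
until it reached or passed a.  Dividing a down by b instead fails fast on the
first non-zero remainder and never wraps.  The same loop, restated as a small
self-checking user-space program (illustrative only):

	#include <assert.h>

	static int test_root(unsigned int a, unsigned int b)
	{
		while (1) {
			if (a < b)
				return 0;
			if (a == b)
				return 1;
			if ((a % b) != 0)
				return 0;
			a = a / b;
		}
	}

	int main(void)
	{
		assert(test_root(49, 7));	/* 7^2 */
		assert(test_root(343, 7));	/* 7^3 */
		assert(!test_root(50, 7));
		assert(!test_root(1, 7));	/* callers handle small groups separately */
		return 0;
	}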
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4af03ea84aa3..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,38 +177,28 @@ struct ext4_map_blocks {
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
-/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
-#define EXT4_IO_END_ERROR	0x0002
-#define EXT4_IO_END_DIRECT	0x0004
+#define EXT4_IO_END_DIRECT	0x0002
 
 /*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
+	handle_t		*handle;	/* handle reserved for extent
+						 * conversion */
 	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
 	unsigned int		flag;		/* unwritten or not */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -581,11 +571,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 
 /*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
-
-/*
  * ioctl commands
  */
 #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -879,6 +864,7 @@ struct ext4_inode_info {
 	rwlock_t i_es_lock;
 	struct list_head i_es_lru;
 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
+	unsigned long i_touch_when;	/* jiffies of last accessing */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -903,12 +889,22 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed IOs that might need unwritten extents handling */
-	struct list_head i_completed_io_list;
+	/* Lock protecting lists below */
 	spinlock_t i_completed_io_lock;
+	/*
+	 * Completed IOs that need unwritten extents handling and have
+	 * transaction reserved
+	 */
+	struct list_head i_rsv_conversion_list;
+	/*
+	 * Completed IOs that need unwritten extents handling and don't have
+	 * transaction reserved
+	 */
+	struct list_head i_unrsv_conversion_list;
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	atomic_t i_unwritten;	/* Nr. of inflight conversions pending */
-	struct work_struct i_unwritten_work;	/* deferred extent conversion */
+	struct work_struct i_rsv_conversion_work;
+	struct work_struct i_unrsv_conversion_work;
 
 	spinlock_t i_block_reservation_lock;
 
@@ -1245,7 +1241,6 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
-	unsigned int s_max_writeback_mb_bump;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1281,8 +1276,10 @@ struct ext4_sb_info {
 	struct flex_groups *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
-	/* workqueue for dio unwritten */
-	struct workqueue_struct *dio_unwritten_wq;
+	/* workqueue for unreserved extent convertions (dio) */
+	struct workqueue_struct *unrsv_conversion_wq;
+	/* workqueue for reserved extent conversions (buffered io) */
+	struct workqueue_struct *rsv_conversion_wq;
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
@@ -1307,6 +1304,7 @@ struct ext4_sb_info {
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
 	struct list_head s_es_lru;
+	unsigned long s_es_last_sorted;
 	struct percpu_counter s_extent_cache_cnt;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 };
@@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 					       struct ext4_io_end *io_end)
 {
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		/* Writeback has to have coversion transaction reserved */
+		WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+			!(io_end->flag & EXT4_IO_END_DIRECT));
 		io_end->flag |= EXT4_IO_END_UNWRITTEN;
 		atomic_inc(&EXT4_I(inode)->i_unwritten);
 	}
@@ -1999,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				  const struct iovec *iov, loff_t offset,
 				  unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
 extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
 				 ext4_lblk_t first, ext4_lblk_t stop);
@@ -2166,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
 				    ext4_group_t ngroup);
 extern const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16]);
+
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_error(sb, message...)	__ext4_error(sb, __func__, \
-						     __LINE__, ## message)
 extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
 		      const char *, ...);
 extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
 		     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern __printf(4, 5)
 void __ext4_abort(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \
-						     __LINE__, ## message)
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
 		    const char *, ...);
-#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
 			   const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
-						       __LINE__, msg)
 extern __printf(7, 8)
 void __ext4_grp_locked_error(const char *, unsigned int,
 			     struct super_block *, ext4_group_t,
 			     unsigned long, ext4_fsblk_t,
 			     const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
-	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...)					\
+	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...)					\
+	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...)					\
+	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...)					\
+	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+				fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_inode(inode, "", 0, block, " ");			\
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_file(file, "", 0, block, " ");			\
+} while (0)
+#define ext4_error(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error(sb, "", 0, " ");					\
+} while (0)
+#define ext4_abort(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_abort(sb, "", 0, " ");					\
+} while (0)
+#define ext4_warning(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_warning(sb, "", 0, " ");					\
+} while (0)
+#define ext4_msg(sb, level, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_msg(sb, "", " ");					\
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");	\
+} while (0)
+
+#endif
+
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 				      __u32 compat);
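The !CONFIG_PRINTK variants above shrink the object size by dropping the
__func__, __LINE__ and format-string arguments, while the no_printk() call
preserves the compiler's printf-style type checking without emitting any
code.  The shape of that trick, reduced to a toy example (my_log/__my_log
are made-up names, not ext4 functions):

	#include <stdarg.h>
	#include <stdio.h>

	void __my_log(const char *func, int line, const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		printf("%s:%d: ", func, line);
		vprintf(fmt, args);
		va_end(args);
	}

	/* like the kernel's no_printk(): arguments are type-checked, nothing runs */
	#define no_printk(fmt, ...) \
		do { if (0) printf(fmt, ##__VA_ARGS__); } while (0)

	#ifdef MY_PRINTK
	#define my_log(fmt, ...) __my_log(__func__, __LINE__, fmt, ##__VA_ARGS__)
	#else
	#define my_log(fmt, ...)			\
	do {						\
		no_printk(fmt, ##__VA_ARGS__);		\
		__my_log("", 0, " ");			\
	} while (0)
	#endif

	int main(void)
	{
		my_log("resized to %d groups\n", 42);
		return 0;
	}

The per-call-site strings and line numbers vanish from the image, but a
mistyped format argument still produces the usual compiler warning.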
@@ -2312,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 {
 	struct ext4_group_info ***grp_info;
 	long indexv, indexh;
+	BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
 	grp_info = EXT4_SB(sb)->s_group_info;
 	indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2598,8 +2656,7 @@ struct ext4_extent;
 
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
-				       int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2609,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-					  ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2650,12 +2707,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       struct page *page,
@@ -2668,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
 extern int ext4_mmp_csum_verify(struct super_block *sb,
 				struct mmp_struct *mmp);
 
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
 	 = BH_JBDPrivateStart,
 	BH_AllocFromCluster,	/* allocated blocks were part of already
-				 * allocated cluster. Note that this flag will
-				 * never, ever appear in a buffer_head's state
-				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
-				 * this is used. */
+				 * allocated cluster. */
 };
 
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
 /*
  * Wrappers for jbd2_journal_start/end.
  */
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
 {
 	journal_t *journal;
 
 	might_sleep();
-
-	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
+		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
 	journal = EXT4_SB(sb)->s_journal;
-	if (!journal)
-		return ext4_get_nojournal();
 	/*
 	 * Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly.
 	 */
-	if (is_journal_aborted(journal)) {
+	if (journal && is_journal_aborted(journal)) {
 		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+		return -EROFS;
 	}
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 	return err;
 }
 
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
+}
+
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn, struct buffer_head *bh,
 			       handle_t *handle, int err)
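Together with the ext4_jbd2.h changes that follow, this gives buffered
writeback a two-phase pattern: reserve journal credits when the write is
submitted, then turn the reservation into a live handle at I/O completion,
where blocking on journal space is no longer acceptable.  Roughly (a
simplified sketch of the intended calling sequence based on these helpers,
not a verbatim excerpt; error handling omitted):

	/* submission: start a normal handle plus rsv_blocks of reserved credits */
	handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE,
						 needed_blocks, rsv_blocks);
	io_end->handle = handle->h_rsv_handle;	/* stash the reserved handle */
	handle->h_rsv_handle = NULL;
	...
	ext4_journal_stop(handle);

	/* completion: activate the reservation for the extent conversion */
	handle = ext4_journal_start_reserved(io_end->handle,
					     EXT4_HT_EXT_CONVERT);
	ext4_convert_unwritten_extents(handle, inode, offset, size);
	ext4_journal_stop(handle);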
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
 #define EXT4_HT_MIGRATE			 8
 #define EXT4_HT_MOVE_EXTENTS		 9
 #define EXT4_HT_XATTR			10
-#define EXT4_HT_MAX			11
+#define EXT4_HT_EXT_CONVERT		11
+#define EXT4_HT_MAX			12
 
 /**
  * struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
 	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks);
+				  int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks)			\
-	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
 
 #define ext4_journal_start(inode, type, nblocks)			\
-	__ext4_journal_start((inode), __LINE__, (type), (nblocks))
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
 					     unsigned int line, int type,
-					     int nblocks)
+					     int blocks, int rsv_blocks)
 {
-	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
 	__ext4_journal_stop(__func__, __LINE__, (handle))
 
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
+
 static inline handle_t *ext4_journal_current_handle(void)
 {
 	return journal_current_handle();
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc0f1910b9cf..7097b0f680e6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 			next_del = ext4_find_delayed_extent(inode, &es);
 			if (!exists && next_del) {
 				exists = 1;
-				flags |= FIEMAP_EXTENT_DELALLOC;
+				flags |= (FIEMAP_EXTENT_DELALLOC |
+					  FIEMAP_EXTENT_UNKNOWN);
 			}
 			up_read(&EXT4_I(inode)->i_data_sem);
 
@@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
  *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
  *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
  */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
 	int index;
 	int depth;
@@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 
 	depth = ext_depth(inode);
 
-	if (chunk)
+	if (extents <= 1)
 		index = depth * 2;
 	else
 		index = depth * 3;
@@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 	return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = 0;
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-	else if (ext4_should_journal_data(inode))
-		flags |= EXT4_FREE_BLOCKS_FORGET;
+	int flags = get_default_free_blocks_flags(inode);
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
@@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
@@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store negative
+		 * cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-	} else {
-		printk(KERN_INFO "strange request: removal(2) "
-		       "%u-%u from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), ee_len);
-	}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
2454 2463
@@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
  * @handle: The journal handle
  * @inode:  The files inode
  * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   has been released from it. It gets negative in case
+ *                   that the cluster is still used.
  * @start:  The first block to remove
  * @end:    The last block to remove
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+		 struct ext4_ext_path *path,
+		 long long *partial_cluster,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		return -EIO;
 	}
 	/* find where to start removing */
-	ex = EXT_LAST_EXTENT(eh);
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 		/* If this extent is beyond the end of the hole, skip it */
 		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so if this extent is not cluster aligned we have
+			 * to mark the current cluster as used to avoid
+			 * accidentally freeing it later on
+			 */
+			pblk = ext4_ext_pblock(ex);
+			if (pblk & (sbi->s_cluster_ratio - 1))
+				*partial_cluster =
+					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 					sizeof(struct ext4_extent));
 			}
 			le16_add_cpu(&eh->eh_entries, -1);
-		} else
+		} else if (*partial_cluster > 0)
 			*partial_cluster = 0;
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * If there is still a entry in the leaf node, check to see if
-	 * it references the partial cluster.  This is the only place
-	 * where it could; if it doesn't, we can free the cluster.
+	 * Free the partial cluster only if the current extent does not
+	 * reference it. Otherwise we might free used cluster.
 	 */
-	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	if (*partial_cluster > 0 &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
@@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	ext4_fsblk_t partial_cluster = 0;
+	long long partial_cluster = 0;
 	handle_t *handle;
 	int i = 0, err = 0;
 
@@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		return PTR_ERR(handle);
 
 again:
-	trace_ext4_ext_remove_space(inode, start, depth);
+	trace_ext4_ext_remove_space(inode, start, end, depth);
 
 	/*
 	 * Check if we are removing extents inside the extent tree. If that
@@ -2844,17 +2866,14 @@ again:
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-			path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
 
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
-	if (partial_cluster && path->p_hdr->eh_entries == 0) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -4363,7 +4382,7 @@ out2:
 	}
 
 out3:
-	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+	trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
 
 	return err ? err : allocated;
 }
@@ -4446,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return ext4_punch_hole(file, offset, len);
+		return ext4_punch_hole(inode, offset, len);
 
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
@@ -4548,10 +4567,9 @@ retry:
  * function, to convert the fallocated extents after IO is completed.
  * Returns 0 on success.
  */
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				    ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+				   loff_t offset, ssize_t len)
 {
-	handle_t *handle;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
@@ -4566,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
 	/*
-	 * credits to insert 1 extent into extent tree
+	 * This is somewhat ugly but the idea is clear: When transaction is
+	 * reserved, everything goes into it. Otherwise we rather start several
+	 * smaller transactions for conversion of each extent separately.
 	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		credits = 0;
+	} else {
+		/*
+		 * credits to insert 1 extent into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	}
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
+		if (credits) {
+			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+						    credits);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				break;
+			}
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 				     inode->i_ino, map.m_lblk,
 				     map.m_len, ret);
 		ext4_mark_inode_dirty(handle, inode);
-		ret2 = ext4_journal_stop(handle);
-		if (ret <= 0 || ret2 )
+		if (credits)
+			ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
 			break;
 	}
+	if (!credits)
+		ret2 = ext4_journal_stop(handle);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -4659,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		error = ext4_get_inode_loc(inode, &iloc);
 		if (error)
 			return error;
-		physical = iloc.bh->b_blocknr << blockbits;
+		physical = (__u64)iloc.bh->b_blocknr << blockbits;
 		offset = EXT4_GOOD_OLD_INODE_SIZE +
 				EXT4_I(inode)->i_extra_isize;
 		physical += offset;
@@ -4667,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		flags |= FIEMAP_EXTENT_DATA_INLINE;
 		brelse(iloc.bh);
 	} else { /* external block */
-		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
 		length = inode->i_sb->s_blocksize;
 	}
 
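[Editor's sketch, not part of the patch] The widening of partial_cluster from ext4_fsblk_t to long long in the hunks above is what lets one variable carry three states across ext4_remove_blocks()/ext4_ext_rm_leaf(): a positive value names a cluster whose leftover blocks may still need freeing, a negative value (the cluster number negated) marks a cluster known to be used by another extent, and zero means nothing is outstanding. A minimal userspace sketch of that sign convention, with hypothetical names:

#include <stdio.h>

/*
 * Illustrative only -- mirrors the sign convention the patch adopts:
 *   > 0 : cluster whose remaining blocks may still need freeing
 *   < 0 : cluster known to be used by another extent (stored negated)
 *     0 : no partial cluster outstanding
 */
typedef long long partial_cluster_t;

static void mark_pending(partial_cluster_t *pc, long long cluster) { *pc = cluster; }
static void mark_in_use(partial_cluster_t *pc, long long cluster)  { *pc = -cluster; }

static int may_free(partial_cluster_t pc, long long cluster)
{
	/* Only a positive, matching entry is safe to free. */
	return pc > 0 && pc == cluster;
}

int main(void)
{
	partial_cluster_t pc = 0;

	mark_pending(&pc, 42);
	printf("free cluster 42? %d\n", may_free(pc, 42)); /* 1 */
	mark_in_use(&pc, 42);
	printf("free cluster 42? %d\n", may_free(pc, 42)); /* 0 */
	return 0;
}

An unsigned type could not distinguish "free still pending" from "known to be in use", which is why the signature change ripples through the whole call chain above.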
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e6941e622d31..ee018d5f397e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
  * Ext4 extents status tree core functions.
  */
 #include <linux/rbtree.h>
+#include <linux/list_sort.h>
 #include "ext4.h"
 #include "extents_status.h"
 #include "ext4_extents.h"
@@ -291,7 +292,6 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
@@ -672,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
 
 	return err;
@@ -734,7 +733,6 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
 	return found;
 }
@@ -878,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
 				     EXTENT_STATUS_WRITTEN);
 }
 
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+				     struct list_head *b)
+{
+	struct ext4_inode_info *eia, *eib;
+	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+	if (eia->i_touch_when == eib->i_touch_when)
+		return 0;
+	if (time_after(eia->i_touch_when, eib->i_touch_when))
+		return 1;
+	else
+		return -1;
+}
+
 static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct ext4_sb_info *sbi = container_of(shrink,
 				struct ext4_sb_info, s_es_shrinker);
 	struct ext4_inode_info *ei;
-	struct list_head *cur, *tmp, scanned;
+	struct list_head *cur, *tmp;
+	LIST_HEAD(skiped);
 	int nr_to_scan = sc->nr_to_scan;
 	int ret, nr_shrunk = 0;
 
@@ -893,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	if (!nr_to_scan)
 		return ret;
 
-	INIT_LIST_HEAD(&scanned);
-
 	spin_lock(&sbi->s_es_lru_lock);
+
+	/*
+	 * If the inode that is at the head of LRU list is newer than
+	 * last_sorted time, that means that we need to sort this list.
+	 */
+	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+	if (sbi->s_es_last_sorted < ei->i_touch_when) {
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+	}
+
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
-		list_move_tail(cur, &scanned);
+		/*
+		 * If we have already reclaimed all extents from extent
+		 * status tree, just stop the loop immediately.
+		 */
+		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+			break;
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		read_lock(&ei->i_es_lock);
-		if (ei->i_es_lru_nr == 0) {
-			read_unlock(&ei->i_es_lock);
+		/* Skip the inode that is newer than the last_sorted time */
+		if (sbi->s_es_last_sorted < ei->i_touch_when) {
+			list_move_tail(cur, &skiped);
 			continue;
 		}
-		read_unlock(&ei->i_es_lock);
+
+		if (ei->i_es_lru_nr == 0)
+			continue;
 
 		write_lock(&ei->i_es_lock);
 		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		if (ei->i_es_lru_nr == 0)
+			list_del_init(&ei->i_es_lru);
 		write_unlock(&ei->i_es_lock);
 
 		nr_shrunk += ret;
@@ -917,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		if (nr_to_scan == 0)
 			break;
 	}
-	list_splice_tail(&scanned, &sbi->s_es_lru);
+
+	/* Move the newer inodes into the tail of the LRU list. */
+	list_splice_tail(&skiped, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 
 	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
@@ -925,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	return ret;
 }
 
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 {
-	struct ext4_sb_info *sbi;
-
-	sbi = EXT4_SB(sb);
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
+	sbi->s_es_last_sorted = 0;
 	sbi->s_es_shrinker.shrink = ext4_es_shrink;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&sbi->s_es_shrinker);
 }
 
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
-	unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+	unregister_shrinker(&sbi->s_es_shrinker);
 }
 
 void ext4_es_lru_add(struct inode *inode)
@@ -947,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode)
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
+	ei->i_touch_when = jiffies;
+
+	if (!list_empty(&ei->i_es_lru))
+		return;
+
 	spin_lock(&sbi->s_es_lru_lock);
 	if (list_empty(&ei->i_es_lru))
 		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
-	else
-		list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 }
 
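[Editor's sketch, not part of the patch] The comparator added for list_sort() above follows the usual tri-state contract: return negative, zero, or positive to order a before, level with, or after b, so the LRU list ends up sorted oldest i_touch_when first and the shrinker can skip recently touched inodes. (The kernel version uses time_after() so it stays correct across jiffies wraparound.) A userspace analogue of the same comparator shape, with qsort() standing in for list_sort():

#include <stdio.h>
#include <stdlib.h>

struct inode_info {
	unsigned long i_touch_when;	/* jiffies-like timestamp */
};

/* Same tri-state shape as the patch's comparator: oldest entries first. */
static int touch_time_cmp(const void *a, const void *b)
{
	const struct inode_info *ia = a, *ib = b;

	if (ia->i_touch_when == ib->i_touch_when)
		return 0;
	return ia->i_touch_when > ib->i_touch_when ? 1 : -1;
}

int main(void)
{
	struct inode_info v[] = { { 30 }, { 10 }, { 20 } };

	qsort(v, sizeof(v) / sizeof(v[0]), sizeof(v[0]), touch_time_cmp);
	for (size_t i = 0; i < 3; i++)
		printf("%lu\n", v[i].i_touch_when);	/* 10 20 30 */
	return 0;
}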
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f740eb03b707..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
 				 EXTENT_STATUS_DELAYED | \
 				 EXTENT_STATUS_HOLE)
 
+struct ext4_sb_info;
 struct ext4_extent;
 
 struct extent_status {
@@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
 	es->es_pblk = block;
 }
 
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
 extern void ext4_es_lru_del(struct inode *inode);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b1b4d51b5d86..b19f0a457f32 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 	blkbits = inode->i_sb->s_blocksize_bits;
 	startoff = *offset;
 	lastoff = startoff;
-	endoff = (map->m_lblk + map->m_len) << blkbits;
+	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
 
 	index = startoff >> PAGE_CACHE_SHIFT;
 	end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		}
 
 		last++;
-		dataoff = last << blkbits;
+		dataoff = (loff_t)last << blkbits;
 	} while (last <= end);
 
 	mutex_unlock(&inode->i_mutex);
@@ -540,7 +540,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			last += ret;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -551,7 +551,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			last = es.es_lblk + es.es_len;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -566,7 +566,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 					      &map, &holeoff);
 		if (!unwritten) {
 			last += ret;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 	}
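[Editor's sketch, not part of the patch] Every hunk in this file is the same class of fix: `last` is a 32-bit logical block number, so `last << blkbits` is evaluated in 32-bit arithmetic and wraps for offsets at or beyond 4 GiB with 4 KiB blocks; casting to loff_t first makes the shift happen in 64 bits. A small demonstration of the difference:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t last = 1200000;	/* a block ~4.6 GiB into the file */
	unsigned int blkbits = 12;	/* 4 KiB blocks */

	/* Shift happens in 32 bits and wraps before the widening assignment. */
	long long wrong = last << blkbits;
	/* The fix: widen first, then shift in 64 bits. */
	long long right = (long long)last << blkbits;

	printf("wrong: %lld\n", wrong);	/*  620232704 */
	printf("right: %lld\n", right);	/* 4915200000 */
	return 0;
}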
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a408def..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
 	return ret;
 }
 
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode:	inode to sync
- * @datasync:	only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking.  This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly.  The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = sync_inode_metadata(inode, 1);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret, err;
+	int ret = 0, err;
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_ext4_sync_file_enter(file, datasync);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_sb->s_flags & MS_RDONLY)
-		goto out;
-
-	ret = ext4_flush_unwritten_io(inode);
-	if (ret < 0)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			ret = -EROFS;
 		goto out;
+	}
 
 	if (!journal) {
-		ret = __sync_inode(inode, datasync);
+		ret = generic_file_fsync(file, start, end, datasync);
 		if (!ret && !hlist_empty(&inode->i_dentry))
 			ret = ext4_sync_parent(inode);
 		goto out;
 	}
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
 	/*
 	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		if (!ret)
 			ret = err;
 	}
- out:
-	mutex_unlock(&inode->i_mutex);
+out:
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
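[Editor's sketch, not part of the patch] The new read-only branch relies on a barrier pairing: the abort path sets EXT4_MF_FS_ABORTED, issues a write barrier, and only then marks the superblock read-only, so a reader that observes MS_RDONLY and then executes smp_rmb() is guaranteed to also observe the aborted flag. A userspace C11 analogue of that publish/observe pattern, with hypothetical names (release/acquire stand in for smp_wmb()/smp_rmb()):

#include <stdatomic.h>
#include <stdio.h>

static int fs_aborted;			/* plain flag, published below */
static atomic_int sb_rdonly;

static void abort_fs(void)		/* writer side */
{
	fs_aborted = 1;
	/* release ordering plays the role of smp_wmb() */
	atomic_store_explicit(&sb_rdonly, 1, memory_order_release);
}

static int fsync_check(void)		/* reader side */
{
	if (atomic_load_explicit(&sb_rdonly, memory_order_acquire)) {
		/* acquire ordering plays the role of smp_rmb() */
		return fs_aborted ? -30 /* EROFS */ : 0;
	}
	return 0;
}

int main(void)
{
	abort_fs();
	printf("fsync -> %d\n", fsync_check());	/* -30 */
	return 0;
}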
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d67b54..f03598c6ffd3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -747,7 +747,8 @@ repeat_in_this_group:
 		if (!handle) {
 			BUG_ON(nblocks <= 0);
 			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
-							 handle_type, nblocks);
+							 handle_type, nblocks,
+							 0);
 			if (IS_ERR(handle)) {
 				err = PTR_ERR(handle);
 				ext4_std_error(sb, err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d351e24f..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -624,7 +624,7 @@ cleanup:
 		partial--;
 	}
 out:
-	trace_ext4_ind_map_blocks_exit(inode, map, err);
+	trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
 	return err;
 }
 
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode)) {
-		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
-			mutex_lock(&inode->i_mutex);
-			ext4_flush_unwritten_io(inode);
-			mutex_unlock(&inode->i_mutex);
-		}
 		/*
 		 * Nolock dioread optimization may be dynamically disabled
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
 {
-	int indirects;
-
-	/* if nrblocks are contiguous */
-	if (chunk) {
-		/*
-		 * With N contiguous data blocks, we need at most
-		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
-		 * 2 dindirect blocks, and 1 tindirect block
-		 */
-		return DIV_ROUND_UP(nrblocks,
-				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
-	}
 	/*
-	 * if nrblocks are not contiguous, worse case, each block touch
-	 * a indirect block, and each indirect block touch a double indirect
-	 * block, plus a triple indirect block
+	 * With N contiguous data blocks, we need at most
+	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+	 * 2 dindirect blocks, and 1 tindirect block
 	 */
-	indirects = nrblocks * 2 + 1;
-	return indirects;
+	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 
 /*
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			     __le32 *last)
 {
 	__le32 *p;
-	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
 	int	err;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA;
+		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+	else if (ext4_should_journal_data(inode))
+		flags |= EXT4_FREE_BLOCKS_FORGET;
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
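[Editor's sketch, not part of the patch] The collapsed ext4_ind_trans_blocks() keeps only the contiguous-run bound spelled out in its comment: N contiguous data blocks touch at most N/EXT4_ADDR_PER_BLOCK + 1 indirect blocks, two double-indirect blocks, and one triple-indirect block. A quick userspace check of that arithmetic, assuming 4 KiB blocks (1024 block addresses per indirect block):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Worst case for N contiguous blocks: N/addr_per_block indirect blocks
 * rounded up, plus 1 more indirect, 2 dindirect, 1 tindirect -> + 4. */
static int ind_trans_blocks(int nrblocks, int addr_per_block)
{
	return DIV_ROUND_UP(nrblocks, addr_per_block) + 4;
}

int main(void)
{
	/* 4 KiB blocks, 4-byte block numbers -> 1024 addresses per block */
	printf("%d\n", ind_trans_blocks(1, 1024));	/* 5 */
	printf("%d\n", ind_trans_blocks(1024, 1024));	/* 5 */
	printf("%d\n", ind_trans_blocks(1025, 1024));	/* 6 */
	return 0;
}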
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 1a346a6bdc8f..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
 		entry = (struct ext4_xattr_entry *)
 			((void *)raw_inode + EXT4_I(inode)->i_inline_off);
 
-		free += le32_to_cpu(entry->e_value_size);
+		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
 		goto out;
 	}
 
@@ -1810,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
 	if (error)
 		goto out;
 
-	physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
 	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
 	physical += offsetof(struct ext4_inode, i_block);
 	length = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b89ecbd..0188e65e1f58 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 						   new_size);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)
 			filemap_write_and_wait(&inode->i_data);
 		}
 		truncate_inode_pages(&inode->i_data, 0);
-		ext4_ioend_shutdown(inode);
+
+		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
 	}
 
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
-	ext4_ioend_shutdown(inode);
 
+	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
 		goto no_delete;
 
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
 #define check_block_validity(inode, map)	\
 	__check_block_validity((inode), __func__, __LINE__, (map))
 
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
-				    unsigned int max_pages)
-{
-	struct address_space *mapping = inode->i_mapping;
-	pgoff_t	index;
-	struct pagevec pvec;
-	pgoff_t num = 0;
-	int i, nr_pages, done = 0;
-
-	if (max_pages == 0)
-		return 0;
-	pagevec_init(&pvec, 0);
-	while (!done) {
-		index = idx;
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      (pgoff_t)PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-			struct buffer_head *bh, *head;
-
-			lock_page(page);
-			if (unlikely(page->mapping != mapping) ||
-			    !PageDirty(page) ||
-			    PageWriteback(page) ||
-			    page->index != idx) {
-				done = 1;
-				unlock_page(page);
-				break;
-			}
-			if (page_has_buffers(page)) {
-				bh = head = page_buffers(page);
-				do {
-					if (!buffer_delay(bh) &&
-					    !buffer_unwritten(bh))
-						done = 1;
-					bh = bh->b_this_page;
-				} while (!done && (bh != head));
-			}
-			unlock_page(page);
-			if (done)
-				break;
-			idx++;
-			num++;
-			if (num >= max_pages) {
-				done = 1;
-				break;
-			}
-		}
-		pagevec_release(&pvec);
-	}
-	return num;
-}
-
 #ifdef ES_AGGRESSIVE_TEST
 static void ext4_map_blocks_es_recheck(handle_t *handle,
 				       struct inode *inode,
@@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
573 "logical block %lu\n", inode->i_ino, flags, map->m_len, 514 "logical block %lu\n", inode->i_ino, flags, map->m_len,
574 (unsigned long) map->m_lblk); 515 (unsigned long) map->m_lblk);
575 516
517 ext4_es_lru_add(inode);
518
576 /* Lookup extent status tree firstly */ 519 /* Lookup extent status tree firstly */
577 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 520 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
578 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 521 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
@@ -1118,10 +1061,13 @@ static int ext4_write_end(struct file *file,
 		}
 	}
 
-	if (ext4_has_inline_data(inode))
-		copied = ext4_write_inline_data_end(inode, pos, len,
-						    copied, page);
-	else
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0)
+			goto errout;
+		copied = ret;
+	} else
 		copied = block_write_end(file, mapping, pos,
 					 len, copied, page, fsdata);
 
@@ -1157,8 +1103,6 @@ static int ext4_write_end(struct file *file,
 	if (i_size_changed)
 		ext4_mark_inode_dirty(handle, inode);
 
-	if (copied < 0)
-		ret = copied;
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
@@ -1415,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
-					     unsigned long offset)
+					     unsigned int offset,
+					     unsigned int length)
 {
 	int to_release = 0;
 	struct buffer_head *head, *bh;
 	unsigned int curr_off = 0;
 	struct inode *inode = page->mapping->host;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned int stop = offset + length;
 	int num_clusters;
 	ext4_fsblk_t lblk;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
 		unsigned int next_off = curr_off + bh->b_size;
 
+		if (next_off > stop)
+			break;
+
 		if ((offset <= curr_off) && (buffer_delay(bh))) {
 			to_release++;
 			clear_buffer_delay(bh);
@@ -1460,140 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page,
  * Delayed allocation stuff
  */
 
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
-			      struct ext4_map_blocks *map)
-{
-	struct pagevec pvec;
-	unsigned long index, end;
-	int ret = 0, err, nr_pages, i;
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	loff_t size = i_size_read(inode);
-	unsigned int len, block_start;
-	struct buffer_head *bh, *page_bufs = NULL;
-	sector_t pblock = 0, cur_logical = 0;
-	struct ext4_io_submit io_submit;
+struct mpage_da_data {
+	struct inode *inode;
+	struct writeback_control *wbc;
 
-	BUG_ON(mpd->next_page <= mpd->first_page);
-	memset(&io_submit, 0, sizeof(io_submit));
+	pgoff_t first_page;	/* The first page to write */
+	pgoff_t next_page;	/* Current page to examine */
+	pgoff_t last_page;	/* Last page to examine */
 	/*
-	 * We need to start from the first_page to the next_page - 1
-	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->b_blocknr we would only be looking
-	 * at the currently mapped buffer_heads.
+	 * Extent to map - this can be after first_page because that can be
+	 * fully mapped. We somewhat abuse m_flags to store whether the extent
+	 * is delalloc or unwritten.
 	 */
-	index = mpd->first_page;
-	end = mpd->next_page - 1;
-
-	pagevec_init(&pvec, 0);
-	while (index <= end) {
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			int skip_page = 0;
-			struct page *page = pvec.pages[i];
-
-			index = page->index;
-			if (index > end)
-				break;
-
-			if (index == size >> PAGE_CACHE_SHIFT)
-				len = size & ~PAGE_CACHE_MASK;
-			else
-				len = PAGE_CACHE_SIZE;
-			if (map) {
-				cur_logical = index << (PAGE_CACHE_SHIFT -
-							inode->i_blkbits);
-				pblock = map->m_pblk + (cur_logical -
-							map->m_lblk);
-			}
-			index++;
-
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-
-			bh = page_bufs = page_buffers(page);
-			block_start = 0;
-			do {
-				if (map && (cur_logical >= map->m_lblk) &&
-				    (cur_logical <= (map->m_lblk +
-						     (map->m_len - 1)))) {
-					if (buffer_delay(bh)) {
-						clear_buffer_delay(bh);
-						bh->b_blocknr = pblock;
-					}
-					if (buffer_unwritten(bh) ||
-					    buffer_mapped(bh))
-						BUG_ON(bh->b_blocknr != pblock);
-					if (map->m_flags & EXT4_MAP_UNINIT)
-						set_buffer_uninit(bh);
-					clear_buffer_unwritten(bh);
-				}
-
-				/*
-				 * skip page if block allocation undone and
-				 * block is dirty
-				 */
-				if (ext4_bh_delay_or_unwritten(NULL, bh))
-					skip_page = 1;
-				bh = bh->b_this_page;
-				block_start += bh->b_size;
-				cur_logical++;
-				pblock++;
-			} while (bh != page_bufs);
-
-			if (skip_page) {
-				unlock_page(page);
-				continue;
-			}
-
-			clear_page_dirty_for_io(page);
-			err = ext4_bio_write_page(&io_submit, page, len,
-						  mpd->wbc);
-			if (!err)
-				mpd->pages_written++;
-			/*
-			 * In error case, we have to continue because
-			 * remaining pages are still locked
-			 */
-			if (ret == 0)
-				ret = err;
-		}
-		pagevec_release(&pvec);
-	}
-	ext4_io_submit(&io_submit);
-	return ret;
-}
+	struct ext4_map_blocks map;
+	struct ext4_io_submit io_submit;	/* IO submission data */
+};
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+				       bool invalidate)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
-	ext4_lblk_t start, last;
+
+	/* This is necessary when next_page == 0. */
+	if (mpd->first_page >= mpd->next_page)
+		return;
 
 	index = mpd->first_page;
 	end   = mpd->next_page - 1;
-
-	start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	ext4_es_remove_extent(inode, start, last - start + 1);
+	if (invalidate) {
+		ext4_lblk_t start, last;
+		start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		ext4_es_remove_extent(inode, start, last - start + 1);
+	}
 
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
@@ -1606,14 +1460,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 			break;
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
-			block_invalidatepage(page, 0);
-			ClearPageUptodate(page);
+			if (invalidate) {
+				block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+				ClearPageUptodate(page);
+			}
 			unlock_page(page);
 		}
 		index = pvec.pages[nr_pages - 1]->index + 1;
 		pagevec_release(&pvec);
 	}
-	return;
 }
 
 static void ext4_print_free_blocks(struct inode *inode)
@@ -1642,215 +1497,6 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
-/*
- * mpage_da_map_and_submit - go through given space, map them
- * if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
-	int err, blks, get_blocks_flags;
-	struct ext4_map_blocks map, *mapp = NULL;
-	sector_t next = mpd->b_blocknr;
-	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
-	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	/*
-	 * If the blocks are mapped already, or we couldn't accumulate
-	 * any blocks, then proceed immediately to the submission stage.
-	 */
-	if ((mpd->b_size == 0) ||
-	    ((mpd->b_state & (1 << BH_Mapped)) &&
-	     !(mpd->b_state & (1 << BH_Delay)) &&
-	     !(mpd->b_state & (1 << BH_Unwritten))))
-		goto submit_io;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-
-	/*
-	 * Call ext4_map_blocks() to allocate any delayed allocation
-	 * blocks, or to convert an uninitialized extent to be
-	 * initialized (in the case where we have written into
-	 * one or more preallocated blocks).
-	 *
-	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
-	 * indicate that we are on the delayed allocation path. This
-	 * affects functions in many different parts of the allocation
-	 * call path. This flag exists primarily because we don't
-	 * want to change *many* call functions, so ext4_map_blocks()
-	 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
-	 * inode's allocation semaphore is taken.
-	 *
-	 * If the blocks in questions were delalloc blocks, set
-	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
-	 * variables are updated after the blocks have been allocated.
-	 */
-	map.m_lblk = next;
-	map.m_len = max_blocks;
-	/*
-	 * We're in delalloc path and it is possible that we're going to
-	 * need more metadata blocks than previously reserved. However
-	 * we must not fail because we're in writeback and there is
-	 * nothing we can do about it so it might result in data loss.
-	 * So use reserved blocks to allocate metadata if possible.
-	 */
-	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
-			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
-	if (ext4_should_dioread_nolock(mpd->inode))
-		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-	if (mpd->b_state & (1 << BH_Delay))
-		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-
-	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
-	if (blks < 0) {
-		struct super_block *sb = mpd->inode->i_sb;
-
-		err = blks;
-		/*
-		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will just let
-		 * mpage_da_submit_io() unlock all of the pages.
-		 */
-		if (err == -EAGAIN)
-			goto submit_io;
-
-		if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
-			mpd->retval = err;
-			goto submit_io;
-		}
-
-		/*
-		 * get block failure will cause us to loop in
-		 * writepages, because a_ops->writepage won't be able
-		 * to make progress. The page will be redirtied by
-		 * writepage and writepages will again try to write
-		 * the same.
-		 */
-		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
-			ext4_msg(sb, KERN_CRIT,
-				 "delayed block allocation failed for inode %lu "
-				 "at logical offset %llu with max blocks %zd "
-				 "with error %d", mpd->inode->i_ino,
-				 (unsigned long long) next,
-				 mpd->b_size >> mpd->inode->i_blkbits, err);
-			ext4_msg(sb, KERN_CRIT,
-				"This should not happen!! Data will be lost");
-			if (err == -ENOSPC)
-				ext4_print_free_blocks(mpd->inode);
-		}
-		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd);
-
-		/* Mark this page range as having been completed */
-		mpd->io_done = 1;
-		return;
-	}
-	BUG_ON(blks == 0);
-
-	mapp = &map;
-	if (map.m_flags & EXT4_MAP_NEW) {
-		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
-		int i;
-
-		for (i = 0; i < map.m_len; i++)
-			unmap_underlying_metadata(bdev, map.m_pblk + i);
-	}
-
-	/*
-	 * Update on-disk size along with block allocation.
-	 */
-	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
-	if (disksize > i_size_read(mpd->inode))
-		disksize = i_size_read(mpd->inode);
-	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
-		ext4_update_i_disksize(mpd->inode, disksize);
-		err = ext4_mark_inode_dirty(handle, mpd->inode);
-		if (err)
-			ext4_error(mpd->inode->i_sb,
-				   "Failed to mark inode %lu dirty",
-				   mpd->inode->i_ino);
-	}
-
-submit_io:
-	mpage_da_submit_io(mpd, mapp);
-	mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
-		(1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
-				   unsigned long b_state)
-{
-	sector_t next;
-	int blkbits = mpd->inode->i_blkbits;
-	int nrblocks = mpd->b_size >> blkbits;
-
-	/*
-	 * XXX Don't go larger than mballoc is willing to allocate
-	 * This is a stopgap solution.  We eventually need to fold
-	 * mpage_da_submit_io() into this function and then call
-	 * ext4_map_blocks() multiple times in a loop
-	 */
-	if (nrblocks >= (8*1024*1024 >> blkbits))
-		goto flush_it;
-
-	/* check if the reserved journal credits might overflow */
-	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
-		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
-			/*
-			 * With non-extent format we are limited by the journal
-			 * credit available.  Total credit needed to insert
-			 * nrblocks contiguous blocks is dependent on the
-			 * nrblocks.  So limit nrblocks.
-			 */
-			goto flush_it;
-		}
-	}
-	/*
-	 * First block in the extent
-	 */
-	if (mpd->b_size == 0) {
-		mpd->b_blocknr = logical;
-		mpd->b_size = 1 << blkbits;
-		mpd->b_state = b_state & BH_FLAGS;
-		return;
-	}
-
-	next = mpd->b_blocknr + nrblocks;
-	/*
-	 * Can we merge the block to our big extent?
-	 */
-	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-		mpd->b_size += 1 << blkbits;
-		return;
-	}
-
-flush_it:
-	/*
-	 * We couldn't merge the block to our extent, so we
-	 * need to flush current extent and start new one
-	 */
-	mpage_da_map_and_submit(mpd);
-	return;
-}
-
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
 	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1883,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1883 "logical block %lu\n", inode->i_ino, map->m_len, 1529 "logical block %lu\n", inode->i_ino, map->m_len,
1884 (unsigned long) map->m_lblk); 1530 (unsigned long) map->m_lblk);
1885 1531
1532 ext4_es_lru_add(inode);
1533
1886 /* Lookup extent status tree firstly */ 1534 /* Lookup extent status tree firstly */
1887 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1535 if (ext4_es_lookup_extent(inode, iblock, &es)) {
1888 1536
@@ -2156,7 +1804,7 @@ out:
2156 * lock so we have to do some magic. 1804 * lock so we have to do some magic.
2157 * 1805 *
2158 * This function can get called via... 1806 * This function can get called via...
2159 * - ext4_da_writepages after taking page lock (have journal handle) 1807 * - ext4_writepages after taking page lock (have journal handle)
2160 * - journal_submit_inode_data_buffers (no journal handle) 1808 * - journal_submit_inode_data_buffers (no journal handle)
2161 * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1809 * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
2162 * - grab_page_cache when doing write_begin (have journal handle) 1810 * - grab_page_cache when doing write_begin (have journal handle)
@@ -2234,76 +1882,405 @@ static int ext4_writepage(struct page *page,
2234 */ 1882 */
2235 return __ext4_journalled_writepage(page, len); 1883 return __ext4_journalled_writepage(page, len);
2236 1884
2237 memset(&io_submit, 0, sizeof(io_submit)); 1885 ext4_io_submit_init(&io_submit, wbc);
1886 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
1887 if (!io_submit.io_end) {
1888 redirty_page_for_writepage(wbc, page);
1889 unlock_page(page);
1890 return -ENOMEM;
1891 }
2238 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 1892 ret = ext4_bio_write_page(&io_submit, page, len, wbc);
2239 ext4_io_submit(&io_submit); 1893 ext4_io_submit(&io_submit);
1894 /* Drop io_end reference we got from init */
1895 ext4_put_io_end_defer(io_submit.io_end);
2240 return ret; 1896 return ret;
2241} 1897}
2242 1898
1899#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
1900
2243/* 1901/*
2244 * This is called via ext4_da_writepages() to 1902 * mballoc gives us at most this number of blocks...
2245 * calculate the total number of credits to reserve to fit 1903 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
2246 * a single extent allocation into a single transaction; 1904 * The rest of mballoc seems to handle chunks up to full group size.
2247 * ext4_da_writepages() will loop calling this before
2248 * the block allocation.
2249 */ 1905 */
1906#define MAX_WRITEPAGES_EXTENT_LEN 2048
2250 1907
2251static int ext4_da_writepages_trans_blocks(struct inode *inode) 1908/*
1909 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
1910 *
1911 * @mpd - extent of blocks
1912 * @lblk - logical number of the block in the file
1913 * @b_state - b_state of the buffer head added
1914 *
 1915 * the function is used to collect contiguous blocks in the same state
1916 */
1917static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1918 unsigned long b_state)
1919{
1920 struct ext4_map_blocks *map = &mpd->map;
1921
1922 /* Don't go larger than mballoc is willing to allocate */
1923 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1924 return 0;
1925
1926 /* First block in the extent? */
1927 if (map->m_len == 0) {
1928 map->m_lblk = lblk;
1929 map->m_len = 1;
1930 map->m_flags = b_state & BH_FLAGS;
1931 return 1;
1932 }
1933
1934 /* Can we merge the block to our big extent? */
1935 if (lblk == map->m_lblk + map->m_len &&
1936 (b_state & BH_FLAGS) == map->m_flags) {
1937 map->m_len++;
1938 return 1;
1939 }
1940 return 0;
1941}
1942
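
The new helper is pure accumulate-or-reject logic: start an extent on the first block, grow it while blocks stay contiguous and share the delay/unwritten state, otherwise return 0 so the caller maps what it already has. A minimal user-space sketch of that contract, with illustrative types rather than the kernel's struct ext4_map_blocks:

#include <stdbool.h>

#define MAX_EXTENT_LEN 2048		/* mirrors MAX_WRITEPAGES_EXTENT_LEN */

struct extent {
	unsigned long lblk;		/* first logical block */
	unsigned long len;		/* number of blocks, 0 == empty */
	unsigned long state;		/* delay/unwritten style flags */
};

/* Returns true if the block was absorbed, false if the caller must
 * map and submit the current extent before retrying. */
static bool extent_add(struct extent *e, unsigned long lblk,
		       unsigned long state)
{
	if (e->len >= MAX_EXTENT_LEN)
		return false;		/* don't outgrow the allocator */
	if (e->len == 0) {		/* first block starts the extent */
		e->lblk = lblk;
		e->len = 1;
		e->state = state;
		return true;
	}
	if (lblk == e->lblk + e->len && state == e->state) {
		e->len++;		/* contiguous and same state: merge */
		return true;
	}
	return false;			/* discontiguous or mixed state */
}
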
1943static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
1944 struct buffer_head *head,
1945 struct buffer_head *bh,
1946 ext4_lblk_t lblk)
1947{
1948 struct inode *inode = mpd->inode;
1949 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
1950 >> inode->i_blkbits;
1951
1952 do {
1953 BUG_ON(buffer_locked(bh));
1954
1955 if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1956 (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
1957 lblk >= blocks) {
1958 /* Found extent to map? */
1959 if (mpd->map.m_len)
1960 return false;
1961 if (lblk >= blocks)
1962 return true;
1963 continue;
1964 }
1965 if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
1966 return false;
1967 } while (lblk++, (bh = bh->b_this_page) != head);
1968 return true;
1969}
1970
1971static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
2252{ 1972{
2253 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1973 int len;
1974 loff_t size = i_size_read(mpd->inode);
1975 int err;
1976
1977 BUG_ON(page->index != mpd->first_page);
1978 if (page->index == size >> PAGE_CACHE_SHIFT)
1979 len = size & ~PAGE_CACHE_MASK;
1980 else
1981 len = PAGE_CACHE_SIZE;
1982 clear_page_dirty_for_io(page);
1983 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
1984 if (!err)
1985 mpd->wbc->nr_to_write--;
1986 mpd->first_page++;
2254 1987
1988 return err;
1989}
1990
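
mpage_submit_page() writes a full page except when the page straddles i_size, where only the bytes below i_size go out. The len computation is easiest to see with numbers; a small sketch assuming 4096-byte pages (the PAGE_CACHE_* values are per-arch in the kernel):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long size = 10000;	/* i_size */

	for (unsigned long index = 0; index <= size >> PAGE_SHIFT; index++) {
		/* last page: only the tail below i_size is written */
		unsigned long len = (index == size >> PAGE_SHIFT)
			? (size & ~PAGE_MASK) : PAGE_SIZE;
		printf("page %lu: write %lu bytes\n", index, len);
	}
	/* prints 4096, 4096, 1808 */
	return 0;
}
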
1991/*
 1992 * mpage_map_and_submit_buffers - update buffers corresponding to changed extent and
1993 * submit fully mapped pages for IO
1994 *
1995 * @mpd - description of extent to map, on return next extent to map
1996 *
1997 * Scan buffers corresponding to changed extent (we expect corresponding pages
1998 * to be already locked) and update buffer state according to new extent state.
1999 * We map delalloc buffers to their physical location, clear unwritten bits,
2000 * and mark buffers as uninit when we perform writes to uninitialized extents
2001 * and do extent conversion after IO is finished. If the last page is not fully
2002 * mapped, we update @map to the next extent in the last page that needs
2003 * mapping. Otherwise we submit the page for IO.
2004 */
2005static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2006{
2007 struct pagevec pvec;
2008 int nr_pages, i;
2009 struct inode *inode = mpd->inode;
2010 struct buffer_head *head, *bh;
2011 int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
2012 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
2013 >> inode->i_blkbits;
2014 pgoff_t start, end;
2015 ext4_lblk_t lblk;
2016 sector_t pblock;
2017 int err;
2018
2019 start = mpd->map.m_lblk >> bpp_bits;
2020 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2021 lblk = start << bpp_bits;
2022 pblock = mpd->map.m_pblk;
2023
2024 pagevec_init(&pvec, 0);
2025 while (start <= end) {
2026 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
2027 PAGEVEC_SIZE);
2028 if (nr_pages == 0)
2029 break;
2030 for (i = 0; i < nr_pages; i++) {
2031 struct page *page = pvec.pages[i];
2032
2033 if (page->index > end)
2034 break;
 2035 /* Up to 'end' pages must be contiguous */
2036 BUG_ON(page->index != start);
2037 bh = head = page_buffers(page);
2038 do {
2039 if (lblk < mpd->map.m_lblk)
2040 continue;
2041 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2042 /*
2043 * Buffer after end of mapped extent.
2044 * Find next buffer in the page to map.
2045 */
2046 mpd->map.m_len = 0;
2047 mpd->map.m_flags = 0;
2048 add_page_bufs_to_extent(mpd, head, bh,
2049 lblk);
2050 pagevec_release(&pvec);
2051 return 0;
2052 }
2053 if (buffer_delay(bh)) {
2054 clear_buffer_delay(bh);
2055 bh->b_blocknr = pblock++;
2056 }
2057 clear_buffer_unwritten(bh);
2058 } while (++lblk < blocks &&
2059 (bh = bh->b_this_page) != head);
2060
2061 /*
2062 * FIXME: This is going to break if dioread_nolock
2063 * supports blocksize < pagesize as we will try to
2064 * convert potentially unmapped parts of inode.
2065 */
2066 mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
2067 /* Page fully mapped - let IO run! */
2068 err = mpage_submit_page(mpd, page);
2069 if (err < 0) {
2070 pagevec_release(&pvec);
2071 return err;
2072 }
2073 start++;
2074 }
2075 pagevec_release(&pvec);
2076 }
2077 /* Extent fully mapped and matches with page boundary. We are done. */
2078 mpd->map.m_len = 0;
2079 mpd->map.m_flags = 0;
2080 return 0;
2081}
2082
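
The page-walk above converts between logical blocks and page indexes through bpp_bits, the log2 of blocks per page. With 4k pages and 1k blocks (bpp_bits = 2), an extent of 9 blocks at m_lblk = 10 spans pages 2 through 4, and the buffer scan restarts at the first block of the first page so whole pages are walked. A quick check of that index math:

#include <stdio.h>

int main(void)
{
	int bpp_bits = 12 - 10;			/* 4k page, 1k block -> 2 */
	unsigned long m_lblk = 10, m_len = 9;

	unsigned long start = m_lblk >> bpp_bits;		/* page 2 */
	unsigned long end = (m_lblk + m_len - 1) >> bpp_bits;	/* page 4 */
	unsigned long lblk = start << bpp_bits;			/* block 8 */

	printf("pages %lu..%lu, scan buffers from block %lu\n",
	       start, end, lblk);
	return 0;
}
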
2083static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2084{
2085 struct inode *inode = mpd->inode;
2086 struct ext4_map_blocks *map = &mpd->map;
2087 int get_blocks_flags;
2088 int err;
2089
2090 trace_ext4_da_write_pages_extent(inode, map);
2255 /* 2091 /*
2256 * With non-extent format the journal credit needed to 2092 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
2444 2257 * insert nrblocks contiguous blocks is dependent on 2093 * to convert an uninitialized extent to be initialized (in the case
2445 2258 * the number of contiguous blocks. So we will limit 2094 * where we have written into one or more preallocated blocks). It is
2446 2259 * the number of contiguous blocks to a sane value 2095 * possible that we're going to need more metadata blocks than
2096 * previously reserved. However we must not fail because we're in
2097 * writeback and there is nothing we can do about it so it might result
2098 * in data loss. So use reserved blocks to allocate metadata if
2099 * possible.
2100 *
2101 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
2102 * in question are delalloc blocks. This affects functions in many
2103 * different parts of the allocation call path. This flag exists
 2104 * primarily because we don't want to change *many* callers, so
2105 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
2106 * once the inode's allocation semaphore is taken.
2260 */ 2107 */
2261 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && 2108 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2262 (max_blocks > EXT4_MAX_TRANS_DATA)) 2109 EXT4_GET_BLOCKS_METADATA_NOFAIL;
2263 max_blocks = EXT4_MAX_TRANS_DATA; 2110 if (ext4_should_dioread_nolock(inode))
2111 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2112 if (map->m_flags & (1 << BH_Delay))
2113 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2264 2114
2265 return ext4_chunk_trans_blocks(inode, max_blocks); 2115 err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2116 if (err < 0)
2117 return err;
2118 if (map->m_flags & EXT4_MAP_UNINIT) {
2119 if (!mpd->io_submit.io_end->handle &&
2120 ext4_handle_valid(handle)) {
2121 mpd->io_submit.io_end->handle = handle->h_rsv_handle;
2122 handle->h_rsv_handle = NULL;
2123 }
2124 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
2125 }
2126
2127 BUG_ON(map->m_len == 0);
2128 if (map->m_flags & EXT4_MAP_NEW) {
2129 struct block_device *bdev = inode->i_sb->s_bdev;
2130 int i;
2131
2132 for (i = 0; i < map->m_len; i++)
2133 unmap_underlying_metadata(bdev, map->m_pblk + i);
2134 }
2135 return 0;
2266} 2136}
2267 2137
2268/* 2138/*
2269 * write_cache_pages_da - walk the list of dirty pages of the given 2139 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
2270 * address space and accumulate pages that need writing, and call 2140 * mpd->len and submit pages underlying it for IO
2271 * mpage_da_map_and_submit to map a single contiguous memory region 2141 *
2272 * and then write them. 2142 * @handle - handle for journal operations
2143 * @mpd - extent to map
2144 *
 2145 * The function maps the extent starting at mpd->lblk of length mpd->len. If it
 2146 * is delayed, blocks are allocated; if it is unwritten, we may need to convert
 2147 * them to initialized or split the described range from a larger unwritten
 2148 * extent. Note that we need not map all of the described range since allocation
 2149 * can return fewer blocks or the range is covered by more unwritten extents. We
2150 * cannot map more because we are limited by reserved transaction credits. On
2151 * the other hand we always make sure that the last touched page is fully
2152 * mapped so that it can be written out (and thus forward progress is
2153 * guaranteed). After mapping we submit all mapped pages for IO.
2273 */ 2154 */
2274static int write_cache_pages_da(handle_t *handle, 2155static int mpage_map_and_submit_extent(handle_t *handle,
2275 struct address_space *mapping, 2156 struct mpage_da_data *mpd,
2276 struct writeback_control *wbc, 2157 bool *give_up_on_write)
2277 struct mpage_da_data *mpd,
2278 pgoff_t *done_index)
2279{ 2158{
2280 struct buffer_head *bh, *head; 2159 struct inode *inode = mpd->inode;
2281 struct inode *inode = mapping->host; 2160 struct ext4_map_blocks *map = &mpd->map;
2282 struct pagevec pvec; 2161 int err;
2283 unsigned int nr_pages; 2162 loff_t disksize;
2284 sector_t logical;
2285 pgoff_t index, end;
2286 long nr_to_write = wbc->nr_to_write;
2287 int i, tag, ret = 0;
2288
2289 memset(mpd, 0, sizeof(struct mpage_da_data));
2290 mpd->wbc = wbc;
2291 mpd->inode = inode;
2292 pagevec_init(&pvec, 0);
2293 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2294 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2295 2163
2296 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2164 mpd->io_submit.io_end->offset =
2165 ((loff_t)map->m_lblk) << inode->i_blkbits;
2166 while (map->m_len) {
2167 err = mpage_map_one_extent(handle, mpd);
2168 if (err < 0) {
2169 struct super_block *sb = inode->i_sb;
2170
2171 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
2172 goto invalidate_dirty_pages;
2173 /*
 2174 * Let the upper layers retry transient errors.
 2175 * In the case of ENOSPC, if ext4_count_free_clusters()
2176 * is non-zero, a commit should free up blocks.
2177 */
2178 if ((err == -ENOMEM) ||
2179 (err == -ENOSPC && ext4_count_free_clusters(sb)))
2180 return err;
2181 ext4_msg(sb, KERN_CRIT,
2182 "Delayed block allocation failed for "
2183 "inode %lu at logical offset %llu with"
2184 " max blocks %u with error %d",
2185 inode->i_ino,
2186 (unsigned long long)map->m_lblk,
2187 (unsigned)map->m_len, -err);
2188 ext4_msg(sb, KERN_CRIT,
2189 "This should not happen!! Data will "
2190 "be lost\n");
2191 if (err == -ENOSPC)
2192 ext4_print_free_blocks(inode);
2193 invalidate_dirty_pages:
2194 *give_up_on_write = true;
2195 return err;
2196 }
2197 /*
2198 * Update buffer state, submit mapped pages, and get us new
2199 * extent to map
2200 */
2201 err = mpage_map_and_submit_buffers(mpd);
2202 if (err < 0)
2203 return err;
2204 }
2205
2206 /* Update on-disk size after IO is submitted */
2207 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
2208 if (disksize > i_size_read(inode))
2209 disksize = i_size_read(inode);
2210 if (disksize > EXT4_I(inode)->i_disksize) {
2211 int err2;
2212
2213 ext4_update_i_disksize(inode, disksize);
2214 err2 = ext4_mark_inode_dirty(handle, inode);
2215 if (err2)
2216 ext4_error(inode->i_sb,
2217 "Failed to mark inode %lu dirty",
2218 inode->i_ino);
2219 if (!err)
2220 err = err2;
2221 }
2222 return err;
2223}
2224
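
The error branch in mpage_map_and_submit_extent() separates transient failures, which the caller may simply retry, from fatal ones, where the still-dirty pages get invalidated via give_up_on_write. A hedged sketch of that decision, with stand-in predicates for the EXT4_MF_FS_ABORTED test and ext4_count_free_clusters():

#include <errno.h>
#include <stdbool.h>

/* Stand-ins for the kernel-side checks, fixed for illustration. */
static bool fs_aborted(void) { return false; }
static bool commit_may_free_blocks(void) { return true; }

enum disposition { RETRY_LATER, GIVE_UP };

static enum disposition classify_map_error(int err, bool *give_up_on_write)
{
	if (fs_aborted())
		goto give_up;
	if (err == -ENOMEM)
		return RETRY_LATER;	/* transient allocation failure */
	if (err == -ENOSPC && commit_may_free_blocks())
		return RETRY_LATER;	/* a journal commit may free space */
give_up:
	*give_up_on_write = true;	/* invalidate the dirty pages */
	return GIVE_UP;
}
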
2225/*
2226 * Calculate the total number of credits to reserve for one writepages
2227 * iteration. This is called from ext4_writepages(). We map an extent of
 2228 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2229 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2230 * bpp - 1 blocks in bpp different extents.
2231 */
2232static int ext4_da_writepages_trans_blocks(struct inode *inode)
2233{
2234 int bpp = ext4_journal_blocks_per_page(inode);
2235
2236 return ext4_meta_trans_blocks(inode,
2237 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
2238}
2239
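
For the common 4k-block, 4k-page geometry bpp is 1, so one iteration reserves credits for a 2048-block extent; with 1k blocks under 4k pages bpp is 4 and the figure becomes 2051, the worst case of MAX_WRITEPAGES_EXTENT_LEN blocks plus a final partial page spread over bpp extents. Checking the formula stand-alone:

#include <stdio.h>

#define MAX_WRITEPAGES_EXTENT_LEN 2048

int main(void)
{
	/* bpp == blocks per page: 1 for 4k/4k, 4 for 1k blocks on 4k pages */
	for (int bpp = 1; bpp <= 4; bpp *= 4)
		printf("bpp=%d -> map up to %d blocks per iteration\n",
		       bpp, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1);
	return 0;
}
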
2240/*
2241 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2242 * and underlying extent to map
2243 *
2244 * @mpd - where to look for pages
2245 *
2246 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
 2247 * IO immediately. When we find a page which isn't mapped, we start accumulating
 2248 * an extent of buffers underlying these pages that needs mapping (formed by
2249 * either delayed or unwritten buffers). We also lock the pages containing
 2250 * these buffers. The extent found is returned in the @mpd structure (starting at
2251 * mpd->lblk with length mpd->len blocks).
2252 *
2253 * Note that this function can attach bios to one io_end structure which are
 2254 * neither logically nor physically contiguous. Although it may seem an
 2255 * unnecessary complication, it is actually inevitable in the blocksize < pagesize
2256 * case as we need to track IO to all buffers underlying a page in one io_end.
2257 */
2258static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2259{
2260 struct address_space *mapping = mpd->inode->i_mapping;
2261 struct pagevec pvec;
2262 unsigned int nr_pages;
2263 pgoff_t index = mpd->first_page;
2264 pgoff_t end = mpd->last_page;
2265 int tag;
2266 int i, err = 0;
2267 int blkbits = mpd->inode->i_blkbits;
2268 ext4_lblk_t lblk;
2269 struct buffer_head *head;
2270
2271 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2297 tag = PAGECACHE_TAG_TOWRITE; 2272 tag = PAGECACHE_TAG_TOWRITE;
2298 else 2273 else
2299 tag = PAGECACHE_TAG_DIRTY; 2274 tag = PAGECACHE_TAG_DIRTY;
2300 2275
2301 *done_index = index; 2276 pagevec_init(&pvec, 0);
2277 mpd->map.m_len = 0;
2278 mpd->next_page = index;
2302 while (index <= end) { 2279 while (index <= end) {
2303 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2280 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2304 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2281 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2305 if (nr_pages == 0) 2282 if (nr_pages == 0)
2306 return 0; 2283 goto out;
2307 2284
2308 for (i = 0; i < nr_pages; i++) { 2285 for (i = 0; i < nr_pages; i++) {
2309 struct page *page = pvec.pages[i]; 2286 struct page *page = pvec.pages[i];
@@ -2318,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle,
2318 if (page->index > end) 2295 if (page->index > end)
2319 goto out; 2296 goto out;
2320 2297
2321 *done_index = page->index + 1; 2298 /* If we can't merge this page, we are done. */
2322 2299 if (mpd->map.m_len > 0 && mpd->next_page != page->index)
2323 /* 2300 goto out;
2324 * If we can't merge this page, and we have
2325 * accumulated a contiguous region, write it
2326 */
2327 if ((mpd->next_page != page->index) &&
2328 (mpd->next_page != mpd->first_page)) {
2329 mpage_da_map_and_submit(mpd);
2330 goto ret_extent_tail;
2331 }
2332 2301
2333 lock_page(page); 2302 lock_page(page);
2334
2335 /* 2303 /*
2336 * If the page is no longer dirty, or its 2304 * If the page is no longer dirty, or its mapping no
2337 * mapping no longer corresponds to inode we 2305 * longer corresponds to inode we are writing (which
2338 * are writing (which means it has been 2306 * means it has been truncated or invalidated), or the
2339 * truncated or invalidated), or the page is 2307 * page is already under writeback and we are not doing
2340 * already under writeback and we are not 2308 * a data integrity writeback, skip the page
2341 * doing a data integrity writeback, skip the page
2342 */ 2309 */
2343 if (!PageDirty(page) || 2310 if (!PageDirty(page) ||
2344 (PageWriteback(page) && 2311 (PageWriteback(page) &&
2345 (wbc->sync_mode == WB_SYNC_NONE)) || 2312 (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2346 unlikely(page->mapping != mapping)) { 2313 unlikely(page->mapping != mapping)) {
2347 unlock_page(page); 2314 unlock_page(page);
2348 continue; 2315 continue;
@@ -2351,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle,
2351 wait_on_page_writeback(page); 2318 wait_on_page_writeback(page);
2352 BUG_ON(PageWriteback(page)); 2319 BUG_ON(PageWriteback(page));
2353 2320
2354 /* 2321 if (mpd->map.m_len == 0)
2355 * If we have inline data and arrive here, it means that
2356 * we will soon create the block for the 1st page, so
2357 * we'd better clear the inline data here.
2358 */
2359 if (ext4_has_inline_data(inode)) {
2360 BUG_ON(ext4_test_inode_state(inode,
2361 EXT4_STATE_MAY_INLINE_DATA));
2362 ext4_destroy_inline_data(handle, inode);
2363 }
2364
2365 if (mpd->next_page != page->index)
2366 mpd->first_page = page->index; 2322 mpd->first_page = page->index;
2367 mpd->next_page = page->index + 1; 2323 mpd->next_page = page->index + 1;
2368 logical = (sector_t) page->index <<
2369 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2370
2371 /* Add all dirty buffers to mpd */ 2324 /* Add all dirty buffers to mpd */
2325 lblk = ((ext4_lblk_t)page->index) <<
2326 (PAGE_CACHE_SHIFT - blkbits);
2372 head = page_buffers(page); 2327 head = page_buffers(page);
2373 bh = head; 2328 if (!add_page_bufs_to_extent(mpd, head, head, lblk))
2374 do { 2329 goto out;
2375 BUG_ON(buffer_locked(bh)); 2330 /* So far everything mapped? Submit the page for IO. */
2376 /* 2331 if (mpd->map.m_len == 0) {
2377 * We need to try to allocate unmapped blocks 2332 err = mpage_submit_page(mpd, page);
2378 * in the same page. Otherwise we won't make 2333 if (err < 0)
2379 * progress with the page in ext4_writepage
2380 */
2381 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2382 mpage_add_bh_to_extent(mpd, logical,
2383 bh->b_state);
2384 if (mpd->io_done)
2385 goto ret_extent_tail;
2386 } else if (buffer_dirty(bh) &&
2387 buffer_mapped(bh)) {
2388 /*
2389 * mapped dirty buffer. We need to
2390 * update the b_state because we look
2391 * at b_state in mpage_da_map_blocks.
2392 * We don't update b_size because if we
2393 * find an unmapped buffer_head later
2394 * we need to use the b_state flag of
2395 * that buffer_head.
2396 */
2397 if (mpd->b_size == 0)
2398 mpd->b_state =
2399 bh->b_state & BH_FLAGS;
2400 }
2401 logical++;
2402 } while ((bh = bh->b_this_page) != head);
2403
2404 if (nr_to_write > 0) {
2405 nr_to_write--;
2406 if (nr_to_write == 0 &&
2407 wbc->sync_mode == WB_SYNC_NONE)
2408 /*
2409 * We stop writing back only if we are
2410 * not doing integrity sync. In case of
2411 * integrity sync we have to keep going
2412 * because someone may be concurrently
2413 * dirtying pages, and we might have
2414 * synced a lot of newly appeared dirty
2415 * pages, but have not synced all of the
2416 * old dirty pages.
2417 */
2418 goto out; 2334 goto out;
2419 } 2335 }
2336
2337 /*
2338 * Accumulated enough dirty pages? This doesn't apply
2339 * to WB_SYNC_ALL mode. For integrity sync we have to
2340 * keep going because someone may be concurrently
2341 * dirtying pages, and we might have synced a lot of
2342 * newly appeared dirty pages, but have not synced all
2343 * of the old dirty pages.
2344 */
2345 if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
2346 mpd->next_page - mpd->first_page >=
2347 mpd->wbc->nr_to_write)
2348 goto out;
2420 } 2349 }
2421 pagevec_release(&pvec); 2350 pagevec_release(&pvec);
2422 cond_resched(); 2351 cond_resched();
2423 } 2352 }
2424 return 0; 2353 return 0;
2425ret_extent_tail:
2426 ret = MPAGE_DA_EXTENT_TAIL;
2427out: 2354out:
2428 pagevec_release(&pvec); 2355 pagevec_release(&pvec);
2429 cond_resched(); 2356 return err;
2430 return ret;
2431} 2357}
2432 2358
2359static int __writepage(struct page *page, struct writeback_control *wbc,
2360 void *data)
2361{
2362 struct address_space *mapping = data;
2363 int ret = ext4_writepage(page, wbc);
2364 mapping_set_error(mapping, ret);
2365 return ret;
2366}
2433 2367
2434static int ext4_da_writepages(struct address_space *mapping, 2368static int ext4_writepages(struct address_space *mapping,
2435 struct writeback_control *wbc) 2369 struct writeback_control *wbc)
2436{ 2370{
2437 pgoff_t index; 2371 pgoff_t writeback_index = 0;
2372 long nr_to_write = wbc->nr_to_write;
2438 int range_whole = 0; 2373 int range_whole = 0;
2374 int cycled = 1;
2439 handle_t *handle = NULL; 2375 handle_t *handle = NULL;
2440 struct mpage_da_data mpd; 2376 struct mpage_da_data mpd;
2441 struct inode *inode = mapping->host; 2377 struct inode *inode = mapping->host;
2442 int pages_written = 0; 2378 int needed_blocks, rsv_blocks = 0, ret = 0;
2443 unsigned int max_pages;
2444 int range_cyclic, cycled = 1, io_done = 0;
2445 int needed_blocks, ret = 0;
2446 long desired_nr_to_write, nr_to_writebump = 0;
2447 loff_t range_start = wbc->range_start;
2448 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2379 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2449 pgoff_t done_index = 0; 2380 bool done;
2450 pgoff_t end;
2451 struct blk_plug plug; 2381 struct blk_plug plug;
2382 bool give_up_on_write = false;
2452 2383
2453 trace_ext4_da_writepages(inode, wbc); 2384 trace_ext4_writepages(inode, wbc);
2454 2385
2455 /* 2386 /*
2456 * No pages to write? This is mainly a kludge to avoid starting 2387 * No pages to write? This is mainly a kludge to avoid starting
@@ -2460,164 +2391,165 @@ static int ext4_da_writepages(struct address_space *mapping,
2460 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2391 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2461 return 0; 2392 return 0;
2462 2393
2394 if (ext4_should_journal_data(inode)) {
2395 struct blk_plug plug;
2396 int ret;
2397
2398 blk_start_plug(&plug);
2399 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
2400 blk_finish_plug(&plug);
2401 return ret;
2402 }
2403
2463 /* 2404 /*
2464 * If the filesystem has aborted, it is read-only, so return 2405 * If the filesystem has aborted, it is read-only, so return
2465 * right away instead of dumping stack traces later on that 2406 * right away instead of dumping stack traces later on that
2466 * will obscure the real source of the problem. We test 2407 * will obscure the real source of the problem. We test
2467 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2408 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2468 * the latter could be true if the filesystem is mounted 2409 * the latter could be true if the filesystem is mounted
2469 * read-only, and in that case, ext4_da_writepages should 2410 * read-only, and in that case, ext4_writepages should
2470 * *never* be called, so if that ever happens, we would want 2411 * *never* be called, so if that ever happens, we would want
2471 * the stack trace. 2412 * the stack trace.
2472 */ 2413 */
2473 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2414 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2474 return -EROFS; 2415 return -EROFS;
2475 2416
2417 if (ext4_should_dioread_nolock(inode)) {
2418 /*
 2419 * We may need to convert up to one extent per block in
2420 * the page and we may dirty the inode.
2421 */
2422 rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
2423 }
2424
2425 /*
2426 * If we have inline data and arrive here, it means that
2427 * we will soon create the block for the 1st page, so
2428 * we'd better clear the inline data here.
2429 */
2430 if (ext4_has_inline_data(inode)) {
2431 /* Just inode will be modified... */
2432 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2433 if (IS_ERR(handle)) {
2434 ret = PTR_ERR(handle);
2435 goto out_writepages;
2436 }
2437 BUG_ON(ext4_test_inode_state(inode,
2438 EXT4_STATE_MAY_INLINE_DATA));
2439 ext4_destroy_inline_data(handle, inode);
2440 ext4_journal_stop(handle);
2441 }
2442
2476 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2443 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2477 range_whole = 1; 2444 range_whole = 1;
2478 2445
2479 range_cyclic = wbc->range_cyclic;
2480 if (wbc->range_cyclic) { 2446 if (wbc->range_cyclic) {
2481 index = mapping->writeback_index; 2447 writeback_index = mapping->writeback_index;
2482 if (index) 2448 if (writeback_index)
2483 cycled = 0; 2449 cycled = 0;
2484 wbc->range_start = index << PAGE_CACHE_SHIFT; 2450 mpd.first_page = writeback_index;
2485 wbc->range_end = LLONG_MAX; 2451 mpd.last_page = -1;
2486 wbc->range_cyclic = 0;
2487 end = -1;
2488 } else { 2452 } else {
2489 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2453 mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
2490 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2454 mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
2491 }
2492
2493 /*
2494 * This works around two forms of stupidity. The first is in
2495 * the writeback code, which caps the maximum number of pages
2496 * written to be 1024 pages. This is wrong on multiple
2497 * levels; different architectures have a different page size,
2498 * which changes the maximum amount of data which gets
2499 * written. Secondly, 4 megabytes is way too small. XFS
2500 * forces this value to be 16 megabytes by multiplying
2501 * nr_to_write parameter by four, and then relies on its
2502 * allocator to allocate larger extents to make them
2503 * contiguous. Unfortunately this brings us to the second
2504 * stupidity, which is that ext4's mballoc code only allocates
2505 * at most 2048 blocks. So we force contiguous writes up to
2506 * the number of dirty blocks in the inode, or
2507 * sbi->max_writeback_mb_bump, whichever is smaller.
2508 */
2509 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2510 if (!range_cyclic && range_whole) {
2511 if (wbc->nr_to_write == LONG_MAX)
2512 desired_nr_to_write = wbc->nr_to_write;
2513 else
2514 desired_nr_to_write = wbc->nr_to_write * 8;
2515 } else
2516 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2517 max_pages);
2518 if (desired_nr_to_write > max_pages)
2519 desired_nr_to_write = max_pages;
2520
2521 if (wbc->nr_to_write < desired_nr_to_write) {
2522 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2523 wbc->nr_to_write = desired_nr_to_write;
2524 } 2455 }
2525 2456
2457 mpd.inode = inode;
2458 mpd.wbc = wbc;
2459 ext4_io_submit_init(&mpd.io_submit, wbc);
2526retry: 2460retry:
2527 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2461 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2528 tag_pages_for_writeback(mapping, index, end); 2462 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
2529 2463 done = false;
2530 blk_start_plug(&plug); 2464 blk_start_plug(&plug);
2531 while (!ret && wbc->nr_to_write > 0) { 2465 while (!done && mpd.first_page <= mpd.last_page) {
2466 /* For each extent of pages we use new io_end */
2467 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2468 if (!mpd.io_submit.io_end) {
2469 ret = -ENOMEM;
2470 break;
2471 }
2532 2472
2533 /* 2473 /*
2534 * we insert one extent at a time, so we need the 2474 * We have two constraints: we find one extent to map and we
2535 * credits required for a single extent allocation. 2475 * must always write out the whole page (makes a difference when
2536 * journalled mode is currently not supported 2476 * blocksize < pagesize) so that we don't block on IO when we
2537 * by delalloc 2477 * try to write out the rest of the page. Journalled mode is
2478 * not supported by delalloc.
2538 */ 2479 */
2539 BUG_ON(ext4_should_journal_data(inode)); 2480 BUG_ON(ext4_should_journal_data(inode));
2540 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2481 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2541 2482
2542 /* start a new transaction*/ 2483 /* start a new transaction */
2543 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2484 handle = ext4_journal_start_with_reserve(inode,
2544 needed_blocks); 2485 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
2545 if (IS_ERR(handle)) { 2486 if (IS_ERR(handle)) {
2546 ret = PTR_ERR(handle); 2487 ret = PTR_ERR(handle);
2547 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2488 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2548 "%ld pages, ino %lu; err %d", __func__, 2489 "%ld pages, ino %lu; err %d", __func__,
2549 wbc->nr_to_write, inode->i_ino, ret); 2490 wbc->nr_to_write, inode->i_ino, ret);
2550 blk_finish_plug(&plug); 2491 /* Release allocated io_end */
2551 goto out_writepages; 2492 ext4_put_io_end(mpd.io_submit.io_end);
2493 break;
2552 } 2494 }
2553 2495
2554 /* 2496 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
2555 * Now call write_cache_pages_da() to find the next 2497 ret = mpage_prepare_extent_to_map(&mpd);
2556 * contiguous region of logical blocks that need 2498 if (!ret) {
2557 * blocks to be allocated by ext4 and submit them. 2499 if (mpd.map.m_len)
2558 */ 2500 ret = mpage_map_and_submit_extent(handle, &mpd,
2559 ret = write_cache_pages_da(handle, mapping, 2501 &give_up_on_write);
2560 wbc, &mpd, &done_index); 2502 else {
2561 /* 2503 /*
2562 * If we have a contiguous extent of pages and we 2504 * We scanned the whole range (or exhausted
2563 * haven't done the I/O yet, map the blocks and submit 2505 * nr_to_write), submitted what was mapped and
2564 * them for I/O. 2506 * didn't find anything needing mapping. We are
2565 */ 2507 * done.
2566 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2508 */
2567 mpage_da_map_and_submit(&mpd); 2509 done = true;
2568 ret = MPAGE_DA_EXTENT_TAIL; 2510 }
2569 } 2511 }
2570 trace_ext4_da_write_pages(inode, &mpd);
2571 wbc->nr_to_write -= mpd.pages_written;
2572
2573 ext4_journal_stop(handle); 2512 ext4_journal_stop(handle);
2574 2513 /* Submit prepared bio */
2575 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2514 ext4_io_submit(&mpd.io_submit);
2576 /* commit the transaction which would 2515 /* Unlock pages we didn't use */
2516 mpage_release_unused_pages(&mpd, give_up_on_write);
2517 /* Drop our io_end reference we got from init */
2518 ext4_put_io_end(mpd.io_submit.io_end);
2519
2520 if (ret == -ENOSPC && sbi->s_journal) {
2521 /*
2522 * Commit the transaction which would
2577 * free blocks released in the transaction 2523 * free blocks released in the transaction
2578 * and try again 2524 * and try again
2579 */ 2525 */
2580 jbd2_journal_force_commit_nested(sbi->s_journal); 2526 jbd2_journal_force_commit_nested(sbi->s_journal);
2581 ret = 0; 2527 ret = 0;
2582 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2528 continue;
2583 /* 2529 }
2584 * Got one extent; now try with the rest of the pages. 2530 /* Fatal error - ENOMEM, EIO... */
2585 * If mpd.retval is set to -EIO, the journal is aborted. 2531 if (ret)
2586 * So we don't need to write any more.
2587 */
2588 pages_written += mpd.pages_written;
2589 ret = mpd.retval;
2590 io_done = 1;
2591 } else if (wbc->nr_to_write)
2592 /*
2593 * There is no more writeout needed
2594 * or we requested a non-blocking writeout
2595 * and we found the device congested
2596 */
2597 break; 2532 break;
2598 } 2533 }
2599 blk_finish_plug(&plug); 2534 blk_finish_plug(&plug);
2600 if (!io_done && !cycled) { 2535 if (!ret && !cycled) {
2601 cycled = 1; 2536 cycled = 1;
2602 index = 0; 2537 mpd.last_page = writeback_index - 1;
2603 wbc->range_start = index << PAGE_CACHE_SHIFT; 2538 mpd.first_page = 0;
2604 wbc->range_end = mapping->writeback_index - 1;
2605 goto retry; 2539 goto retry;
2606 } 2540 }
2607 2541
2608 /* Update index */ 2542 /* Update index */
2609 wbc->range_cyclic = range_cyclic;
2610 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2543 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2611 /* 2544 /*
2612 * set the writeback_index so that range_cyclic 2545 * Set the writeback_index so that range_cyclic
2613 * mode will write it back later 2546 * mode will write it back later
2614 */ 2547 */
2615 mapping->writeback_index = done_index; 2548 mapping->writeback_index = mpd.first_page;
2616 2549
2617out_writepages: 2550out_writepages:
2618 wbc->nr_to_write -= nr_to_writebump; 2551 trace_ext4_writepages_result(inode, wbc, ret,
2619 wbc->range_start = range_start; 2552 nr_to_write - wbc->nr_to_write);
2620 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2621 return ret; 2553 return ret;
2622} 2554}
2623 2555
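
The retry label and the cycled flag implement the usual two-pass range_cyclic writeback: pass one runs from mapping->writeback_index to end of file, and only if nothing failed does pass two wrap around to cover pages 0 .. writeback_index - 1. A compact sketch of that control flow, with an illustrative callback in place of the page scanning:

/* Two-pass cyclic writeback skeleton: pass 1 covers
 * [writeback_index, EOF], pass 2 wraps to [0, writeback_index - 1]. */
static int writepages_cyclic(unsigned long writeback_index,
			     int (*write_range)(unsigned long first,
						unsigned long last))
{
	unsigned long first = writeback_index;
	unsigned long last = (unsigned long)-1;	/* -1 == up to EOF */
	int cycled = (writeback_index == 0);	/* nothing to wrap to */
	int ret;

retry:
	ret = write_range(first, last);
	if (!ret && !cycled) {
		cycled = 1;			/* wrap around exactly once */
		first = 0;
		last = writeback_index - 1;
		goto retry;
	}
	return ret;
}
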
@@ -2829,7 +2761,8 @@ static int ext4_da_write_end(struct file *file,
2829 return ret ? ret : copied; 2761 return ret ? ret : copied;
2830} 2762}
2831 2763
2832static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2764static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
2765 unsigned int length)
2833{ 2766{
2834 /* 2767 /*
2835 * Drop reserved blocks 2768 * Drop reserved blocks
@@ -2838,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2838 if (!page_has_buffers(page)) 2771 if (!page_has_buffers(page))
2839 goto out; 2772 goto out;
2840 2773
2841 ext4_da_page_release_reservation(page, offset); 2774 ext4_da_page_release_reservation(page, offset, length);
2842 2775
2843out: 2776out:
2844 ext4_invalidatepage(page, offset); 2777 ext4_invalidatepage(page, offset, length);
2845 2778
2846 return; 2779 return;
2847} 2780}
@@ -2864,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2864 * laptop_mode, not even desirable). However, to do otherwise 2797 * laptop_mode, not even desirable). However, to do otherwise
2865 * would require replicating code paths in: 2798 * would require replicating code paths in:
2866 * 2799 *
2867 * ext4_da_writepages() -> 2800 * ext4_writepages() ->
2868 * write_cache_pages() ---> (via passed in callback function) 2801 * write_cache_pages() ---> (via passed in callback function)
2869 * __mpage_da_writepage() --> 2802 * __mpage_da_writepage() -->
2870 * mpage_add_bh_to_extent() 2803 * mpage_add_bh_to_extent()
@@ -2989,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
2989 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2922 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2990} 2923}
2991 2924
2992static void ext4_invalidatepage(struct page *page, unsigned long offset) 2925static void ext4_invalidatepage(struct page *page, unsigned int offset,
2926 unsigned int length)
2993{ 2927{
2994 trace_ext4_invalidatepage(page, offset); 2928 trace_ext4_invalidatepage(page, offset, length);
2995 2929
2996 /* No journalling happens on data buffers when this function is used */ 2930 /* No journalling happens on data buffers when this function is used */
2997 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 2931 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
2998 2932
2999 block_invalidatepage(page, offset); 2933 block_invalidatepage(page, offset, length);
3000} 2934}
3001 2935
3002static int __ext4_journalled_invalidatepage(struct page *page, 2936static int __ext4_journalled_invalidatepage(struct page *page,
3003 unsigned long offset) 2937 unsigned int offset,
2938 unsigned int length)
3004{ 2939{
3005 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2940 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3006 2941
3007 trace_ext4_journalled_invalidatepage(page, offset); 2942 trace_ext4_journalled_invalidatepage(page, offset, length);
3008 2943
3009 /* 2944 /*
3010 * If it's a full truncate we just forget about the pending dirtying 2945 * If it's a full truncate we just forget about the pending dirtying
3011 */ 2946 */
3012 if (offset == 0) 2947 if (offset == 0 && length == PAGE_CACHE_SIZE)
3013 ClearPageChecked(page); 2948 ClearPageChecked(page);
3014 2949
3015 return jbd2_journal_invalidatepage(journal, page, offset); 2950 return jbd2_journal_invalidatepage(journal, page, offset, length);
3016} 2951}
3017 2952
3018/* Wrapper for aops... */ 2953/* Wrapper for aops... */
3019static void ext4_journalled_invalidatepage(struct page *page, 2954static void ext4_journalled_invalidatepage(struct page *page,
3020 unsigned long offset) 2955 unsigned int offset,
2956 unsigned int length)
3021{ 2957{
3022 WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); 2958 WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
3023} 2959}
3024 2960
3025static int ext4_releasepage(struct page *page, gfp_t wait) 2961static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3067,9 +3003,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3067 struct inode *inode = file_inode(iocb->ki_filp); 3003 struct inode *inode = file_inode(iocb->ki_filp);
3068 ext4_io_end_t *io_end = iocb->private; 3004 ext4_io_end_t *io_end = iocb->private;
3069 3005
3070 /* if not async direct IO or dio with a 0-byte write, just return */ 3006 /* if not async direct IO, just return */
3071 if (!io_end || !size) 3007 if (!io_end) {
3072 goto out; 3008 inode_dio_done(inode);
3009 if (is_async)
3010 aio_complete(iocb, ret, 0);
3011 return;
3012 }
3073 3013
3074 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3014 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3075 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3015 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3077,25 +3017,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3077 size); 3017 size);
3078 3018
3079 iocb->private = NULL; 3019 iocb->private = NULL;
3080
3081 /* if not aio dio with unwritten extents, just free io and return */
3082 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3083 ext4_free_io_end(io_end);
3084out:
3085 inode_dio_done(inode);
3086 if (is_async)
3087 aio_complete(iocb, ret, 0);
3088 return;
3089 }
3090
3091 io_end->offset = offset; 3020 io_end->offset = offset;
3092 io_end->size = size; 3021 io_end->size = size;
3093 if (is_async) { 3022 if (is_async) {
3094 io_end->iocb = iocb; 3023 io_end->iocb = iocb;
3095 io_end->result = ret; 3024 io_end->result = ret;
3096 } 3025 }
3097 3026 ext4_put_io_end_defer(io_end);
3098 ext4_add_complete_io(io_end);
3099} 3027}
3100 3028
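
ext4_end_io_dio() now only fills in the io_end and drops a reference instead of freeing outright; the submission path holds its own reference. That is plain reference counting: creator, submitter and completion each hold a ref, and whoever drops the last one frees the structure. A generic user-space sketch of the discipline (illustrative, not the ext4 types):

#include <stdatomic.h>
#include <stdlib.h>

struct io_end {
	atomic_int ref;
	/* offset, size, iocb ... elided */
};

static struct io_end *io_end_alloc(void)
{
	struct io_end *io = calloc(1, sizeof(*io));

	if (io)
		atomic_init(&io->ref, 1);	/* creator's reference */
	return io;
}

static struct io_end *io_end_get(struct io_end *io)
{
	atomic_fetch_add(&io->ref, 1);		/* e.g. for the iocb */
	return io;
}

static void io_end_put(struct io_end *io)
{
	/* Whoever drops the last reference finishes and frees;
	 * everyone else just returns, so there is no use-after-free. */
	if (atomic_fetch_sub(&io->ref, 1) == 1)
		free(io);
}
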
3101/* 3029/*
@@ -3129,6 +3057,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3129 get_block_t *get_block_func = NULL; 3057 get_block_t *get_block_func = NULL;
3130 int dio_flags = 0; 3058 int dio_flags = 0;
3131 loff_t final_size = offset + count; 3059 loff_t final_size = offset + count;
3060 ext4_io_end_t *io_end = NULL;
3132 3061
3133 /* Use the old path for reads and writes beyond i_size. */ 3062 /* Use the old path for reads and writes beyond i_size. */
3134 if (rw != WRITE || final_size > inode->i_size) 3063 if (rw != WRITE || final_size > inode->i_size)
@@ -3136,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3136 3065
3137 BUG_ON(iocb->private == NULL); 3066 BUG_ON(iocb->private == NULL);
3138 3067
3068 /*
3069 * Make all waiters for direct IO properly wait also for extent
 3070 * conversion. This also disallows a race between truncate() and
3071 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3072 */
3073 if (rw == WRITE)
3074 atomic_inc(&inode->i_dio_count);
3075
3139 /* If we do an overwrite dio, i_mutex locking can be released */ 3076 /* If we do an overwrite dio, i_mutex locking can be released */
3140 overwrite = *((int *)iocb->private); 3077 overwrite = *((int *)iocb->private);
3141 3078
3142 if (overwrite) { 3079 if (overwrite) {
3143 atomic_inc(&inode->i_dio_count);
3144 down_read(&EXT4_I(inode)->i_data_sem); 3080 down_read(&EXT4_I(inode)->i_data_sem);
3145 mutex_unlock(&inode->i_mutex); 3081 mutex_unlock(&inode->i_mutex);
3146 } 3082 }
@@ -3167,13 +3103,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3167 iocb->private = NULL; 3103 iocb->private = NULL;
3168 ext4_inode_aio_set(inode, NULL); 3104 ext4_inode_aio_set(inode, NULL);
3169 if (!is_sync_kiocb(iocb)) { 3105 if (!is_sync_kiocb(iocb)) {
3170 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3106 io_end = ext4_init_io_end(inode, GFP_NOFS);
3171 if (!io_end) { 3107 if (!io_end) {
3172 ret = -ENOMEM; 3108 ret = -ENOMEM;
3173 goto retake_lock; 3109 goto retake_lock;
3174 } 3110 }
3175 io_end->flag |= EXT4_IO_END_DIRECT; 3111 io_end->flag |= EXT4_IO_END_DIRECT;
3176 iocb->private = io_end; 3112 /*
3113 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
3114 */
3115 iocb->private = ext4_get_io_end(io_end);
3177 /* 3116 /*
3178 * we save the io structure for current async direct 3117 * we save the io structure for current async direct
3179 * IO, so that later ext4_map_blocks() could flag the 3118 * IO, so that later ext4_map_blocks() could flag the
@@ -3197,33 +3136,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3197 NULL, 3136 NULL,
3198 dio_flags); 3137 dio_flags);
3199 3138
3200 if (iocb->private)
3201 ext4_inode_aio_set(inode, NULL);
3202 /* 3139 /*
3203 * The io_end structure takes a reference to the inode, that 3140 * Put our reference to io_end. This can free the io_end structure e.g.
3204 * structure needs to be destroyed and the reference to the 3141 * in sync IO case or in case of error. It can even perform extent
3205 * inode needs to be dropped when IO is complete, even with a 0 3142 * conversion if all bios we submitted finished before we got here.
3206 * byte write, or on failure. 3143 * Note that in that case iocb->private can already be set to NULL
3207 * 3144 * here.
3208 * In the successful AIO DIO case, the io_end structure will
3209 * be destroyed and the reference to the inode will be dropped
3210 * after the end_io call back function is called.
3211 *
3212 * In the case there is 0 byte write, or error case, since VFS
3213 * direct IO won't invoke the end_io call back function, we
3214 * need to free the end_io structure here.
3215 */ 3145 */
3216 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3146 if (io_end) {
3217 ext4_free_io_end(iocb->private); 3147 ext4_inode_aio_set(inode, NULL);
3218 iocb->private = NULL; 3148 ext4_put_io_end(io_end);
3219 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3149 /*
 3150 * When no IO was submitted, ext4_end_io_dio() was not
 3151 * called, so we have to put iocb's reference.
3152 */
3153 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3154 WARN_ON(iocb->private != io_end);
3155 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3156 WARN_ON(io_end->iocb);
3157 /*
3158 * Generic code already did inode_dio_done() so we
3159 * have to clear EXT4_IO_END_DIRECT to not do it for
3160 * the second time.
3161 */
3162 io_end->flag = 0;
3163 ext4_put_io_end(io_end);
3164 iocb->private = NULL;
3165 }
3166 }
3167 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3220 EXT4_STATE_DIO_UNWRITTEN)) { 3168 EXT4_STATE_DIO_UNWRITTEN)) {
3221 int err; 3169 int err;
3222 /* 3170 /*
3223 * for the non-AIO case, since the IO is already 3171 * for the non-AIO case, since the IO is already
3224 * completed, we could do the conversion right here 3172 * completed, we could do the conversion right here
3225 */ 3173 */
3226 err = ext4_convert_unwritten_extents(inode, 3174 err = ext4_convert_unwritten_extents(NULL, inode,
3227 offset, ret); 3175 offset, ret);
3228 if (err < 0) 3176 if (err < 0)
3229 ret = err; 3177 ret = err;
@@ -3231,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3231 } 3179 }
3232 3180
3233retake_lock: 3181retake_lock:
3182 if (rw == WRITE)
3183 inode_dio_done(inode);
3234 /* take i_mutex locking again if we do an overwrite dio */ 3184 /* take i_mutex locking again if we do an overwrite dio */
3235 if (overwrite) { 3185 if (overwrite) {
3236 inode_dio_done(inode);
3237 up_read(&EXT4_I(inode)->i_data_sem); 3186 up_read(&EXT4_I(inode)->i_data_sem);
3238 mutex_lock(&inode->i_mutex); 3187 mutex_lock(&inode->i_mutex);
3239 } 3188 }
@@ -3292,6 +3241,7 @@ static const struct address_space_operations ext4_aops = {
3292 .readpage = ext4_readpage, 3241 .readpage = ext4_readpage,
3293 .readpages = ext4_readpages, 3242 .readpages = ext4_readpages,
3294 .writepage = ext4_writepage, 3243 .writepage = ext4_writepage,
3244 .writepages = ext4_writepages,
3295 .write_begin = ext4_write_begin, 3245 .write_begin = ext4_write_begin,
3296 .write_end = ext4_write_end, 3246 .write_end = ext4_write_end,
3297 .bmap = ext4_bmap, 3247 .bmap = ext4_bmap,
@@ -3307,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3307 .readpage = ext4_readpage, 3257 .readpage = ext4_readpage,
3308 .readpages = ext4_readpages, 3258 .readpages = ext4_readpages,
3309 .writepage = ext4_writepage, 3259 .writepage = ext4_writepage,
3260 .writepages = ext4_writepages,
3310 .write_begin = ext4_write_begin, 3261 .write_begin = ext4_write_begin,
3311 .write_end = ext4_journalled_write_end, 3262 .write_end = ext4_journalled_write_end,
3312 .set_page_dirty = ext4_journalled_set_page_dirty, 3263 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3322,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = {
3322 .readpage = ext4_readpage, 3273 .readpage = ext4_readpage,
3323 .readpages = ext4_readpages, 3274 .readpages = ext4_readpages,
3324 .writepage = ext4_writepage, 3275 .writepage = ext4_writepage,
3325 .writepages = ext4_da_writepages, 3276 .writepages = ext4_writepages,
3326 .write_begin = ext4_da_write_begin, 3277 .write_begin = ext4_da_write_begin,
3327 .write_end = ext4_da_write_end, 3278 .write_end = ext4_da_write_end,
3328 .bmap = ext4_bmap, 3279 .bmap = ext4_bmap,
@@ -3355,89 +3306,56 @@ void ext4_set_aops(struct inode *inode)
3355 inode->i_mapping->a_ops = &ext4_aops; 3306 inode->i_mapping->a_ops = &ext4_aops;
3356} 3307}
3357 3308
3358
3359/* 3309/*
3360 * ext4_discard_partial_page_buffers() 3310 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3361 * Wrapper function for ext4_discard_partial_page_buffers_no_lock. 3311 * up to the end of the block which corresponds to `from'.
3362 * This function finds and locks the page containing the offset 3312 * This required during truncate. We need to physically zero the tail end
3363 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. 3313 * of that block so it doesn't yield old data if the file is later grown.
3364 * Calling functions that already have the page locked should call
3365 * ext4_discard_partial_page_buffers_no_lock directly.
3366 */ 3314 */
3367int ext4_discard_partial_page_buffers(handle_t *handle, 3315int ext4_block_truncate_page(handle_t *handle,
3368 struct address_space *mapping, loff_t from, 3316 struct address_space *mapping, loff_t from)
3369 loff_t length, int flags)
3370{ 3317{
3318 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3319 unsigned length;
3320 unsigned blocksize;
3371 struct inode *inode = mapping->host; 3321 struct inode *inode = mapping->host;
3372 struct page *page;
3373 int err = 0;
3374 3322
3375 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3323 blocksize = inode->i_sb->s_blocksize;
3376 mapping_gfp_mask(mapping) & ~__GFP_FS); 3324 length = blocksize - (offset & (blocksize - 1));
3377 if (!page)
3378 return -ENOMEM;
3379
3380 err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3381 from, length, flags);
3382 3325
3383 unlock_page(page); 3326 return ext4_block_zero_page_range(handle, mapping, from, length);
3384 page_cache_release(page);
3385 return err;
3386} 3327}
3387 3328
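
The wrapper reduces truncate-tail zeroing to a single range zero: blocksize - (offset & (blocksize - 1)) is exactly the distance from 'from' to the end of its block. Worked numbers, assuming 4096-byte blocks:

#include <stdio.h>

int main(void)
{
	unsigned blocksize = 4096;
	unsigned long from = 5000;	/* new i_size */
	unsigned offset = from & (blocksize - 1);		  /* 904 */
	unsigned length = blocksize - (offset & (blocksize - 1)); /* 3192 */

	printf("zero %u bytes starting %u bytes into the block\n",
	       length, offset);
	return 0;
}
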
3388/* 3329/*
3389 * ext4_discard_partial_page_buffers_no_lock() 3330 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3390 * Zeros a page range of length 'length' starting from offset 'from'. 3331 * starting from file offset 'from'. The range to be zeroed must
3391 * Buffer heads that correspond to the block aligned regions of the 3332 * be contained within one block. If the specified range exceeds
3392 * zeroed range will be unmapped. Non-block-aligned regions 3333 * the end of the block it will be shortened to the end of the block
3393 * will have the corresponding buffer head mapped if needed so that 3334 * that corresponds to 'from'
3394 * the region of the page can be updated with the partial zero out.
3395 *
3396 * This function assumes that the page has already been locked. 3335 */
3397 * The range to be discarded must be contained within the given page. 3336int ext4_block_zero_page_range(handle_t *handle,
3398 * If the specified range exceeds the end of the page it will be shortened
3399 * to the end of the page that corresponds to 'from'. This function is
3400 * appropriate for updating a page and its buffer heads to be unmapped and 3341 unsigned blocksize, max, pos;
3401 * zeroed for blocks that have been either released, or are going to be
3402 * released.
3403 *
3404 * handle: The journal handle
3405 * inode: The files inode
3406 * page: A locked page that contains the offset "from"
3407 * from: The starting byte offset (from the beginning of the file)
3408 * to begin discarding
3409 * len: The length of bytes to discard
3410 * flags: Optional flags that may be used:
3411 *
3412 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3413 * Only zero the regions of the page whose buffer heads
3414 * have already been unmapped. This flag is appropriate
3415 * for updating the contents of a page whose blocks may
3416 * have already been released, and we only want to zero
3417 * out the regions that correspond to those released blocks.
3418 *
3419 * Returns zero on success or negative on failure.
3420 */ 3335 */
3421static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 3336int ext4_block_zero_page_range(handle_t *handle,
3422 struct inode *inode, struct page *page, loff_t from, 3337 struct address_space *mapping, loff_t from, loff_t length)
3423 loff_t length, int flags)
3424{ 3338{
3425 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3339 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3426 unsigned int offset = from & (PAGE_CACHE_SIZE-1); 3340 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3427 unsigned int blocksize, max, pos; 3341 unsigned blocksize, max, pos;
3428 ext4_lblk_t iblock; 3342 ext4_lblk_t iblock;
3343 struct inode *inode = mapping->host;
3429 struct buffer_head *bh; 3344 struct buffer_head *bh;
3345 struct page *page;
3430 int err = 0; 3346 int err = 0;
3431 3347
3432 blocksize = inode->i_sb->s_blocksize; 3348 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3433 max = PAGE_CACHE_SIZE - offset; 3349 mapping_gfp_mask(mapping) & ~__GFP_FS);
3350 if (!page)
3351 return -ENOMEM;
3434 3352
3435 if (index != page->index) 3353 blocksize = inode->i_sb->s_blocksize;
3436 return -EINVAL; 3354 max = blocksize - (offset & (blocksize - 1));
3437 3355
3438 /* 3356 /*
3439 * correct length if it does not fall between 3357 * correct length if it does not fall between
3440 * 'from' and the end of the page 3358 * 'from' and the end of the block
3441 */ 3359 */
3442 if (length > max || length < 0) 3360 if (length > max || length < 0)
3443 length = max; 3361 length = max;
@@ -3455,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3455 iblock++; 3373 iblock++;
3456 pos += blocksize; 3374 pos += blocksize;
3457 } 3375 }
3458 3376 if (buffer_freed(bh)) {
3459 pos = offset; 3377 BUFFER_TRACE(bh, "freed: skip");
3460 while (pos < offset + length) { 3378 goto unlock;
3461 unsigned int end_of_block, range_to_discard; 3379 }
3462 3380 if (!buffer_mapped(bh)) {
3463 err = 0; 3381 BUFFER_TRACE(bh, "unmapped");
3464 3382 ext4_get_block(inode, iblock, bh, 0);
3465 /* The length of space left to zero and unmap */ 3383 /* unmapped? It's a hole - nothing to do */
3466 range_to_discard = offset + length - pos;
3467
3468 /* The length of space until the end of the block */
3469 end_of_block = blocksize - (pos & (blocksize-1));
3470
3471 /*
3472 * Do not unmap or zero past end of block
3473 * for this buffer head
3474 */
3475 if (range_to_discard > end_of_block)
3476 range_to_discard = end_of_block;
3477
3478
3479 /*
3480 * Skip this buffer head if we are only zeroing unmapped
3481 * regions of the page
3482 */
3483 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3484 buffer_mapped(bh))
3485 goto next;
3486
3487 /* If the range is block aligned, unmap */
3488 if (range_to_discard == blocksize) {
3489 clear_buffer_dirty(bh);
3490 bh->b_bdev = NULL;
3491 clear_buffer_mapped(bh);
3492 clear_buffer_req(bh);
3493 clear_buffer_new(bh);
3494 clear_buffer_delay(bh);
3495 clear_buffer_unwritten(bh);
3496 clear_buffer_uptodate(bh);
3497 zero_user(page, pos, range_to_discard);
3498 BUFFER_TRACE(bh, "Buffer discarded");
3499 goto next;
3500 }
3501
3502 /*
3503 * If this block is not completely contained in the range
3504 * to be discarded, then it is not going to be released. Because
3505 * we need to keep this block, we need to make sure this part
3506 * of the page is uptodate before we modify it by writing
3507 * partial zeros on it.
3508 */
3509 if (!buffer_mapped(bh)) { 3384 if (!buffer_mapped(bh)) {
3510 /* 3385 BUFFER_TRACE(bh, "still unmapped");
3511 * Buffer head must be mapped before we can read 3386 goto unlock;
3512 * from the block
3513 */
3514 BUFFER_TRACE(bh, "unmapped");
3515 ext4_get_block(inode, iblock, bh, 0);
3516 /* unmapped? It's a hole - nothing to do */
3517 if (!buffer_mapped(bh)) {
3518 BUFFER_TRACE(bh, "still unmapped");
3519 goto next;
3520 }
3521 } 3387 }
3388 }
3522 3389
3523 /* Ok, it's mapped. Make sure it's up-to-date */ 3390 /* Ok, it's mapped. Make sure it's up-to-date */
3524 if (PageUptodate(page)) 3391 if (PageUptodate(page))
3525 set_buffer_uptodate(bh); 3392 set_buffer_uptodate(bh);
3526 3393
3527 if (!buffer_uptodate(bh)) { 3394 if (!buffer_uptodate(bh)) {
3528 err = -EIO; 3395 err = -EIO;
3529 ll_rw_block(READ, 1, &bh); 3396 ll_rw_block(READ, 1, &bh);
3530 wait_on_buffer(bh); 3397 wait_on_buffer(bh);
3531 /* Uhhuh. Read error. Complain and punt. */ 3398 /* Uhhuh. Read error. Complain and punt. */
3532 if (!buffer_uptodate(bh)) 3399 if (!buffer_uptodate(bh))
3533 goto next; 3400 goto unlock;
3534 } 3401 }
3402 if (ext4_should_journal_data(inode)) {
3403 BUFFER_TRACE(bh, "get write access");
3404 err = ext4_journal_get_write_access(handle, bh);
3405 if (err)
3406 goto unlock;
3407 }
3408 zero_user(page, offset, length);
3409 BUFFER_TRACE(bh, "zeroed end of block");
3535 3410
3536 if (ext4_should_journal_data(inode)) { 3411 if (ext4_should_journal_data(inode)) {
3537 BUFFER_TRACE(bh, "get write access"); 3412 err = ext4_handle_dirty_metadata(handle, inode, bh);
3538 err = ext4_journal_get_write_access(handle, bh); 3413 } else {
3539 if (err) 3414 err = 0;
3540 goto next; 3415 mark_buffer_dirty(bh);
3541 } 3416 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
3417 err = ext4_jbd2_file_inode(handle, inode);
3418 }
3419
3420unlock:
3421 unlock_page(page);
3422 page_cache_release(page);
3423 return err;
3424}
3542 3425
3543 zero_user(page, pos, range_to_discard); 3426int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3427 loff_t lstart, loff_t length)
3428{
3429 struct super_block *sb = inode->i_sb;
3430 struct address_space *mapping = inode->i_mapping;
3431 unsigned partial_start, partial_end;
3432 ext4_fsblk_t start, end;
3433 loff_t byte_end = (lstart + length - 1);
3434 int err = 0;
3544 3435
3545 err = 0; 3436 partial_start = lstart & (sb->s_blocksize - 1);
3546 if (ext4_should_journal_data(inode)) { 3437 partial_end = byte_end & (sb->s_blocksize - 1);
3547 err = ext4_handle_dirty_metadata(handle, inode, bh);
3548 } else
3549 mark_buffer_dirty(bh);
3550 3438
3551 BUFFER_TRACE(bh, "Partial buffer zeroed"); 3439 start = lstart >> sb->s_blocksize_bits;
3552next: 3440 end = byte_end >> sb->s_blocksize_bits;
3553 bh = bh->b_this_page;
3554 iblock++;
3555 pos += range_to_discard;
3556 }
3557 3441
3442 /* Handle partial zero within the single block */
3443 if (start == end &&
3444 (partial_start || (partial_end != sb->s_blocksize - 1))) {
3445 err = ext4_block_zero_page_range(handle, mapping,
3446 lstart, length);
3447 return err;
3448 }
3449 /* Handle partial zero out on the start of the range */
3450 if (partial_start) {
3451 err = ext4_block_zero_page_range(handle, mapping,
3452 lstart, sb->s_blocksize);
3453 if (err)
3454 return err;
3455 }
3456 /* Handle partial zero out on the end of the range */
3457 if (partial_end != sb->s_blocksize - 1)
3458 err = ext4_block_zero_page_range(handle, mapping,
3459 byte_end - partial_end,
3460 partial_end + 1);
3558 return err; 3461 return err;
3559} 3462}
3560 3463
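
The replacement logic above is worth spelling out: ext4_zero_partial_blocks() reduces the old per-page buffer walk to pure block arithmetic, since a byte range can leave at most two partial blocks, one at each end. A minimal userspace sketch of that decision, assuming only a power-of-two block size (the helper and its printf reporting are illustrative, not kernel code):

#include <stdio.h>

/* Which partial blocks does zeroing [lstart, lstart+length) touch?
 * Mirrors the ext4_zero_partial_blocks() arithmetic shown above. */
static void partial_blocks(unsigned long long lstart, unsigned long long length,
			   unsigned blocksize)
{
	unsigned long long byte_end = lstart + length - 1;
	unsigned partial_start = lstart & (blocksize - 1);
	unsigned partial_end = byte_end & (blocksize - 1);
	unsigned long long start = lstart / blocksize;
	unsigned long long end = byte_end / blocksize;

	if (start == end && (partial_start || partial_end != blocksize - 1)) {
		/* whole range inside one block: a single zeroing call */
		printf("zero %llu..%llu within block %llu\n",
		       lstart, byte_end, start);
		return;
	}
	if (partial_start)			/* unaligned head */
		printf("zero head from %llu to end of its block\n", lstart);
	if (partial_end != blocksize - 1)	/* unaligned tail */
		printf("zero tail %llu..%llu\n",
		       byte_end - partial_end, byte_end);
}

int main(void)
{
	partial_blocks(1000, 5000, 4096);	/* head and tail both partial */
	return 0;
}
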
@@ -3580,14 +3483,12 @@ int ext4_can_truncate(struct inode *inode)
3580 * Returns: 0 on success or negative on failure 3483 * Returns: 0 on success or negative on failure
3581 */ 3484 */
3582 3485
3583int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3486int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3584{ 3487{
3585 struct inode *inode = file_inode(file);
3586 struct super_block *sb = inode->i_sb; 3488 struct super_block *sb = inode->i_sb;
3587 ext4_lblk_t first_block, stop_block; 3489 ext4_lblk_t first_block, stop_block;
3588 struct address_space *mapping = inode->i_mapping; 3490 struct address_space *mapping = inode->i_mapping;
3589 loff_t first_page, last_page, page_len; 3491 loff_t first_block_offset, last_block_offset;
3590 loff_t first_page_offset, last_page_offset;
3591 handle_t *handle; 3492 handle_t *handle;
3592 unsigned int credits; 3493 unsigned int credits;
3593 int ret = 0; 3494 int ret = 0;
@@ -3638,23 +3539,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3638 offset; 3539 offset;
3639 } 3540 }
3640 3541
3641 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 3542 first_block_offset = round_up(offset, sb->s_blocksize);
3642 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 3543 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
3643 3544
3644 first_page_offset = first_page << PAGE_CACHE_SHIFT; 3545 /* Now release the pages and zero the block-aligned part of pages */
3645 last_page_offset = last_page << PAGE_CACHE_SHIFT; 3546 if (last_block_offset > first_block_offset)
3646 3547 truncate_pagecache_range(inode, first_block_offset,
3647 /* Now release the pages */ 3548 last_block_offset);
3648 if (last_page_offset > first_page_offset) {
3649 truncate_pagecache_range(inode, first_page_offset,
3650 last_page_offset - 1);
3651 }
3652 3549
3653 /* Wait for all existing dio workers; newcomers will block on i_mutex */ 3550 /* Wait for all existing dio workers; newcomers will block on i_mutex */
3654 ext4_inode_block_unlocked_dio(inode); 3551 ext4_inode_block_unlocked_dio(inode);
3655 ret = ext4_flush_unwritten_io(inode);
3656 if (ret)
3657 goto out_dio;
3658 inode_dio_wait(inode); 3552 inode_dio_wait(inode);
3659 3553
3660 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3554 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3668,66 +3562,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3668 goto out_dio; 3562 goto out_dio;
3669 } 3563 }
3670 3564
3671 /* 3565 ret = ext4_zero_partial_blocks(handle, inode, offset,
3672 * Now we need to zero out the non-page-aligned data in the 3566 length);
3673 * pages at the start and tail of the hole, and unmap the 3567 if (ret)
3674 * buffer heads for the block aligned regions of the page that 3568 goto out_stop;
3675 * were completely zeroed.
3676 */
3677 if (first_page > last_page) {
3678 /*
3679 * If the file space being truncated is contained
3680 * within a page just zero out and unmap the middle of
3681 * that page
3682 */
3683 ret = ext4_discard_partial_page_buffers(handle,
3684 mapping, offset, length, 0);
3685
3686 if (ret)
3687 goto out_stop;
3688 } else {
3689 /*
3690 * zero out and unmap the partial page that contains
3691 * the start of the hole
3692 */
3693 page_len = first_page_offset - offset;
3694 if (page_len > 0) {
3695 ret = ext4_discard_partial_page_buffers(handle, mapping,
3696 offset, page_len, 0);
3697 if (ret)
3698 goto out_stop;
3699 }
3700
3701 /*
3702 * zero out and unmap the partial page that contains
3703 * the end of the hole
3704 */
3705 page_len = offset + length - last_page_offset;
3706 if (page_len > 0) {
3707 ret = ext4_discard_partial_page_buffers(handle, mapping,
3708 last_page_offset, page_len, 0);
3709 if (ret)
3710 goto out_stop;
3711 }
3712 }
3713
3714 /*
3715 * If i_size is contained in the last page, we need to
3716 * unmap and zero the partial page after i_size
3717 */
3718 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
3719 inode->i_size % PAGE_CACHE_SIZE != 0) {
3720 page_len = PAGE_CACHE_SIZE -
3721 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3722
3723 if (page_len > 0) {
3724 ret = ext4_discard_partial_page_buffers(handle,
3725 mapping, inode->i_size, page_len, 0);
3726
3727 if (ret)
3728 goto out_stop;
3729 }
3730 }
3731 3569
3732 first_block = (offset + sb->s_blocksize - 1) >> 3570 first_block = (offset + sb->s_blocksize - 1) >>
3733 EXT4_BLOCK_SIZE_BITS(sb); 3571 EXT4_BLOCK_SIZE_BITS(sb);
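
The unit change above is the heart of this hunk: the hole is now trimmed to filesystem blocks instead of pages, so only the fully block-aligned middle is dropped from the page cache while the ragged ends are zeroed in place. A small sketch of the rounding, with local helpers standing in for the kernel's round_up()/round_down() macros and made-up numbers:

#include <stdio.h>

static unsigned long long round_up_ull(unsigned long long x, unsigned a)
{
	return (x + a - 1) & ~(unsigned long long)(a - 1);
}

static unsigned long long round_down_ull(unsigned long long x, unsigned a)
{
	return x & ~(unsigned long long)(a - 1);
}

int main(void)
{
	unsigned blocksize = 1024;		/* e.g. 1k blocks under 4k pages */
	unsigned long long offset = 1500, length = 6000;

	unsigned long long first = round_up_ull(offset, blocksize);
	unsigned long long last = round_down_ull(offset + length, blocksize) - 1;

	/* Cache is truncated only over [first, last]; the partial blocks
	 * at 1500..2047 and 7168..7499 are zeroed in place instead. */
	printf("truncate page cache bytes %llu..%llu\n", first, last);
	return 0;
}
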
@@ -3803,7 +3641,6 @@ void ext4_truncate(struct inode *inode)
3803 unsigned int credits; 3641 unsigned int credits;
3804 handle_t *handle; 3642 handle_t *handle;
3805 struct address_space *mapping = inode->i_mapping; 3643 struct address_space *mapping = inode->i_mapping;
3806 loff_t page_len;
3807 3644
3808 /* 3645 /*
3809 * There is a possibility that we're either freeing the inode 3646 * There is a possibility that we're either freeing the inode
@@ -3830,12 +3667,6 @@ void ext4_truncate(struct inode *inode)
3830 return; 3667 return;
3831 } 3668 }
3832 3669
3833 /*
3834 * finish any pending end_io work so we won't run the risk of
3835 * converting any truncated blocks to initialized later
3836 */
3837 ext4_flush_unwritten_io(inode);
3838
3839 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3670 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3840 credits = ext4_writepage_trans_blocks(inode); 3671 credits = ext4_writepage_trans_blocks(inode);
3841 else 3672 else
@@ -3847,14 +3678,8 @@ void ext4_truncate(struct inode *inode)
3847 return; 3678 return;
3848 } 3679 }
3849 3680
3850 if (inode->i_size % PAGE_CACHE_SIZE != 0) { 3681 if (inode->i_size & (inode->i_sb->s_blocksize - 1))
3851 page_len = PAGE_CACHE_SIZE - 3682 ext4_block_truncate_page(handle, mapping, inode->i_size);
3852 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3853
3854 if (ext4_discard_partial_page_buffers(handle,
3855 mapping, inode->i_size, page_len, 0))
3856 goto out_stop;
3857 }
3858 3683
3859 /* 3684 /*
3860 * We add the inode to the orphan list, so that if this 3685 * We add the inode to the orphan list, so that if this
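
The same block-granularity idea lands in ext4_truncate() above: the old page-sized test is replaced by a mask check, since a partial tail block exists exactly when i_size has low bits set under the block mask. A worked example with an assumed 4k block size:

#include <stdio.h>

int main(void)
{
	unsigned long long i_size = 10000;	/* hypothetical file size */
	unsigned blocksize = 4096;

	/* 10000 & 4095 == 1808, so the last block is partial and bytes
	 * 10000..12287 (the rest of that block) must be zeroed. */
	if (i_size & (blocksize - 1))
		printf("zero tail %llu..%llu\n",
		       i_size, i_size | (blocksize - 1ULL));
	return 0;
}
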
@@ -4623,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
4623 inode->i_size >> PAGE_CACHE_SHIFT); 4448 inode->i_size >> PAGE_CACHE_SHIFT);
4624 if (!page) 4449 if (!page)
4625 return; 4450 return;
4626 ret = __ext4_journalled_invalidatepage(page, offset); 4451 ret = __ext4_journalled_invalidatepage(page, offset,
4452 PAGE_CACHE_SIZE - offset);
4627 unlock_page(page); 4453 unlock_page(page);
4628 page_cache_release(page); 4454 page_cache_release(page);
4629 if (ret != -EBUSY) 4455 if (ret != -EBUSY)
@@ -4805,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4805 struct kstat *stat) 4631 struct kstat *stat)
4806{ 4632{
4807 struct inode *inode; 4633 struct inode *inode;
4808 unsigned long delalloc_blocks; 4634 unsigned long long delalloc_blocks;
4809 4635
4810 inode = dentry->d_inode; 4636 inode = dentry->d_inode;
4811 generic_fillattr(inode, stat); 4637 generic_fillattr(inode, stat);
@@ -4823,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4823 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), 4649 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
4824 EXT4_I(inode)->i_reserved_data_blocks); 4650 EXT4_I(inode)->i_reserved_data_blocks);
4825 4651
4826 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4652 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
4827 return 0; 4653 return 0;
4828} 4654}
4829 4655
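
Two fixes travel together in the getattr hunk: delalloc_blocks is widened to unsigned long long, and the conversion to 512-byte sectors becomes a single shift so no 32-bit intermediate can overflow. A worked sketch of the arithmetic with an invented reservation:

#include <stdio.h>

int main(void)
{
	unsigned long long delalloc_blocks = 3000000;	/* hypothetical */
	unsigned blocksize_bits = 12;			/* 4k blocks */

	/* One shift by (bits - 9) instead of "<< bits >> 9": with a 32-bit
	 * intermediate, 3000000 << 12 (~12.3e9) would already have wrapped. */
	unsigned long long sectors = delalloc_blocks << (blocksize_bits - 9);
	printf("%llu fs blocks = %llu sectors\n", delalloc_blocks, sectors);
	return 0;
}
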
4830static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4656static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
4657 int pextents)
4831{ 4658{
4832 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4659 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4833 return ext4_ind_trans_blocks(inode, nrblocks, chunk); 4660 return ext4_ind_trans_blocks(inode, lblocks);
4834 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4661 return ext4_ext_index_trans_blocks(inode, pextents);
4835} 4662}
4836 4663
4837/* 4664/*
@@ -4845,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4845 * 4672 *
4846 * Also account for superblock, inode, quota and xattr blocks 4673 * Also account for superblock, inode, quota and xattr blocks
4847 */ 4674 */
4848static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4675static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
4676 int pextents)
4849{ 4677{
4850 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 4678 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4851 int gdpblocks; 4679 int gdpblocks;
@@ -4853,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4853 int ret = 0; 4681 int ret = 0;
4854 4682
4855 /* 4683 /*
4856 * How many index blocks need to touch to modify nrblocks? 4684 * How many index blocks need to touch to map @lblocks logical blocks
4857 * The "Chunk" flag indicating whether the nrblocks is 4685 * to @pextents physical extents?
4858 * physically contiguous on disk
4859 *
4860 * For Direct IO and fallocate, they calls get_block to allocate
4861 * one single extent at a time, so they could set the "Chunk" flag
4862 */ 4686 */
4863 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 4687 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
4864 4688
4865 ret = idxblocks; 4689 ret = idxblocks;
4866 4690
@@ -4868,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4868 * Now let's see how many group bitmaps and group descriptors need 4692 * Now let's see how many group bitmaps and group descriptors need
4869 * to account 4693 * to account
4870 */ 4694 */
4871 groups = idxblocks; 4695 groups = idxblocks + pextents;
4872 if (chunk)
4873 groups += 1;
4874 else
4875 groups += nrblocks;
4876
4877 gdpblocks = groups; 4696 gdpblocks = groups;
4878 if (groups > ngroups) 4697 if (groups > ngroups)
4879 groups = ngroups; 4698 groups = ngroups;
@@ -4904,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
4904 int bpp = ext4_journal_blocks_per_page(inode); 4723 int bpp = ext4_journal_blocks_per_page(inode);
4905 int ret; 4724 int ret;
4906 4725
4907 ret = ext4_meta_trans_blocks(inode, bpp, 0); 4726 ret = ext4_meta_trans_blocks(inode, bpp, bpp);
4908 4727
4909 /* Account for data blocks for journalled mode */ 4728 /* Account for data blocks for journalled mode */
4910 if (ext4_should_journal_data(inode)) 4729 if (ext4_should_journal_data(inode))
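
With the (nrblocks, chunk) pair replaced by an explicit (lblocks, pextents), the credit estimate can say what it means: every physical extent may dirty its own block bitmap and group descriptor. A simplified model of the calculation (constants invented; the superblock, inode and quota credits the real function adds are left out):

#include <stdio.h>

static int meta_trans_blocks(int idxblocks, int pextents,
			     int ngroups, int gdb_count)
{
	int groups = idxblocks + pextents;	/* worst-case bitmaps touched */
	int gdpblocks = groups;

	if (groups > ngroups)
		groups = ngroups;		/* can't exceed group count */
	if (gdpblocks > gdb_count)
		gdpblocks = gdb_count;		/* or descriptor blocks */

	return idxblocks + groups + gdpblocks;
}

int main(void)
{
	/* ext4_writepage_trans_blocks() now passes bpp for both arguments:
	 * each block of the page could land in its own extent. */
	int bpp = 4;				/* e.g. 1k blocks, 4k page */
	printf("credits ~ %d\n", meta_trans_blocks(1, bpp, 128, 16));
	return 0;
}
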
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index def84082a9a9..a9ff5e5137ca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2105,6 +2105,7 @@ repeat:
2105 group = ac->ac_g_ex.fe_group; 2105 group = ac->ac_g_ex.fe_group;
2106 2106
2107 for (i = 0; i < ngroups; group++, i++) { 2107 for (i = 0; i < ngroups; group++, i++) {
2108 cond_resched();
2108 /* 2109 /*
2109 * Artificially restricted ngroups for non-extent 2110 * Artificially restricted ngroups for non-extent
2110 * files makes group > ngroups possible on first loop. 2111 * files makes group > ngroups possible on first loop.
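
The one-line cond_resched() above matters on huge filesystems: the group scan can walk tens of thousands of groups without ever blocking, so it now offers the scheduler a preemption point. The shape of the idiom in userspace terms, with sched_yield() standing in for the kernel's cond_resched():

#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* A long scan with no sleeping calls: yield periodically so one
	 * thread cannot monopolize the CPU for the whole walk. */
	for (long group = 0; group < 1000000; group++) {
		if ((group & 1023) == 0)
			sched_yield();
		/* ... try to allocate from this group ... */
	}
	printf("scan complete\n");
	return 0;
}
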
@@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4405repeat: 4406repeat:
4406 /* allocate space in core */ 4407 /* allocate space in core */
4407 *errp = ext4_mb_regular_allocator(ac); 4408 *errp = ext4_mb_regular_allocator(ac);
4408 if (*errp) { 4409 if (*errp)
4409 ext4_discard_allocated_blocks(ac); 4410 goto discard_and_exit;
4410 goto errout;
4411 }
4412 4411
4413 /* as we've just preallocated more space than 4412 /* as we've just preallocated more space than
4414 * user requested orinally, we store allocated 4413 * user requested originally, we store allocated
4415 * space in a special descriptor */ 4414 * space in a special descriptor */
4416 if (ac->ac_status == AC_STATUS_FOUND && 4415 if (ac->ac_status == AC_STATUS_FOUND &&
4417 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4416 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4418 ext4_mb_new_preallocation(ac); 4417 *errp = ext4_mb_new_preallocation(ac);
4418 if (*errp) {
4419 discard_and_exit:
4420 ext4_discard_allocated_blocks(ac);
4421 goto errout;
4422 }
4419 } 4423 }
4420 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4424 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4421 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 4425 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4612 BUG_ON(bh && (count > 1)); 4616 BUG_ON(bh && (count > 1));
4613 4617
4614 for (i = 0; i < count; i++) { 4618 for (i = 0; i < count; i++) {
4619 cond_resched();
4615 if (!bh) 4620 if (!bh)
4616 tbh = sb_find_get_block(inode->i_sb, 4621 tbh = sb_find_get_block(inode->i_sb,
4617 block + i); 4622 block + i);
4618 if (unlikely(!tbh)) 4623 if (!tbh)
4619 continue; 4624 continue;
4620 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4625 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4621 inode, tbh, block + i); 4626 inode, tbh, block + i);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf364022f..e86dddbd8296 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
912 struct page *pagep[2] = {NULL, NULL}; 912 struct page *pagep[2] = {NULL, NULL};
913 handle_t *handle; 913 handle_t *handle;
914 ext4_lblk_t orig_blk_offset; 914 ext4_lblk_t orig_blk_offset;
915 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
916 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 915 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
917 unsigned int w_flags = 0; 916 unsigned int w_flags = 0;
918 unsigned int tmp_data_size, data_size, replaced_size; 917 unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@ again:
940 orig_blk_offset = orig_page_offset * blocks_per_page + 939 orig_blk_offset = orig_page_offset * blocks_per_page +
941 data_offset_in_page; 940 data_offset_in_page;
942 941
943 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
944
945 /* Calculate data_size */ 942 /* Calculate data_size */
946 if ((orig_blk_offset + block_len_in_page - 1) == 943 if ((orig_blk_offset + block_len_in_page - 1) ==
947 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 944 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6653fc35ecb7..ab2f6dc44b3a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
918 bh->b_data, bh->b_size, 918 bh->b_data, bh->b_size,
919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
920 + ((char *)de - bh->b_data))) { 920 + ((char *)de - bh->b_data))) {
921 /* On error, skip the f_pos to the next block. */ 921 /* silently ignore the rest of the block */
922 dir_file->f_pos = (dir_file->f_pos | 922 break;
923 (dir->i_sb->s_blocksize - 1)) + 1;
924 brelse(bh);
925 return count;
926 } 923 }
927 ext4fs_dirhash(de->name, de->name_len, hinfo); 924 ext4fs_dirhash(de->name, de->name_len, hinfo);
928 if ((hinfo->hash < start_hash) || 925 if ((hinfo->hash < start_hash) ||
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4acf1f78881b..48786cdb5e6c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -46,46 +46,121 @@ void ext4_exit_pageio(void)
46} 46}
47 47
48/* 48/*
49 * This function is called by ext4_evict_inode() to make sure there is 49 * Print an buffer I/O error compatible with the fs/buffer.c. This
50 * no more pending I/O completion work left to do. 50 * provides compatibility with dmesg scrapers that look for a specific
51 * buffer I/O error message. We really need a unified error reporting
52 * structure to userspace ala Digital Unix's uerf system, but it's
53 * probably not going to happen in my lifetime, due to LKML politics...
51 */ 54 */
52void ext4_ioend_shutdown(struct inode *inode) 55static void buffer_io_error(struct buffer_head *bh)
53{ 56{
54 wait_queue_head_t *wq = ext4_ioend_wq(inode); 57 char b[BDEVNAME_SIZE];
58 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
59 bdevname(bh->b_bdev, b),
60 (unsigned long long)bh->b_blocknr);
61}
55 62
56 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 63static void ext4_finish_bio(struct bio *bio)
57 /* 64{
58 * We need to make sure the work structure is finished being 65 int i;
59 * used before we let the inode get destroyed. 66 int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
60 */ 67
61 if (work_pending(&EXT4_I(inode)->i_unwritten_work)) 68 for (i = 0; i < bio->bi_vcnt; i++) {
62 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 69 struct bio_vec *bvec = &bio->bi_io_vec[i];
70 struct page *page = bvec->bv_page;
71 struct buffer_head *bh, *head;
72 unsigned bio_start = bvec->bv_offset;
73 unsigned bio_end = bio_start + bvec->bv_len;
74 unsigned under_io = 0;
75 unsigned long flags;
76
77 if (!page)
78 continue;
79
80 if (error) {
81 SetPageError(page);
82 set_bit(AS_EIO, &page->mapping->flags);
83 }
84 bh = head = page_buffers(page);
85 /*
86 * We check all buffers in the page under BH_Uptodate_Lock
87 * to avoid races with other end io clearing async_write flags
88 */
89 local_irq_save(flags);
90 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
91 do {
92 if (bh_offset(bh) < bio_start ||
93 bh_offset(bh) + bh->b_size > bio_end) {
94 if (buffer_async_write(bh))
95 under_io++;
96 continue;
97 }
98 clear_buffer_async_write(bh);
99 if (error)
100 buffer_io_error(bh);
101 } while ((bh = bh->b_this_page) != head);
102 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
103 local_irq_restore(flags);
104 if (!under_io)
105 end_page_writeback(page);
106 }
63} 107}
64 108
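
ext4_finish_bio() above takes over the per-page bookkeeping that used to sit in ext4_end_bio(): for each bio_vec it walks the page's circular buffer ring, clears async_write on the buffers this bio covered, and ends page writeback only if no other buffer on the page is still in flight. The walk in miniature, over a toy two-buffer ring rather than real buffer_heads:

#include <stdbool.h>
#include <stdio.h>

struct buf {				/* toy stand-in for buffer_head */
	unsigned offset, size;
	bool async_write;
	struct buf *next;		/* circular, like b_this_page */
};

/* Clear async_write on buffers inside [start, end); report whether the
 * page is now free of writeback (no buffer outside the range in flight). */
static bool finish_range(struct buf *head, unsigned start, unsigned end)
{
	struct buf *b = head;
	unsigned under_io = 0;

	do {
		if (b->offset < start || b->offset + b->size > end) {
			if (b->async_write)
				under_io++;	/* belongs to another bio */
			continue;
		}
		b->async_write = false;		/* this bio finished it */
	} while ((b = b->next) != head);

	return under_io == 0;
}

int main(void)
{
	struct buf b2 = { 1024, 1024, true, NULL };
	struct buf b1 = { 0, 1024, true, &b2 };

	b2.next = &b1;				/* close the ring */
	/* Our bio covered only the first 1k, so writeback must stay set. */
	printf("end_page_writeback? %d\n", finish_range(&b1, 0, 1024));
	return 0;
}
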
65void ext4_free_io_end(ext4_io_end_t *io) 109static void ext4_release_io_end(ext4_io_end_t *io_end)
66{ 110{
67 BUG_ON(!io); 111 struct bio *bio, *next_bio;
68 BUG_ON(!list_empty(&io->list)); 112
69 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); 113 BUG_ON(!list_empty(&io_end->list));
114 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
115 WARN_ON(io_end->handle);
70 116
71 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) 117 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
72 wake_up_all(ext4_ioend_wq(io->inode)); 118 wake_up_all(ext4_ioend_wq(io_end->inode));
73 kmem_cache_free(io_end_cachep, io); 119
120 for (bio = io_end->bio; bio; bio = next_bio) {
121 next_bio = bio->bi_private;
122 ext4_finish_bio(bio);
123 bio_put(bio);
124 }
125 if (io_end->flag & EXT4_IO_END_DIRECT)
126 inode_dio_done(io_end->inode);
127 if (io_end->iocb)
128 aio_complete(io_end->iocb, io_end->result, 0);
129 kmem_cache_free(io_end_cachep, io_end);
74} 130}
75 131
76/* check a range of space and convert unwritten extents to written. */ 132static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
133{
134 struct inode *inode = io_end->inode;
135
136 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
137 /* Wake up anyone waiting on unwritten extent conversion */
138 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
139 wake_up_all(ext4_ioend_wq(inode));
140}
141
142/*
143 * Check a range of space and convert unwritten extents to written. Note that
144 * we are protected from truncate touching same part of extent tree by the
145 * fact that truncate code waits for all DIO to finish (thus exclusion from
146 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
147 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
148 * completed (happens from ext4_free_ioend()).
149 */
77static int ext4_end_io(ext4_io_end_t *io) 150static int ext4_end_io(ext4_io_end_t *io)
78{ 151{
79 struct inode *inode = io->inode; 152 struct inode *inode = io->inode;
80 loff_t offset = io->offset; 153 loff_t offset = io->offset;
81 ssize_t size = io->size; 154 ssize_t size = io->size;
155 handle_t *handle = io->handle;
82 int ret = 0; 156 int ret = 0;
83 157
84 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 158 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
85 "list->prev 0x%p\n", 159 "list->prev 0x%p\n",
86 io, inode->i_ino, io->list.next, io->list.prev); 160 io, inode->i_ino, io->list.next, io->list.prev);
87 161
88 ret = ext4_convert_unwritten_extents(inode, offset, size); 162 io->handle = NULL; /* Following call will use up the handle */
163 ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
89 if (ret < 0) { 164 if (ret < 0) {
90 ext4_msg(inode->i_sb, KERN_EMERG, 165 ext4_msg(inode->i_sb, KERN_EMERG,
91 "failed to convert unwritten extents to written " 166 "failed to convert unwritten extents to written "
@@ -93,30 +168,22 @@ static int ext4_end_io(ext4_io_end_t *io)
93 "(inode %lu, offset %llu, size %zd, error %d)", 168 "(inode %lu, offset %llu, size %zd, error %d)",
94 inode->i_ino, offset, size, ret); 169 inode->i_ino, offset, size, ret);
95 } 170 }
96 /* Wake up anyone waiting on unwritten extent conversion */ 171 ext4_clear_io_unwritten_flag(io);
97 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 172 ext4_release_io_end(io);
98 wake_up_all(ext4_ioend_wq(inode));
99 if (io->flag & EXT4_IO_END_DIRECT)
100 inode_dio_done(inode);
101 if (io->iocb)
102 aio_complete(io->iocb, io->result, 0);
103 return ret; 173 return ret;
104} 174}
105 175
106static void dump_completed_IO(struct inode *inode) 176static void dump_completed_IO(struct inode *inode, struct list_head *head)
107{ 177{
108#ifdef EXT4FS_DEBUG 178#ifdef EXT4FS_DEBUG
109 struct list_head *cur, *before, *after; 179 struct list_head *cur, *before, *after;
110 ext4_io_end_t *io, *io0, *io1; 180 ext4_io_end_t *io, *io0, *io1;
111 181
112 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { 182 if (list_empty(head))
113 ext4_debug("inode %lu completed_io list is empty\n",
114 inode->i_ino);
115 return; 183 return;
116 }
117 184
118 ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); 185 ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
119 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { 186 list_for_each_entry(io, head, list) {
120 cur = &io->list; 187 cur = &io->list;
121 before = cur->prev; 188 before = cur->prev;
122 io0 = container_of(before, ext4_io_end_t, list); 189 io0 = container_of(before, ext4_io_end_t, list);
@@ -130,23 +197,30 @@ static void dump_completed_IO(struct inode *inode)
130} 197}
131 198
132/* Add the io_end to per-inode completed end_io list. */ 199/* Add the io_end to per-inode completed end_io list. */
133void ext4_add_complete_io(ext4_io_end_t *io_end) 200static void ext4_add_complete_io(ext4_io_end_t *io_end)
134{ 201{
135 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 202 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
136 struct workqueue_struct *wq; 203 struct workqueue_struct *wq;
137 unsigned long flags; 204 unsigned long flags;
138 205
139 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 206 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
140 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
141
142 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 207 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
143 if (list_empty(&ei->i_completed_io_list)) 208 if (io_end->handle) {
144 queue_work(wq, &ei->i_unwritten_work); 209 wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
145 list_add_tail(&io_end->list, &ei->i_completed_io_list); 210 if (list_empty(&ei->i_rsv_conversion_list))
211 queue_work(wq, &ei->i_rsv_conversion_work);
212 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
213 } else {
214 wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
215 if (list_empty(&ei->i_unrsv_conversion_list))
216 queue_work(wq, &ei->i_unrsv_conversion_work);
217 list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
218 }
146 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 219 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
147} 220}
148 221
149static int ext4_do_flush_completed_IO(struct inode *inode) 222static int ext4_do_flush_completed_IO(struct inode *inode,
223 struct list_head *head)
150{ 224{
151 ext4_io_end_t *io; 225 ext4_io_end_t *io;
152 struct list_head unwritten; 226 struct list_head unwritten;
@@ -155,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
155 int err, ret = 0; 229 int err, ret = 0;
156 230
157 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 231 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
158 dump_completed_IO(inode); 232 dump_completed_IO(inode, head);
159 list_replace_init(&ei->i_completed_io_list, &unwritten); 233 list_replace_init(head, &unwritten);
160 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 234 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
161 235
162 while (!list_empty(&unwritten)) { 236 while (!list_empty(&unwritten)) {
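
The locking idiom here is unchanged even though the list is now passed in: splice everything to a private head while holding the spinlock, then run the expensive conversions with the lock dropped. The splice step in userspace form, a pthread mutex standing in for the irq-safe spinlock and a singly linked list for list_head:

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *completed;		/* shared completed-io list */

static void flush_completed(void)
{
	struct node *local;

	pthread_mutex_lock(&lock);	/* kernel: spin_lock_irqsave() */
	local = completed;		/* kernel: list_replace_init() */
	completed = NULL;
	pthread_mutex_unlock(&lock);

	for (; local; local = local->next)	/* lock no longer held */
		printf("convert io_end %d\n", local->id);
}

int main(void)
{
	struct node n2 = { NULL, 2 }, n1 = { &n2, 1 };

	completed = &n1;
	flush_completed();
	return 0;
}
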
@@ -167,30 +241,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
167 err = ext4_end_io(io); 241 err = ext4_end_io(io);
168 if (unlikely(!ret && err)) 242 if (unlikely(!ret && err))
169 ret = err; 243 ret = err;
170 io->flag &= ~EXT4_IO_END_UNWRITTEN;
171 ext4_free_io_end(io);
172 } 244 }
173 return ret; 245 return ret;
174} 246}
175 247
176/* 248/*
177 * work on completed aio dio IO, to convert unwritten extents to extents 249 * work on completed IO, to convert unwritten extents to extents
178 */ 250 */
179void ext4_end_io_work(struct work_struct *work) 251void ext4_end_io_rsv_work(struct work_struct *work)
180{ 252{
181 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 253 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
182 i_unwritten_work); 254 i_rsv_conversion_work);
183 ext4_do_flush_completed_IO(&ei->vfs_inode); 255 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
184} 256}
185 257
186int ext4_flush_unwritten_io(struct inode *inode) 258void ext4_end_io_unrsv_work(struct work_struct *work)
187{ 259{
188 int ret; 260 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
189 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && 261 i_unrsv_conversion_work);
190 !(inode->i_state & I_FREEING)); 262 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
191 ret = ext4_do_flush_completed_IO(inode);
192 ext4_unwritten_wait(inode);
193 return ret;
194} 263}
195 264
196ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 265ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -200,83 +269,70 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
200 atomic_inc(&EXT4_I(inode)->i_ioend_count); 269 atomic_inc(&EXT4_I(inode)->i_ioend_count);
201 io->inode = inode; 270 io->inode = inode;
202 INIT_LIST_HEAD(&io->list); 271 INIT_LIST_HEAD(&io->list);
272 atomic_set(&io->count, 1);
203 } 273 }
204 return io; 274 return io;
205} 275}
206 276
207/* 277void ext4_put_io_end_defer(ext4_io_end_t *io_end)
208 * Print a buffer I/O error compatible with fs/buffer.c. This
209 * provides compatibility with dmesg scrapers that look for a specific
210 * buffer I/O error message. We really need a unified error reporting
211 * structure to userspace ala Digital Unix's uerf system, but it's
212 * probably not going to happen in my lifetime, due to LKML politics...
213 */
214static void buffer_io_error(struct buffer_head *bh)
215{ 278{
216 char b[BDEVNAME_SIZE]; 279 if (atomic_dec_and_test(&io_end->count)) {
217 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", 280 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
218 bdevname(bh->b_bdev, b), 281 ext4_release_io_end(io_end);
219 (unsigned long long)bh->b_blocknr); 282 return;
283 }
284 ext4_add_complete_io(io_end);
285 }
286}
287
288int ext4_put_io_end(ext4_io_end_t *io_end)
289{
290 int err = 0;
291
292 if (atomic_dec_and_test(&io_end->count)) {
293 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
294 err = ext4_convert_unwritten_extents(io_end->handle,
295 io_end->inode, io_end->offset,
296 io_end->size);
297 io_end->handle = NULL;
298 ext4_clear_io_unwritten_flag(io_end);
299 }
300 ext4_release_io_end(io_end);
301 }
302 return err;
303}
304
305ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
306{
307 atomic_inc(&io_end->count);
308 return io_end;
220} 309}
221 310
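
The count field initialized above is the pivot of the rework: the submitter holds the initial reference, every bio takes another through ext4_get_io_end(), and whoever drops the last one either frees the io_end or hands it to the conversion worker. A compressed model of that lifecycle in C11 atomics (invented types; the real put path also special-cases a zero-sized io_end):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct io_end {
	atomic_int count;
	bool unwritten;		/* extents still need conversion? */
};

static void release(struct io_end *io)          { puts("release io_end"); }
static void queue_conversion(struct io_end *io) { puts("queue conversion"); }

static struct io_end *get(struct io_end *io)
{
	atomic_fetch_add(&io->count, 1);
	return io;
}

static void put_defer(struct io_end *io)
{
	if (atomic_fetch_sub(&io->count, 1) == 1) {	/* last reference */
		if (io->unwritten)
			queue_conversion(io);	/* finish in a worker */
		else
			release(io);
	}
}

int main(void)
{
	struct io_end io = { .unwritten = true };

	atomic_init(&io.count, 1);	/* submitter's reference */
	put_defer(get(&io));		/* a bio completes */
	put_defer(&io);			/* submitter drops its reference */
	return 0;
}
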
222static void ext4_end_bio(struct bio *bio, int error) 311static void ext4_end_bio(struct bio *bio, int error)
223{ 312{
224 ext4_io_end_t *io_end = bio->bi_private; 313 ext4_io_end_t *io_end = bio->bi_private;
225 struct inode *inode;
226 int i;
227 int blocksize;
228 sector_t bi_sector = bio->bi_sector; 314 sector_t bi_sector = bio->bi_sector;
229 315
230 BUG_ON(!io_end); 316 BUG_ON(!io_end);
231 inode = io_end->inode;
232 blocksize = 1 << inode->i_blkbits;
233 bio->bi_private = NULL;
234 bio->bi_end_io = NULL; 317 bio->bi_end_io = NULL;
235 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 318 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
236 error = 0; 319 error = 0;
237 for (i = 0; i < bio->bi_vcnt; i++) {
238 struct bio_vec *bvec = &bio->bi_io_vec[i];
239 struct page *page = bvec->bv_page;
240 struct buffer_head *bh, *head;
241 unsigned bio_start = bvec->bv_offset;
242 unsigned bio_end = bio_start + bvec->bv_len;
243 unsigned under_io = 0;
244 unsigned long flags;
245 320
246 if (!page) 321 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
247 continue;
248
249 if (error) {
250 SetPageError(page);
251 set_bit(AS_EIO, &page->mapping->flags);
252 }
253 bh = head = page_buffers(page);
254 /* 322 /*
255 * We check all buffers in the page under BH_Uptodate_Lock 323 * Link bio into list hanging from io_end. We have to do it
256 * to avoid races with other end io clearing async_write flags 324 * atomically as bio completions can be racing against each
325 * other.
257 */ 326 */
258 local_irq_save(flags); 327 bio->bi_private = xchg(&io_end->bio, bio);
259 bit_spin_lock(BH_Uptodate_Lock, &head->b_state); 328 } else {
260 do { 329 ext4_finish_bio(bio);
261 if (bh_offset(bh) < bio_start || 330 bio_put(bio);
262 bh_offset(bh) + blocksize > bio_end) {
263 if (buffer_async_write(bh))
264 under_io++;
265 continue;
266 }
267 clear_buffer_async_write(bh);
268 if (error)
269 buffer_io_error(bh);
270 } while ((bh = bh->b_this_page) != head);
271 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
272 local_irq_restore(flags);
273 if (!under_io)
274 end_page_writeback(page);
275 } 331 }
276 bio_put(bio);
277 332
278 if (error) { 333 if (error) {
279 io_end->flag |= EXT4_IO_END_ERROR; 334 struct inode *inode = io_end->inode;
335
280 ext4_warning(inode->i_sb, "I/O error writing to inode %lu " 336 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
281 "(offset %llu size %ld starting block %llu)", 337 "(offset %llu size %ld starting block %llu)",
282 inode->i_ino, 338 inode->i_ino,
@@ -285,13 +341,7 @@ static void ext4_end_bio(struct bio *bio, int error)
285 (unsigned long long) 341 (unsigned long long)
286 bi_sector >> (inode->i_blkbits - 9)); 342 bi_sector >> (inode->i_blkbits - 9));
287 } 343 }
288 344 ext4_put_io_end_defer(io_end);
289 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
290 ext4_free_io_end(io_end);
291 return;
292 }
293
294 ext4_add_complete_io(io_end);
295} 345}
296 346
297void ext4_io_submit(struct ext4_io_submit *io) 347void ext4_io_submit(struct ext4_io_submit *io)
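
In the reworked ext4_end_bio() above, completed bios are chained onto the io_end through bi_private with a single atomic exchange, a lock-free stack push that tolerates completions racing on different CPUs. The idiom isolated in C11 (toy types; the kernel spells the exchange xchg()):

#include <stdatomic.h>
#include <stdio.h>

struct bio { struct bio *next; int id; };	/* next plays bi_private */

static _Atomic(struct bio *) io_end_bios;	/* list hung off the io_end */

static void push_bio(struct bio *b)
{
	/* Swing the head to b and link b to the old head. Safe here even
	 * though the two steps aren't one atom, because the list is only
	 * walked after the last io_end reference is dropped. */
	b->next = atomic_exchange(&io_end_bios, b);
}

int main(void)
{
	struct bio a = { NULL, 1 }, b = { NULL, 2 };

	push_bio(&a);
	push_bio(&b);
	for (struct bio *p = atomic_load(&io_end_bios); p; p = p->next)
		printf("bio %d\n", p->id);	/* drained at release time */
	return 0;
}
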
@@ -305,43 +355,38 @@ void ext4_io_submit(struct ext4_io_submit *io)
305 bio_put(io->io_bio); 355 bio_put(io->io_bio);
306 } 356 }
307 io->io_bio = NULL; 357 io->io_bio = NULL;
308 io->io_op = 0; 358}
359
360void ext4_io_submit_init(struct ext4_io_submit *io,
361 struct writeback_control *wbc)
362{
363 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
364 io->io_bio = NULL;
309 io->io_end = NULL; 365 io->io_end = NULL;
310} 366}
311 367
312static int io_submit_init(struct ext4_io_submit *io, 368static int io_submit_init_bio(struct ext4_io_submit *io,
313 struct inode *inode, 369 struct buffer_head *bh)
314 struct writeback_control *wbc,
315 struct buffer_head *bh)
316{ 370{
317 ext4_io_end_t *io_end;
318 struct page *page = bh->b_page;
319 int nvecs = bio_get_nr_vecs(bh->b_bdev); 371 int nvecs = bio_get_nr_vecs(bh->b_bdev);
320 struct bio *bio; 372 struct bio *bio;
321 373
322 io_end = ext4_init_io_end(inode, GFP_NOFS);
323 if (!io_end)
324 return -ENOMEM;
325 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 374 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
375 if (!bio)
376 return -ENOMEM;
326 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 377 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
327 bio->bi_bdev = bh->b_bdev; 378 bio->bi_bdev = bh->b_bdev;
328 bio->bi_private = io->io_end = io_end;
329 bio->bi_end_io = ext4_end_bio; 379 bio->bi_end_io = ext4_end_bio;
330 380 bio->bi_private = ext4_get_io_end(io->io_end);
331 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
332
333 io->io_bio = bio; 381 io->io_bio = bio;
334 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
335 io->io_next_block = bh->b_blocknr; 382 io->io_next_block = bh->b_blocknr;
336 return 0; 383 return 0;
337} 384}
338 385
339static int io_submit_add_bh(struct ext4_io_submit *io, 386static int io_submit_add_bh(struct ext4_io_submit *io,
340 struct inode *inode, 387 struct inode *inode,
341 struct writeback_control *wbc,
342 struct buffer_head *bh) 388 struct buffer_head *bh)
343{ 389{
344 ext4_io_end_t *io_end;
345 int ret; 390 int ret;
346 391
347 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 392 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -349,18 +394,14 @@ submit_and_retry:
349 ext4_io_submit(io); 394 ext4_io_submit(io);
350 } 395 }
351 if (io->io_bio == NULL) { 396 if (io->io_bio == NULL) {
352 ret = io_submit_init(io, inode, wbc, bh); 397 ret = io_submit_init_bio(io, bh);
353 if (ret) 398 if (ret)
354 return ret; 399 return ret;
355 } 400 }
356 io_end = io->io_end;
357 if (test_clear_buffer_uninit(bh))
358 ext4_set_io_unwritten_flag(inode, io_end);
359 io->io_end->size += bh->b_size;
360 io->io_next_block++;
361 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 401 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (ret != bh->b_size) 402 if (ret != bh->b_size)
363 goto submit_and_retry; 403 goto submit_and_retry;
404 io->io_next_block++;
364 return 0; 405 return 0;
365} 406}
366 407
@@ -432,7 +473,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
432 do { 473 do {
433 if (!buffer_async_write(bh)) 474 if (!buffer_async_write(bh))
434 continue; 475 continue;
435 ret = io_submit_add_bh(io, inode, wbc, bh); 476 ret = io_submit_add_bh(io, inode, bh);
436 if (ret) { 477 if (ret) {
437 /* 478 /*
438 * We only get here on ENOMEM. Not much else 479 * We only get here on ENOMEM. Not much else
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d01965..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
79 ext4_fsblk_t end = start + input->blocks_count; 79 ext4_fsblk_t end = start + input->blocks_count;
80 ext4_group_t group = input->group; 80 ext4_group_t group = input->group;
81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
82 unsigned overhead = ext4_group_overhead_blocks(sb, group); 82 unsigned overhead;
83 ext4_fsblk_t metaend = start + overhead; 83 ext4_fsblk_t metaend;
84 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
85 ext4_grpblk_t free_blocks_count, offset; 85 ext4_grpblk_t free_blocks_count, offset;
86 int err = -EINVAL; 86 int err = -EINVAL;
87 87
88 if (group != sbi->s_groups_count) {
89 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
90 input->group, sbi->s_groups_count);
91 return -EINVAL;
92 }
93
94 overhead = ext4_group_overhead_blocks(sb, group);
95 metaend = start + overhead;
88 input->free_blocks_count = free_blocks_count = 96 input->free_blocks_count = free_blocks_count =
89 input->blocks_count - 2 - overhead - sbi->s_itb_per_group; 97 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
90 98
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
96 free_blocks_count, input->reserved_blocks); 104 free_blocks_count, input->reserved_blocks);
97 105
98 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 106 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
99 if (group != sbi->s_groups_count) 107 if (offset != 0)
100 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
101 input->group, sbi->s_groups_count);
102 else if (offset != 0)
103 ext4_warning(sb, "Last group not full"); 108 ext4_warning(sb, "Last group not full");
104 else if (input->reserved_blocks > input->blocks_count / 5) 109 else if (input->reserved_blocks > input->blocks_count / 5)
105 ext4_warning(sb, "Reserved blocks too high (%u)", 110 ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
1551 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 1556 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
1552 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1557 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
1553 struct inode *inode = NULL; 1558 struct inode *inode = NULL;
1554 int gdb_off, gdb_num; 1559 int gdb_off;
1555 int err; 1560 int err;
1556 __u16 bg_flags = 0; 1561 __u16 bg_flags = 0;
1557 1562
1558 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
1559 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); 1563 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
1560 1564
1561 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 1565 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@ errout:
1656 err = err2; 1660 err = err2;
1657 1661
1658 if (!err) { 1662 if (!err) {
1659 ext4_fsblk_t first_block;
1660 first_block = ext4_group_first_block_no(sb, 0);
1661 if (test_opt(sb, DEBUG)) 1663 if (test_opt(sb, DEBUG))
1662 printk(KERN_DEBUG "EXT4-fs: extended group to %llu " 1664 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1663 "blocks\n", ext4_blocks_count(es)); 1665 "blocks\n", ext4_blocks_count(es));
1664 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, 1666 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
1665 (char *)es, sizeof(struct ext4_super_block), 0); 1667 (char *)es, sizeof(struct ext4_super_block), 0);
1666 } 1668 }
1667 return err; 1669 return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84db7c9a..85b3dd60169b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
69static void ext4_clear_journal_err(struct super_block *sb, 69static void ext4_clear_journal_err(struct super_block *sb,
70 struct ext4_super_block *es); 70 struct ext4_super_block *es);
71static int ext4_sync_fs(struct super_block *sb, int wait); 71static int ext4_sync_fs(struct super_block *sb, int wait);
72static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
72static int ext4_remount(struct super_block *sb, int *flags, char *data); 73static int ext4_remount(struct super_block *sb, int *flags, char *data);
73static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 74static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
74static int ext4_unfreeze(struct super_block *sb); 75static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
398 } 399 }
399 if (test_opt(sb, ERRORS_RO)) { 400 if (test_opt(sb, ERRORS_RO)) {
400 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 401 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
402 /*
403 * Make sure updated value of ->s_mount_flags will be visible
404 * before ->s_flags update
405 */
406 smp_wmb();
401 sb->s_flags |= MS_RDONLY; 407 sb->s_flags |= MS_RDONLY;
402 } 408 }
403 if (test_opt(sb, ERRORS_PANIC)) 409 if (test_opt(sb, ERRORS_PANIC))
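
The smp_wmb() added above enforces store order: any CPU that observes MS_RDONLY in s_flags must also observe the abort bit in s_mount_flags, which a reader pairs with a read barrier. The pairing modeled with C11 fences (field names borrowed from the diff; run single-threaded here just to show the shape):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int mount_flags;	/* models ->s_mount_flags */
static atomic_int s_flags;	/* models ->s_flags */

static void error_path(void)	/* writer side */
{
	atomic_store_explicit(&mount_flags, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */
	atomic_store_explicit(&s_flags, 1, memory_order_relaxed);
}

static void reader(void)	/* pairs with an smp_rmb() */
{
	if (atomic_load_explicit(&s_flags, memory_order_relaxed)) {
		atomic_thread_fence(memory_order_acquire);
		/* ordering guarantees the abort bit is visible now */
		printf("aborted = %d\n",
		       atomic_load_explicit(&mount_flags,
					    memory_order_relaxed));
	}
}

int main(void)
{
	error_path();
	reader();
	return 0;
}
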
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
422 ext4_handle_error(sb); 428 ext4_handle_error(sb);
423} 429}
424 430
425void ext4_error_inode(struct inode *inode, const char *function, 431void __ext4_error_inode(struct inode *inode, const char *function,
426 unsigned int line, ext4_fsblk_t block, 432 unsigned int line, ext4_fsblk_t block,
427 const char *fmt, ...) 433 const char *fmt, ...)
428{ 434{
429 va_list args; 435 va_list args;
430 struct va_format vaf; 436 struct va_format vaf;
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
451 ext4_handle_error(inode->i_sb); 457 ext4_handle_error(inode->i_sb);
452} 458}
453 459
454void ext4_error_file(struct file *file, const char *function, 460void __ext4_error_file(struct file *file, const char *function,
455 unsigned int line, ext4_fsblk_t block, 461 unsigned int line, ext4_fsblk_t block,
456 const char *fmt, ...) 462 const char *fmt, ...)
457{ 463{
458 va_list args; 464 va_list args;
459 struct va_format vaf; 465 struct va_format vaf;
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
570 576
571 if ((sb->s_flags & MS_RDONLY) == 0) { 577 if ((sb->s_flags & MS_RDONLY) == 0) {
572 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 578 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
573 sb->s_flags |= MS_RDONLY;
574 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; 579 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
580 /*
581 * Make sure updated value of ->s_mount_flags will be visible
582 * before ->s_flags update
583 */
584 smp_wmb();
585 sb->s_flags |= MS_RDONLY;
575 if (EXT4_SB(sb)->s_journal) 586 if (EXT4_SB(sb)->s_journal)
576 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); 587 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
577 save_error_info(sb, function, line); 588 save_error_info(sb, function, line);
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
580 panic("EXT4-fs panic from previous error\n"); 591 panic("EXT4-fs panic from previous error\n");
581} 592}
582 593
583void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 594void __ext4_msg(struct super_block *sb,
595 const char *prefix, const char *fmt, ...)
584{ 596{
585 struct va_format vaf; 597 struct va_format vaf;
586 va_list args; 598 va_list args;
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
750 ext4_unregister_li_request(sb); 762 ext4_unregister_li_request(sb);
751 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 763 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
752 764
753 flush_workqueue(sbi->dio_unwritten_wq); 765 flush_workqueue(sbi->unrsv_conversion_wq);
754 destroy_workqueue(sbi->dio_unwritten_wq); 766 flush_workqueue(sbi->rsv_conversion_wq);
767 destroy_workqueue(sbi->unrsv_conversion_wq);
768 destroy_workqueue(sbi->rsv_conversion_wq);
755 769
756 if (sbi->s_journal) { 770 if (sbi->s_journal) {
757 err = jbd2_journal_destroy(sbi->s_journal); 771 err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
760 ext4_abort(sb, "Couldn't clean up the journal"); 774 ext4_abort(sb, "Couldn't clean up the journal");
761 } 775 }
762 776
763 ext4_es_unregister_shrinker(sb); 777 ext4_es_unregister_shrinker(sbi);
764 del_timer(&sbi->s_err_report); 778 del_timer(&sbi->s_err_report);
765 ext4_release_system_zone(sb); 779 ext4_release_system_zone(sb);
766 ext4_mb_release(sb); 780 ext4_mb_release(sb);
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
849 rwlock_init(&ei->i_es_lock); 863 rwlock_init(&ei->i_es_lock);
850 INIT_LIST_HEAD(&ei->i_es_lru); 864 INIT_LIST_HEAD(&ei->i_es_lru);
851 ei->i_es_lru_nr = 0; 865 ei->i_es_lru_nr = 0;
866 ei->i_touch_when = 0;
852 ei->i_reserved_data_blocks = 0; 867 ei->i_reserved_data_blocks = 0;
853 ei->i_reserved_meta_blocks = 0; 868 ei->i_reserved_meta_blocks = 0;
854 ei->i_allocated_meta_blocks = 0; 869 ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
859 ei->i_reserved_quota = 0; 874 ei->i_reserved_quota = 0;
860#endif 875#endif
861 ei->jinode = NULL; 876 ei->jinode = NULL;
862 INIT_LIST_HEAD(&ei->i_completed_io_list); 877 INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
878 INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
863 spin_lock_init(&ei->i_completed_io_lock); 879 spin_lock_init(&ei->i_completed_io_lock);
864 ei->i_sync_tid = 0; 880 ei->i_sync_tid = 0;
865 ei->i_datasync_tid = 0; 881 ei->i_datasync_tid = 0;
866 atomic_set(&ei->i_ioend_count, 0); 882 atomic_set(&ei->i_ioend_count, 0);
867 atomic_set(&ei->i_unwritten, 0); 883 atomic_set(&ei->i_unwritten, 0);
868 INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); 884 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
885 INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
869 886
870 return &ei->vfs_inode; 887 return &ei->vfs_inode;
871} 888}
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
1093 .dirty_inode = ext4_dirty_inode, 1110 .dirty_inode = ext4_dirty_inode,
1094 .drop_inode = ext4_drop_inode, 1111 .drop_inode = ext4_drop_inode,
1095 .evict_inode = ext4_evict_inode, 1112 .evict_inode = ext4_evict_inode,
1113 .sync_fs = ext4_sync_fs_nojournal,
1096 .put_super = ext4_put_super, 1114 .put_super = ext4_put_super,
1097 .statfs = ext4_statfs, 1115 .statfs = ext4_statfs,
1098 .remount_fs = ext4_remount, 1116 .remount_fs = ext4_remount,
@@ -1908,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1908 struct ext4_sb_info *sbi = EXT4_SB(sb); 1926 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 struct ext4_group_desc *gdp = NULL; 1927 struct ext4_group_desc *gdp = NULL;
1910 ext4_group_t flex_group; 1928 ext4_group_t flex_group;
1911 unsigned int groups_per_flex = 0;
1912 int i, err; 1929 int i, err;
1913 1930
1914 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1931 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1916 sbi->s_log_groups_per_flex = 0; 1933 sbi->s_log_groups_per_flex = 0;
1917 return 1; 1934 return 1;
1918 } 1935 }
1919 groups_per_flex = 1U << sbi->s_log_groups_per_flex;
1920 1936
1921 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); 1937 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
1922 if (err) 1938 if (err)
@@ -2164,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2164 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2180 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2165 dquot_initialize(inode); 2181 dquot_initialize(inode);
2166 if (inode->i_nlink) { 2182 if (inode->i_nlink) {
2167 ext4_msg(sb, KERN_DEBUG, 2183 if (test_opt(sb, DEBUG))
2168 "%s: truncating inode %lu to %lld bytes", 2184 ext4_msg(sb, KERN_DEBUG,
2169 __func__, inode->i_ino, inode->i_size); 2185 "%s: truncating inode %lu to %lld bytes",
2186 __func__, inode->i_ino, inode->i_size);
2170 jbd_debug(2, "truncating inode %lu to %lld bytes\n", 2187 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2171 inode->i_ino, inode->i_size); 2188 inode->i_ino, inode->i_size);
2172 mutex_lock(&inode->i_mutex); 2189 mutex_lock(&inode->i_mutex);
2190 truncate_inode_pages(inode->i_mapping, inode->i_size);
2173 ext4_truncate(inode); 2191 ext4_truncate(inode);
2174 mutex_unlock(&inode->i_mutex); 2192 mutex_unlock(&inode->i_mutex);
2175 nr_truncates++; 2193 nr_truncates++;
2176 } else { 2194 } else {
2177 ext4_msg(sb, KERN_DEBUG, 2195 if (test_opt(sb, DEBUG))
2178 "%s: deleting unreferenced inode %lu", 2196 ext4_msg(sb, KERN_DEBUG,
2179 __func__, inode->i_ino); 2197 "%s: deleting unreferenced inode %lu",
2198 __func__, inode->i_ino);
2180 jbd_debug(2, "deleting unreferenced inode %lu\n", 2199 jbd_debug(2, "deleting unreferenced inode %lu\n",
2181 inode->i_ino); 2200 inode->i_ino);
2182 nr_orphans++; 2201 nr_orphans++;
@@ -2377,7 +2396,10 @@ struct ext4_attr {
2377 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2396 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2378 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2397 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2379 const char *, size_t); 2398 const char *, size_t);
2380 int offset; 2399 union {
2400 int offset;
2401 int deprecated_val;
2402 } u;
2381}; 2403};
2382 2404
2383static int parse_strtoull(const char *buf, 2405static int parse_strtoull(const char *buf,
@@ -2446,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2446static ssize_t sbi_ui_show(struct ext4_attr *a, 2468static ssize_t sbi_ui_show(struct ext4_attr *a,
2447 struct ext4_sb_info *sbi, char *buf) 2469 struct ext4_sb_info *sbi, char *buf)
2448{ 2470{
2449 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2471 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2450 2472
2451 return snprintf(buf, PAGE_SIZE, "%u\n", *ui); 2473 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2452} 2474}
@@ -2455,7 +2477,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
2455 struct ext4_sb_info *sbi, 2477 struct ext4_sb_info *sbi,
2456 const char *buf, size_t count) 2478 const char *buf, size_t count)
2457{ 2479{
2458 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2480 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2459 unsigned long t; 2481 unsigned long t;
2460 int ret; 2482 int ret;
2461 2483
@@ -2504,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
2504 return count; 2526 return count;
2505} 2527}
2506 2528
2529static ssize_t sbi_deprecated_show(struct ext4_attr *a,
2530 struct ext4_sb_info *sbi, char *buf)
2531{
2532 return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
2533}
2534
2507#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ 2535#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2508static struct ext4_attr ext4_attr_##_name = { \ 2536static struct ext4_attr ext4_attr_##_name = { \
2509 .attr = {.name = __stringify(_name), .mode = _mode }, \ 2537 .attr = {.name = __stringify(_name), .mode = _mode }, \
2510 .show = _show, \ 2538 .show = _show, \
2511 .store = _store, \ 2539 .store = _store, \
2512 .offset = offsetof(struct ext4_sb_info, _elname), \ 2540 .u = { \
2541 .offset = offsetof(struct ext4_sb_info, _elname),\
2542 }, \
2513} 2543}
2514#define EXT4_ATTR(name, mode, show, store) \ 2544#define EXT4_ATTR(name, mode, show, store) \
2515static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2545static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2550,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2520#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2550#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2521 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) 2551 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2522#define ATTR_LIST(name) &ext4_attr_##name.attr 2552#define ATTR_LIST(name) &ext4_attr_##name.attr
2553#define EXT4_DEPRECATED_ATTR(_name, _val) \
2554static struct ext4_attr ext4_attr_##_name = { \
2555 .attr = {.name = __stringify(_name), .mode = 0444 }, \
2556 .show = sbi_deprecated_show, \
2557 .u = { \
2558 .deprecated_val = _val, \
2559 }, \
2560}
2523 2561
2524EXT4_RO_ATTR(delayed_allocation_blocks); 2562EXT4_RO_ATTR(delayed_allocation_blocks);
2525EXT4_RO_ATTR(session_write_kbytes); 2563EXT4_RO_ATTR(session_write_kbytes);
@@ -2534,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2534EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2572EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2535EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2573EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2536EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2574EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2537EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2575EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
2538EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 2576EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
2539EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); 2577EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2540 2578
@@ -3763,7 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3763 sbi->s_err_report.data = (unsigned long) sb; 3801 sbi->s_err_report.data = (unsigned long) sb;
3764 3802
3765 /* Register extent status tree shrinker */ 3803 /* Register extent status tree shrinker */
3766 ext4_es_register_shrinker(sb); 3804 ext4_es_register_shrinker(sbi);
3767 3805
3768 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3806 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3769 ext4_count_free_clusters(sb)); 3807 ext4_count_free_clusters(sb));
@@ -3787,7 +3825,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3787 } 3825 }
3788 3826
3789 sbi->s_stripe = ext4_get_stripe_size(sbi); 3827 sbi->s_stripe = ext4_get_stripe_size(sbi);
3790 sbi->s_max_writeback_mb_bump = 128;
3791 sbi->s_extent_max_zeroout_kb = 32; 3828 sbi->s_extent_max_zeroout_kb = 32;
3792 3829
3793 /* 3830 /*
@@ -3915,12 +3952,20 @@ no_journal:
3915 * The maximum number of concurrent works can be high and 3952 * The maximum number of concurrent works can be high and
3916 * concurrency isn't really necessary. Limit it to 1. 3953 * concurrency isn't really necessary. Limit it to 1.
3917 */ 3954 */
3918 EXT4_SB(sb)->dio_unwritten_wq = 3955 EXT4_SB(sb)->rsv_conversion_wq =
3919 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 3956 alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3920 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3957 if (!EXT4_SB(sb)->rsv_conversion_wq) {
3921 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3958 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3922 ret = -ENOMEM; 3959 ret = -ENOMEM;
3923 goto failed_mount_wq; 3960 goto failed_mount4;
3961 }
3962
3963 EXT4_SB(sb)->unrsv_conversion_wq =
3964 alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3965 if (!EXT4_SB(sb)->unrsv_conversion_wq) {
3966 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3967 ret = -ENOMEM;
3968 goto failed_mount4;
3924 } 3969 }
3925 3970
3926 /* 3971 /*
@@ -4074,14 +4119,17 @@ failed_mount4a:
4074 sb->s_root = NULL; 4119 sb->s_root = NULL;
4075failed_mount4: 4120failed_mount4:
4076 ext4_msg(sb, KERN_ERR, "mount failed"); 4121 ext4_msg(sb, KERN_ERR, "mount failed");
4077 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 4122 if (EXT4_SB(sb)->rsv_conversion_wq)
4123 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4124 if (EXT4_SB(sb)->unrsv_conversion_wq)
4125 destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4078failed_mount_wq: 4126failed_mount_wq:
4079 if (sbi->s_journal) { 4127 if (sbi->s_journal) {
4080 jbd2_journal_destroy(sbi->s_journal); 4128 jbd2_journal_destroy(sbi->s_journal);
4081 sbi->s_journal = NULL; 4129 sbi->s_journal = NULL;
4082 } 4130 }
4083failed_mount3: 4131failed_mount3:
4084 ext4_es_unregister_shrinker(sb); 4132 ext4_es_unregister_shrinker(sbi);
4085 del_timer(&sbi->s_err_report); 4133 del_timer(&sbi->s_err_report);
4086 if (sbi->s_flex_groups) 4134 if (sbi->s_flex_groups)
4087 ext4_kvfree(sbi->s_flex_groups); 4135 ext4_kvfree(sbi->s_flex_groups);
@@ -4517,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4517{ 4565{
4518 int ret = 0; 4566 int ret = 0;
4519 tid_t target; 4567 tid_t target;
4568 bool needs_barrier = false;
4520 struct ext4_sb_info *sbi = EXT4_SB(sb); 4569 struct ext4_sb_info *sbi = EXT4_SB(sb);
4521 4570
4522 trace_ext4_sync_fs(sb, wait); 4571 trace_ext4_sync_fs(sb, wait);
4523 flush_workqueue(sbi->dio_unwritten_wq); 4572 flush_workqueue(sbi->rsv_conversion_wq);
4573 flush_workqueue(sbi->unrsv_conversion_wq);
4524 /* 4574 /*
4525 * Writeback quota in non-journalled quota case - journalled quota has 4575 * Writeback quota in non-journalled quota case - journalled quota has
4526 * no dirty dquots 4576 * no dirty dquots
4527 */ 4577 */
4528 dquot_writeback_dquots(sb, -1); 4578 dquot_writeback_dquots(sb, -1);
4579 /*
4580 * Data writeback is possible w/o journal transaction, so barrier must
 4581 * be sent at the end of the function. But we can skip it if
4582 * transaction_commit will do it for us.
4583 */
4584 target = jbd2_get_latest_transaction(sbi->s_journal);
4585 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
4586 !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
4587 needs_barrier = true;
4588
4529 if (jbd2_journal_start_commit(sbi->s_journal, &target)) { 4589 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4530 if (wait) 4590 if (wait)
4531 jbd2_log_wait_commit(sbi->s_journal, target); 4591 ret = jbd2_log_wait_commit(sbi->s_journal, target);
4592 }
4593 if (needs_barrier) {
4594 int err;
4595 err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4596 if (!ret)
4597 ret = err;
4532 } 4598 }
4599
4600 return ret;
4601}
4602
4603static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
4604{
4605 int ret = 0;
4606
4607 trace_ext4_sync_fs(sb, wait);
4608 flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4609 flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4610 dquot_writeback_dquots(sb, -1);
4611 if (wait && test_opt(sb, BARRIER))
4612 ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4613
4533 return ret; 4614 return ret;
4534} 4615}
4535 4616
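The reworked ext4_sync_fs() issues its own cache flush only when no journal commit it waits on will send one anyway, and the new nojournal variant falls back to the BARRIER mount option. A userspace restatement of that decision, plus the error-propagation idiom from the hunk (all names hypothetical):

#include <stdbool.h>

/* Flush explicitly only if the caller waits, barriers are enabled, and the
 * commit being waited on will not already flush the device for us. */
static bool needs_explicit_barrier(bool wait, bool barriers_enabled,
				   bool commit_sends_barrier)
{
	return wait && barriers_enabled && !commit_sends_barrier;
}

/* Keep the first failure, as 'if (!ret) ret = err;' does above. */
static int propagate_first_error(int ret, int err)
{
	return ret ? ret : err;
}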
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 91ff93b0b0f4..ce11d9a92aed 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -698,7 +698,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
698 get_data_block_ro); 698 get_data_block_ro);
699} 699}
700 700
701static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) 701static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
702 unsigned int length)
702{ 703{
703 struct inode *inode = page->mapping->host; 704 struct inode *inode = page->mapping->host;
704 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 705 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3df43b4efd89..74f3c7b03eb2 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1205,7 +1205,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
1205 return 0; 1205 return 0;
1206} 1206}
1207 1207
1208static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) 1208static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1209 unsigned int length)
1209{ 1210{
1210 struct inode *inode = page->mapping->host; 1211 struct inode *inode = page->mapping->host;
1211 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1212 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69ed6336..ee48ad37d9c0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page,
110 /* Is the page fully outside i_size? (truncate in progress) */ 110 /* Is the page fully outside i_size? (truncate in progress) */
111 offset = i_size & (PAGE_CACHE_SIZE-1); 111 offset = i_size & (PAGE_CACHE_SIZE-1);
112 if (page->index > end_index || (page->index == end_index && !offset)) { 112 if (page->index > end_index || (page->index == end_index && !offset)) {
113 page->mapping->a_ops->invalidatepage(page, 0); 113 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
114 goto out; 114 goto out;
115 } 115 }
116 return 1; 116 return 1;
@@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
299 299
300 /* Is the page fully outside i_size? (truncate in progress) */ 300 /* Is the page fully outside i_size? (truncate in progress) */
301 if (page->index > end_index || (page->index == end_index && !offset)) { 301 if (page->index > end_index || (page->index == end_index && !offset)) {
302 page->mapping->a_ops->invalidatepage(page, 0); 302 page->mapping->a_ops->invalidatepage(page, 0,
303 PAGE_CACHE_SIZE);
303 unlock_page(page); 304 unlock_page(page);
304 continue; 305 continue;
305 } 306 }
@@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
943 unlock_buffer(bh); 944 unlock_buffer(bh);
944} 945}
945 946
946static void gfs2_invalidatepage(struct page *page, unsigned long offset) 947static void gfs2_invalidatepage(struct page *page, unsigned int offset,
948 unsigned int length)
947{ 949{
948 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); 950 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
951 unsigned int stop = offset + length;
952 int partial_page = (offset || length < PAGE_CACHE_SIZE);
949 struct buffer_head *bh, *head; 953 struct buffer_head *bh, *head;
950 unsigned long pos = 0; 954 unsigned long pos = 0;
951 955
952 BUG_ON(!PageLocked(page)); 956 BUG_ON(!PageLocked(page));
953 if (offset == 0) 957 if (!partial_page)
954 ClearPageChecked(page); 958 ClearPageChecked(page);
955 if (!page_has_buffers(page)) 959 if (!page_has_buffers(page))
956 goto out; 960 goto out;
957 961
958 bh = head = page_buffers(page); 962 bh = head = page_buffers(page);
959 do { 963 do {
964 if (pos + bh->b_size > stop)
965 return;
966
960 if (offset <= pos) 967 if (offset <= pos)
961 gfs2_discard(sdp, bh); 968 gfs2_discard(sdp, bh);
962 pos += bh->b_size; 969 pos += bh->b_size;
963 bh = bh->b_this_page; 970 bh = bh->b_this_page;
964 } while (bh != head); 971 } while (bh != head);
965out: 972out:
966 if (offset == 0) 973 if (!partial_page)
967 try_to_release_page(page, 0); 974 try_to_release_page(page, 0);
968} 975}
969 976
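The gfs2 hunk is one instance of the tree-wide ->invalidatepage() change from a bare offset to an (offset, length) range. A self-contained sketch of the resulting walk, assuming hypothetical buffer bookkeeping:

#define DEMO_PAGE_SIZE 4096u

struct demo_buf { unsigned int size; int discarded; };

/* Discard only buffers fully inside [offset, offset + length); a buffer
 * crossing the end of the range stops the walk early. */
static void invalidate_range(struct demo_buf *bufs, int nbufs,
			     unsigned int offset, unsigned int length)
{
	unsigned int stop = offset + length, pos = 0;
	int partial_page = (offset || length < DEMO_PAGE_SIZE);
	int i;

	for (i = 0; i < nbufs; i++) {
		if (pos + bufs[i].size > stop)
			return;			/* past the range: stop */
		if (offset <= pos)
			bufs[i].discarded = 1;	/* gfs2_discard() stand-in */
		pos += bufs[i].size;
	}
	/* a full-page call (!partial_page) would also release the page and
	 * clear per-page state, as the hunk does via try_to_release_page() */
	(void)partial_page;
}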
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e3e255c0a509..be0c39b66fe0 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -2019,16 +2019,20 @@ zap_buffer_unlocked:
2019 * void journal_invalidatepage() - invalidate a journal page 2019 * void journal_invalidatepage() - invalidate a journal page
2020 * @journal: journal to use for flush 2020 * @journal: journal to use for flush
2021 * @page: page to flush 2021 * @page: page to flush
2022 * @offset: length of page to invalidate. 2022 * @offset: offset of the range to invalidate
2023 * @length: length of the range to invalidate
2023 * 2024 *
2024 * Reap page buffers containing data after offset in page. 2025 * Reap page buffers containing data in specified range in page.
2025 */ 2026 */
2026void journal_invalidatepage(journal_t *journal, 2027void journal_invalidatepage(journal_t *journal,
2027 struct page *page, 2028 struct page *page,
2028 unsigned long offset) 2029 unsigned int offset,
2030 unsigned int length)
2029{ 2031{
2030 struct buffer_head *head, *bh, *next; 2032 struct buffer_head *head, *bh, *next;
2033 unsigned int stop = offset + length;
2031 unsigned int curr_off = 0; 2034 unsigned int curr_off = 0;
2035 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2032 int may_free = 1; 2036 int may_free = 1;
2033 2037
2034 if (!PageLocked(page)) 2038 if (!PageLocked(page))
@@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
2036 if (!page_has_buffers(page)) 2040 if (!page_has_buffers(page))
2037 return; 2041 return;
2038 2042
2043 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
2044
2039 /* We will potentially be playing with lists other than just the 2045 /* We will potentially be playing with lists other than just the
2040 * data lists (especially for journaled data mode), so be 2046 * data lists (especially for journaled data mode), so be
2041 * cautious in our locking. */ 2047 * cautious in our locking. */
@@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
2045 unsigned int next_off = curr_off + bh->b_size; 2051 unsigned int next_off = curr_off + bh->b_size;
2046 next = bh->b_this_page; 2052 next = bh->b_this_page;
2047 2053
2054 if (next_off > stop)
2055 return;
2056
2048 if (offset <= curr_off) { 2057 if (offset <= curr_off) {
2049 /* This block is wholly outside the truncation point */ 2058 /* This block is wholly outside the truncation point */
2050 lock_buffer(bh); 2059 lock_buffer(bh);
2051 may_free &= journal_unmap_buffer(journal, bh, 2060 may_free &= journal_unmap_buffer(journal, bh,
2052 offset > 0); 2061 partial_page);
2053 unlock_buffer(bh); 2062 unlock_buffer(bh);
2054 } 2063 }
2055 curr_off = next_off; 2064 curr_off = next_off;
@@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
2057 2066
2058 } while (bh != head); 2067 } while (bh != head);
2059 2068
2060 if (!offset) { 2069 if (!partial_page) {
2061 if (may_free && try_to_free_buffers(page)) 2070 if (may_free && try_to_free_buffers(page))
2062 J_ASSERT(!page_has_buffers(page)); 2071 J_ASSERT(!page_has_buffers(page));
2063 } 2072 }
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2944da..5a9f5534d57b 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -20,7 +20,7 @@ config JBD2
20 20
21config JBD2_DEBUG 21config JBD2_DEBUG
22 bool "JBD2 (ext4) debugging support" 22 bool "JBD2 (ext4) debugging support"
23 depends on JBD2 && DEBUG_FS 23 depends on JBD2
24 help 24 help
25 If you are using the ext4 journaled file system (or 25 If you are using the ext4 journaled file system (or
26 potentially any other filesystem/device using JBD2), this option 26 potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@ config JBD2_DEBUG
29 By default, the debugging output will be turned off. 29 By default, the debugging output will be turned off.
30 30
31 If you select Y here, then you will be able to turn on debugging 31 If you select Y here, then you will be able to turn on debugging
32 with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a 32 with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
33 number between 1 and 5. The higher the number, the more debugging 33 number between 1 and 5. The higher the number, the more debugging
34 output is generated. To turn debugging off again, do 34 output is generated. To turn debugging off again, do
35 "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". 35 "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841ee81cf..7f34f4716165 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
120 int nblocks, space_left; 120 int nblocks, space_left;
121 /* assert_spin_locked(&journal->j_state_lock); */ 121 /* assert_spin_locked(&journal->j_state_lock); */
122 122
123 nblocks = jbd_space_needed(journal); 123 nblocks = jbd2_space_needed(journal);
124 while (__jbd2_log_space_left(journal) < nblocks) { 124 while (jbd2_log_space_left(journal) < nblocks) {
125 if (journal->j_flags & JBD2_ABORT) 125 if (journal->j_flags & JBD2_ABORT)
126 return; 126 return;
127 write_unlock(&journal->j_state_lock); 127 write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
140 */ 140 */
141 write_lock(&journal->j_state_lock); 141 write_lock(&journal->j_state_lock);
142 spin_lock(&journal->j_list_lock); 142 spin_lock(&journal->j_list_lock);
143 nblocks = jbd_space_needed(journal); 143 nblocks = jbd2_space_needed(journal);
144 space_left = __jbd2_log_space_left(journal); 144 space_left = jbd2_log_space_left(journal);
145 if (space_left < nblocks) { 145 if (space_left < nblocks) {
146 int chkpt = journal->j_checkpoint_transactions != NULL; 146 int chkpt = journal->j_checkpoint_transactions != NULL;
147 tid_t tid = 0; 147 tid_t tid = 0;
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
156 /* We were able to recover space; yay! */ 156 /* We were able to recover space; yay! */
157 ; 157 ;
158 } else if (tid) { 158 } else if (tid) {
159 /*
160 * jbd2_journal_commit_transaction() may want
161 * to take the checkpoint_mutex if JBD2_FLUSHED
162 * is set. So we need to temporarily drop it.
163 */
164 mutex_unlock(&journal->j_checkpoint_mutex);
159 jbd2_log_wait_commit(journal, tid); 165 jbd2_log_wait_commit(journal, tid);
166 write_lock(&journal->j_state_lock);
167 continue;
160 } else { 168 } else {
161 printk(KERN_ERR "%s: needed %d blocks and " 169 printk(KERN_ERR "%s: needed %d blocks and "
162 "only had %d space available\n", 170 "only had %d space available\n",
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
625 633
626 __jbd2_journal_drop_transaction(journal, transaction); 634 __jbd2_journal_drop_transaction(journal, transaction);
627 jbd2_journal_free_transaction(transaction); 635 jbd2_journal_free_transaction(transaction);
628
629 /* Just in case anybody was waiting for more transactions to be
630 checkpointed... */
631 wake_up(&journal->j_wait_logspace);
632 ret = 1; 636 ret = 1;
633out: 637out:
634 return ret; 638 return ret;
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
690 J_ASSERT(transaction->t_state == T_FINISHED); 694 J_ASSERT(transaction->t_state == T_FINISHED);
691 J_ASSERT(transaction->t_buffers == NULL); 695 J_ASSERT(transaction->t_buffers == NULL);
692 J_ASSERT(transaction->t_forget == NULL); 696 J_ASSERT(transaction->t_forget == NULL);
693 J_ASSERT(transaction->t_iobuf_list == NULL);
694 J_ASSERT(transaction->t_shadow_list == NULL); 697 J_ASSERT(transaction->t_shadow_list == NULL);
695 J_ASSERT(transaction->t_log_list == NULL);
696 J_ASSERT(transaction->t_checkpoint_list == NULL); 698 J_ASSERT(transaction->t_checkpoint_list == NULL);
697 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 699 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
698 J_ASSERT(atomic_read(&transaction->t_updates) == 0); 700 J_ASSERT(atomic_read(&transaction->t_updates) == 0);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0f53946f13c1..559bec1a37b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
30#include <trace/events/jbd2.h> 30#include <trace/events/jbd2.h>
31 31
32/* 32/*
33 * Default IO end handler for temporary BJ_IO buffer_heads. 33 * IO end handler for temporary buffer_heads handling writes to the journal.
34 */ 34 */
35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
36{ 36{
37 struct buffer_head *orig_bh = bh->b_private;
38
37 BUFFER_TRACE(bh, ""); 39 BUFFER_TRACE(bh, "");
38 if (uptodate) 40 if (uptodate)
39 set_buffer_uptodate(bh); 41 set_buffer_uptodate(bh);
40 else 42 else
41 clear_buffer_uptodate(bh); 43 clear_buffer_uptodate(bh);
44 if (orig_bh) {
45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 smp_mb__after_clear_bit();
47 wake_up_bit(&orig_bh->b_state, BH_Shadow);
48 }
42 unlock_buffer(bh); 49 unlock_buffer(bh);
43} 50}
44 51
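The new end_io handler publishes completion through the original buffer: clear BH_Shadow with release semantics, a full barrier, then wake the bit waitqueue. A C11 analogue of that handoff (names hypothetical; a conservative store stands in for clear_bit_unlock() plus smp_mb__after_clear_bit()):

#include <stdatomic.h>

/* Clear the flag first, then wake; waiters must re-check the flag after
 * waking, which is what the kernel's bit-waitqueue pairing guarantees. */
static void shadow_io_done(atomic_bool *shadow, void (*wake_all)(void))
{
	atomic_store_explicit(shadow, false, memory_order_seq_cst);
	wake_all();
}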
@@ -85,8 +92,7 @@ nope:
85 __brelse(bh); 92 __brelse(bh);
86} 93}
87 94
88static void jbd2_commit_block_csum_set(journal_t *j, 95static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
89 struct journal_head *descriptor)
90{ 96{
91 struct commit_header *h; 97 struct commit_header *h;
92 __u32 csum; 98 __u32 csum;
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j,
94 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 100 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
95 return; 101 return;
96 102
97 h = (struct commit_header *)(jh2bh(descriptor)->b_data); 103 h = (struct commit_header *)(bh->b_data);
98 h->h_chksum_type = 0; 104 h->h_chksum_type = 0;
99 h->h_chksum_size = 0; 105 h->h_chksum_size = 0;
100 h->h_chksum[0] = 0; 106 h->h_chksum[0] = 0;
101 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 107 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
102 j->j_blocksize);
103 h->h_chksum[0] = cpu_to_be32(csum); 108 h->h_chksum[0] = cpu_to_be32(csum);
104} 109}
105 110
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
116 struct buffer_head **cbh, 121 struct buffer_head **cbh,
117 __u32 crc32_sum) 122 __u32 crc32_sum)
118{ 123{
119 struct journal_head *descriptor;
120 struct commit_header *tmp; 124 struct commit_header *tmp;
121 struct buffer_head *bh; 125 struct buffer_head *bh;
122 int ret; 126 int ret;
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
127 if (is_journal_aborted(journal)) 131 if (is_journal_aborted(journal))
128 return 0; 132 return 0;
129 133
130 descriptor = jbd2_journal_get_descriptor_buffer(journal); 134 bh = jbd2_journal_get_descriptor_buffer(journal);
131 if (!descriptor) 135 if (!bh)
132 return 1; 136 return 1;
133 137
134 bh = jh2bh(descriptor);
135
136 tmp = (struct commit_header *)bh->b_data; 138 tmp = (struct commit_header *)bh->b_data;
137 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
138 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
146 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 148 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
147 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 149 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
148 } 150 }
149 jbd2_commit_block_csum_set(journal, descriptor); 151 jbd2_commit_block_csum_set(journal, bh);
150 152
151 JBUFFER_TRACE(descriptor, "submit commit block"); 153 BUFFER_TRACE(bh, "submit commit block");
152 lock_buffer(bh); 154 lock_buffer(bh);
153 clear_buffer_dirty(bh); 155 clear_buffer_dirty(bh);
154 set_buffer_uptodate(bh); 156 set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
180 if (unlikely(!buffer_uptodate(bh))) 182 if (unlikely(!buffer_uptodate(bh)))
181 ret = -EIO; 183 ret = -EIO;
182 put_bh(bh); /* One for getblk() */ 184 put_bh(bh); /* One for getblk() */
183 jbd2_journal_put_journal_head(bh2jh(bh));
184 185
185 return ret; 186 return ret;
186} 187}
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
321} 322}
322 323
323static void jbd2_descr_block_csum_set(journal_t *j, 324static void jbd2_descr_block_csum_set(journal_t *j,
324 struct journal_head *descriptor) 325 struct buffer_head *bh)
325{ 326{
326 struct jbd2_journal_block_tail *tail; 327 struct jbd2_journal_block_tail *tail;
327 __u32 csum; 328 __u32 csum;
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j,
329 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 330 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
330 return; 331 return;
331 332
332 tail = (struct jbd2_journal_block_tail *) 333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
333 (jh2bh(descriptor)->b_data + j->j_blocksize -
334 sizeof(struct jbd2_journal_block_tail)); 334 sizeof(struct jbd2_journal_block_tail));
335 tail->t_checksum = 0; 335 tail->t_checksum = 0;
336 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 336 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
337 j->j_blocksize);
338 tail->t_checksum = cpu_to_be32(csum); 337 tail->t_checksum = cpu_to_be32(csum);
339} 338}
340 339
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
343{ 342{
344 struct page *page = bh->b_page; 343 struct page *page = bh->b_page;
345 __u8 *addr; 344 __u8 *addr;
346 __u32 csum; 345 __u32 csum32;
347 346
348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 347 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
349 return; 348 return;
350 349
351 sequence = cpu_to_be32(sequence); 350 sequence = cpu_to_be32(sequence);
352 addr = kmap_atomic(page); 351 addr = kmap_atomic(page);
353 csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
354 sizeof(sequence)); 353 sizeof(sequence));
355 csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), 354 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
356 bh->b_size); 355 bh->b_size);
357 kunmap_atomic(addr); 356 kunmap_atomic(addr);
358 357
359 tag->t_checksum = cpu_to_be32(csum); 358 /* We only have space to store the lower 16 bits of the crc32c. */
359 tag->t_checksum = cpu_to_be16(csum32);
360} 360}
361/* 361/*
362 * jbd2_journal_commit_transaction 362 * jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
368{ 368{
369 struct transaction_stats_s stats; 369 struct transaction_stats_s stats;
370 transaction_t *commit_transaction; 370 transaction_t *commit_transaction;
371 struct journal_head *jh, *new_jh, *descriptor; 371 struct journal_head *jh;
372 struct buffer_head *descriptor;
372 struct buffer_head **wbuf = journal->j_wbuf; 373 struct buffer_head **wbuf = journal->j_wbuf;
373 int bufs; 374 int bufs;
374 int flags; 375 int flags;
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
392 tid_t first_tid; 393 tid_t first_tid;
393 int update_tail; 394 int update_tail;
394 int csum_size = 0; 395 int csum_size = 0;
396 LIST_HEAD(io_bufs);
397 LIST_HEAD(log_bufs);
395 398
396 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 399 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
397 csum_size = sizeof(struct jbd2_journal_block_tail); 400 csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
424 J_ASSERT(journal->j_committing_transaction == NULL); 427 J_ASSERT(journal->j_committing_transaction == NULL);
425 428
426 commit_transaction = journal->j_running_transaction; 429 commit_transaction = journal->j_running_transaction;
427 J_ASSERT(commit_transaction->t_state == T_RUNNING);
428 430
429 trace_jbd2_start_commit(journal, commit_transaction); 431 trace_jbd2_start_commit(journal, commit_transaction);
430 jbd_debug(1, "JBD2: starting commit of transaction %d\n", 432 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
431 commit_transaction->t_tid); 433 commit_transaction->t_tid);
432 434
433 write_lock(&journal->j_state_lock); 435 write_lock(&journal->j_state_lock);
436 J_ASSERT(commit_transaction->t_state == T_RUNNING);
434 commit_transaction->t_state = T_LOCKED; 437 commit_transaction->t_state = T_LOCKED;
435 438
436 trace_jbd2_commit_locking(journal, commit_transaction); 439 trace_jbd2_commit_locking(journal, commit_transaction);
@@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
520 */ 523 */
521 jbd2_journal_switch_revoke_table(journal); 524 jbd2_journal_switch_revoke_table(journal);
522 525
526 /*
527 * Reserved credits cannot be claimed anymore, free them
528 */
529 atomic_sub(atomic_read(&journal->j_reserved_credits),
530 &commit_transaction->t_outstanding_credits);
531
523 trace_jbd2_commit_flushing(journal, commit_transaction); 532 trace_jbd2_commit_flushing(journal, commit_transaction);
524 stats.run.rs_flushing = jiffies; 533 stats.run.rs_flushing = jiffies;
525 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, 534 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
533 wake_up(&journal->j_wait_transaction_locked); 542 wake_up(&journal->j_wait_transaction_locked);
534 write_unlock(&journal->j_state_lock); 543 write_unlock(&journal->j_state_lock);
535 544
536 jbd_debug(3, "JBD2: commit phase 2\n"); 545 jbd_debug(3, "JBD2: commit phase 2a\n");
537 546
538 /* 547 /*
539 * Now start flushing things to disk, in the order they appear 548 * Now start flushing things to disk, in the order they appear
@@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
545 554
546 blk_start_plug(&plug); 555 blk_start_plug(&plug);
547 jbd2_journal_write_revoke_records(journal, commit_transaction, 556 jbd2_journal_write_revoke_records(journal, commit_transaction,
548 WRITE_SYNC); 557 &log_bufs, WRITE_SYNC);
549 blk_finish_plug(&plug); 558 blk_finish_plug(&plug);
550 559
551 jbd_debug(3, "JBD2: commit phase 2\n"); 560 jbd_debug(3, "JBD2: commit phase 2b\n");
552 561
553 /* 562 /*
554 * Way to go: we have now written out all of the data for a 563 * Way to go: we have now written out all of the data for a
@@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
571 atomic_read(&commit_transaction->t_outstanding_credits)); 580 atomic_read(&commit_transaction->t_outstanding_credits));
572 581
573 err = 0; 582 err = 0;
574 descriptor = NULL;
575 bufs = 0; 583 bufs = 0;
584 descriptor = NULL;
576 blk_start_plug(&plug); 585 blk_start_plug(&plug);
577 while (commit_transaction->t_buffers) { 586 while (commit_transaction->t_buffers) {
578 587
@@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
604 record the metadata buffer. */ 613 record the metadata buffer. */
605 614
606 if (!descriptor) { 615 if (!descriptor) {
607 struct buffer_head *bh;
608
609 J_ASSERT (bufs == 0); 616 J_ASSERT (bufs == 0);
610 617
611 jbd_debug(4, "JBD2: get descriptor\n"); 618 jbd_debug(4, "JBD2: get descriptor\n");
@@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 continue; 623 continue;
617 } 624 }
618 625
619 bh = jh2bh(descriptor);
620 jbd_debug(4, "JBD2: got buffer %llu (%p)\n", 626 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
621 (unsigned long long)bh->b_blocknr, bh->b_data); 627 (unsigned long long)descriptor->b_blocknr,
622 header = (journal_header_t *)&bh->b_data[0]; 628 descriptor->b_data);
629 header = (journal_header_t *)descriptor->b_data;
623 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 630 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
624 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); 631 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
625 header->h_sequence = cpu_to_be32(commit_transaction->t_tid); 632 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
626 633
627 tagp = &bh->b_data[sizeof(journal_header_t)]; 634 tagp = &descriptor->b_data[sizeof(journal_header_t)];
628 space_left = bh->b_size - sizeof(journal_header_t); 635 space_left = descriptor->b_size -
636 sizeof(journal_header_t);
629 first_tag = 1; 637 first_tag = 1;
630 set_buffer_jwrite(bh); 638 set_buffer_jwrite(descriptor);
631 set_buffer_dirty(bh); 639 set_buffer_dirty(descriptor);
632 wbuf[bufs++] = bh; 640 wbuf[bufs++] = descriptor;
633 641
634 /* Record it so that we can wait for IO 642 /* Record it so that we can wait for IO
635 completion later */ 643 completion later */
636 BUFFER_TRACE(bh, "ph3: file as descriptor"); 644 BUFFER_TRACE(descriptor, "ph3: file as descriptor");
637 jbd2_journal_file_buffer(descriptor, commit_transaction, 645 jbd2_file_log_bh(&log_bufs, descriptor);
638 BJ_LogCtl);
639 } 646 }
640 647
641 /* Where is the buffer to be written? */ 648 /* Where is the buffer to be written? */
@@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
658 665
659 /* Bump b_count to prevent truncate from stumbling over 666 /* Bump b_count to prevent truncate from stumbling over
660 the shadowed buffer! @@@ This can go if we ever get 667 the shadowed buffer! @@@ This can go if we ever get
661 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 668 rid of the shadow pairing of buffers. */
662 atomic_inc(&jh2bh(jh)->b_count); 669 atomic_inc(&jh2bh(jh)->b_count);
663 670
664 /* Make a temporary IO buffer with which to write it out
665 (this will requeue both the metadata buffer and the
666 temporary IO buffer). new_bh goes on BJ_IO*/
667
668 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
669 /* 671 /*
670 * akpm: jbd2_journal_write_metadata_buffer() sets 672 * Make a temporary IO buffer with which to write it out
671 * new_bh->b_transaction to commit_transaction. 673 * (this will requeue the metadata buffer to BJ_Shadow).
672 * We need to clean this up before we release new_bh
673 * (which is of type BJ_IO)
674 */ 674 */
675 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
675 JBUFFER_TRACE(jh, "ph3: write metadata"); 676 JBUFFER_TRACE(jh, "ph3: write metadata");
676 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 677 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
677 jh, &new_jh, blocknr); 678 jh, &wbuf[bufs], blocknr);
678 if (flags < 0) { 679 if (flags < 0) {
679 jbd2_journal_abort(journal, flags); 680 jbd2_journal_abort(journal, flags);
680 continue; 681 continue;
681 } 682 }
682 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 683 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
683 wbuf[bufs++] = jh2bh(new_jh);
684 684
685 /* Record the new block's tag in the current descriptor 685 /* Record the new block's tag in the current descriptor
686 buffer */ 686 buffer */
@@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
694 tag = (journal_block_tag_t *) tagp; 694 tag = (journal_block_tag_t *) tagp;
695 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 695 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
696 tag->t_flags = cpu_to_be16(tag_flag); 696 tag->t_flags = cpu_to_be16(tag_flag);
697 jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), 697 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
698 commit_transaction->t_tid); 698 commit_transaction->t_tid);
699 tagp += tag_bytes; 699 tagp += tag_bytes;
700 space_left -= tag_bytes; 700 space_left -= tag_bytes;
701 bufs++;
701 702
702 if (first_tag) { 703 if (first_tag) {
703 memcpy (tagp, journal->j_uuid, 16); 704 memcpy (tagp, journal->j_uuid, 16);
@@ -809,7 +810,7 @@ start_journal_io:
809 the log. Before we can commit it, wait for the IO so far to 810 the log. Before we can commit it, wait for the IO so far to
810 complete. Control buffers being written are on the 811 complete. Control buffers being written are on the
811 transaction's t_log_list queue, and metadata buffers are on 812 transaction's t_log_list queue, and metadata buffers are on
812 the t_iobuf_list queue. 813 the io_bufs list.
813 814
814 Wait for the buffers in reverse order. That way we are 815 Wait for the buffers in reverse order. That way we are
815 less likely to be woken up until all IOs have completed, and 816 less likely to be woken up until all IOs have completed, and
@@ -818,47 +819,33 @@ start_journal_io:
818 819
819 jbd_debug(3, "JBD2: commit phase 3\n"); 820 jbd_debug(3, "JBD2: commit phase 3\n");
820 821
821 /* 822 while (!list_empty(&io_bufs)) {
822 * akpm: these are BJ_IO, and j_list_lock is not needed. 823 struct buffer_head *bh = list_entry(io_bufs.prev,
823 * See __journal_try_to_free_buffer. 824 struct buffer_head,
824 */ 825 b_assoc_buffers);
825wait_for_iobuf:
826 while (commit_transaction->t_iobuf_list != NULL) {
827 struct buffer_head *bh;
828 826
829 jh = commit_transaction->t_iobuf_list->b_tprev; 827 wait_on_buffer(bh);
830 bh = jh2bh(jh); 828 cond_resched();
831 if (buffer_locked(bh)) {
832 wait_on_buffer(bh);
833 goto wait_for_iobuf;
834 }
835 if (cond_resched())
836 goto wait_for_iobuf;
837 829
838 if (unlikely(!buffer_uptodate(bh))) 830 if (unlikely(!buffer_uptodate(bh)))
839 err = -EIO; 831 err = -EIO;
840 832 jbd2_unfile_log_bh(bh);
841 clear_buffer_jwrite(bh);
842
843 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
844 jbd2_journal_unfile_buffer(journal, jh);
845 833
846 /* 834 /*
847 * ->t_iobuf_list should contain only dummy buffer_heads 835 * The list contains temporary buffer heads created by
848 * which were created by jbd2_journal_write_metadata_buffer(). 836 * jbd2_journal_write_metadata_buffer().
849 */ 837 */
850 BUFFER_TRACE(bh, "dumping temporary bh"); 838 BUFFER_TRACE(bh, "dumping temporary bh");
851 jbd2_journal_put_journal_head(jh);
852 __brelse(bh); 839 __brelse(bh);
853 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 840 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
854 free_buffer_head(bh); 841 free_buffer_head(bh);
855 842
856 /* We also have to unlock and free the corresponding 843 /* We also have to refile the corresponding shadowed buffer */
857 shadowed buffer */
858 jh = commit_transaction->t_shadow_list->b_tprev; 844 jh = commit_transaction->t_shadow_list->b_tprev;
859 bh = jh2bh(jh); 845 bh = jh2bh(jh);
860 clear_bit(BH_JWrite, &bh->b_state); 846 clear_buffer_jwrite(bh);
861 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 847 J_ASSERT_BH(bh, buffer_jbddirty(bh));
848 J_ASSERT_BH(bh, !buffer_shadow(bh));
862 849
863 /* The metadata is now released for reuse, but we need 850 /* The metadata is now released for reuse, but we need
864 to remember it against this transaction so that when 851 to remember it against this transaction so that when
@@ -866,14 +853,6 @@ wait_for_iobuf:
866 required. */ 853 required. */
867 JBUFFER_TRACE(jh, "file as BJ_Forget"); 854 JBUFFER_TRACE(jh, "file as BJ_Forget");
868 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 855 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
869 /*
870 * Wake up any transactions which were waiting for this IO to
871 * complete. The barrier must be here so that changes by
872 * jbd2_journal_file_buffer() take effect before wake_up_bit()
873 * does the waitqueue check.
874 */
875 smp_mb();
876 wake_up_bit(&bh->b_state, BH_Unshadow);
877 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 856 JBUFFER_TRACE(jh, "brelse shadowed buffer");
878 __brelse(bh); 857 __brelse(bh);
879 } 858 }
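The open-coded wait_for_iobuf retry loop becomes a simple tail-first drain of the local io_bufs list; waiting in reverse submission order means the task usually sleeps once, on the last IO to finish. A minimal sketch with a hypothetical doubly linked list:

struct demo_node { struct demo_node *prev, *next; };

/* head is a list head, as with LIST_HEAD(io_bufs) in the hunk above. */
static void drain_tail_first(struct demo_node *head,
			     void (*wait_io)(struct demo_node *),
			     void (*release)(struct demo_node *))
{
	while (head->prev != head) {		/* !list_empty() */
		struct demo_node *last = head->prev;

		wait_io(last);			/* wait_on_buffer() */
		last->prev->next = head;	/* unfile, i.e. list_del() */
		head->prev = last->prev;
		release(last);			/* __brelse() + free */
	}
}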
@@ -883,26 +862,19 @@ wait_for_iobuf:
883 jbd_debug(3, "JBD2: commit phase 4\n"); 862 jbd_debug(3, "JBD2: commit phase 4\n");
884 863
885 /* Here we wait for the revoke record and descriptor record buffers */ 864 /* Here we wait for the revoke record and descriptor record buffers */
886 wait_for_ctlbuf: 865 while (!list_empty(&log_bufs)) {
887 while (commit_transaction->t_log_list != NULL) {
888 struct buffer_head *bh; 866 struct buffer_head *bh;
889 867
890 jh = commit_transaction->t_log_list->b_tprev; 868 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
891 bh = jh2bh(jh); 869 wait_on_buffer(bh);
892 if (buffer_locked(bh)) { 870 cond_resched();
893 wait_on_buffer(bh);
894 goto wait_for_ctlbuf;
895 }
896 if (cond_resched())
897 goto wait_for_ctlbuf;
898 871
899 if (unlikely(!buffer_uptodate(bh))) 872 if (unlikely(!buffer_uptodate(bh)))
900 err = -EIO; 873 err = -EIO;
901 874
902 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 875 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
903 clear_buffer_jwrite(bh); 876 clear_buffer_jwrite(bh);
904 jbd2_journal_unfile_buffer(journal, jh); 877 jbd2_unfile_log_bh(bh);
905 jbd2_journal_put_journal_head(jh);
906 __brelse(bh); /* One for getblk */ 878 __brelse(bh); /* One for getblk */
907 /* AKPM: bforget here */ 879 /* AKPM: bforget here */
908 } 880 }
@@ -952,9 +924,7 @@ wait_for_iobuf:
952 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 924 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
953 J_ASSERT(commit_transaction->t_buffers == NULL); 925 J_ASSERT(commit_transaction->t_buffers == NULL);
954 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 926 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
955 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
956 J_ASSERT(commit_transaction->t_shadow_list == NULL); 927 J_ASSERT(commit_transaction->t_shadow_list == NULL);
957 J_ASSERT(commit_transaction->t_log_list == NULL);
958 928
959restart_loop: 929restart_loop:
960 /* 930 /*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 95457576e434..02c7ad9d7a41 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache);
103static void __journal_abort_soft (journal_t *journal, int errno); 103static void __journal_abort_soft (journal_t *journal, int errno);
104static int jbd2_journal_create_slab(size_t slab_size); 104static int jbd2_journal_create_slab(size_t slab_size);
105 105
106#ifdef CONFIG_JBD2_DEBUG
107void __jbd2_debug(int level, const char *file, const char *func,
108 unsigned int line, const char *fmt, ...)
109{
110 struct va_format vaf;
111 va_list args;
112
113 if (level > jbd2_journal_enable_debug)
114 return;
115 va_start(args, fmt);
116 vaf.fmt = fmt;
117 vaf.va = &args;
118 printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
119 va_end(args);
120}
121EXPORT_SYMBOL(__jbd2_debug);
122#endif
123
106/* Checksumming functions */ 124/* Checksumming functions */
107int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
108{ 126{
@@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal)
310 * 328 *
311 * If the source buffer has already been modified by a new transaction 329 * If the source buffer has already been modified by a new transaction
312 * since we took the last commit snapshot, we use the frozen copy of 330 * since we took the last commit snapshot, we use the frozen copy of
313 * that data for IO. If we end up using the existing buffer_head's data 331 * that data for IO. If we end up using the existing buffer_head's data
314 * for the write, then we *have* to lock the buffer to prevent anyone 332 * for the write, then we have to make sure nobody modifies it while the
315 * else from using and possibly modifying it while the IO is in 333 * IO is in progress. do_get_write_access() handles this.
316 * progress.
317 * 334 *
318 * The function returns a pointer to the buffer_heads to be used for IO. 335 * The function returns a pointer to the buffer_head to be used for IO.
319 * 336 *
320 * We assume that the journal has already been locked in this function.
321 * 337 *
322 * Return value: 338 * Return value:
323 * <0: Error 339 * <0: Error
@@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal)
330 346
331int jbd2_journal_write_metadata_buffer(transaction_t *transaction, 347int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
332 struct journal_head *jh_in, 348 struct journal_head *jh_in,
333 struct journal_head **jh_out, 349 struct buffer_head **bh_out,
334 unsigned long long blocknr) 350 sector_t blocknr)
335{ 351{
336 int need_copy_out = 0; 352 int need_copy_out = 0;
337 int done_copy_out = 0; 353 int done_copy_out = 0;
338 int do_escape = 0; 354 int do_escape = 0;
339 char *mapped_data; 355 char *mapped_data;
340 struct buffer_head *new_bh; 356 struct buffer_head *new_bh;
341 struct journal_head *new_jh;
342 struct page *new_page; 357 struct page *new_page;
343 unsigned int new_offset; 358 unsigned int new_offset;
344 struct buffer_head *bh_in = jh2bh(jh_in); 359 struct buffer_head *bh_in = jh2bh(jh_in);
@@ -368,14 +383,13 @@ retry_alloc:
368 383
369 /* keep subsequent assertions sane */ 384 /* keep subsequent assertions sane */
370 atomic_set(&new_bh->b_count, 1); 385 atomic_set(&new_bh->b_count, 1);
371 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
372 386
387 jbd_lock_bh_state(bh_in);
388repeat:
373 /* 389 /*
374 * If a new transaction has already done a buffer copy-out, then 390 * If a new transaction has already done a buffer copy-out, then
375 * we use that version of the data for the commit. 391 * we use that version of the data for the commit.
376 */ 392 */
377 jbd_lock_bh_state(bh_in);
378repeat:
379 if (jh_in->b_frozen_data) { 393 if (jh_in->b_frozen_data) {
380 done_copy_out = 1; 394 done_copy_out = 1;
381 new_page = virt_to_page(jh_in->b_frozen_data); 395 new_page = virt_to_page(jh_in->b_frozen_data);
@@ -415,7 +429,7 @@ repeat:
415 jbd_unlock_bh_state(bh_in); 429 jbd_unlock_bh_state(bh_in);
416 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 430 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
417 if (!tmp) { 431 if (!tmp) {
418 jbd2_journal_put_journal_head(new_jh); 432 brelse(new_bh);
419 return -ENOMEM; 433 return -ENOMEM;
420 } 434 }
421 jbd_lock_bh_state(bh_in); 435 jbd_lock_bh_state(bh_in);
@@ -426,7 +440,7 @@ repeat:
426 440
427 jh_in->b_frozen_data = tmp; 441 jh_in->b_frozen_data = tmp;
428 mapped_data = kmap_atomic(new_page); 442 mapped_data = kmap_atomic(new_page);
429 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); 443 memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
430 kunmap_atomic(mapped_data); 444 kunmap_atomic(mapped_data);
431 445
432 new_page = virt_to_page(tmp); 446 new_page = virt_to_page(tmp);
@@ -452,14 +466,14 @@ repeat:
452 } 466 }
453 467
454 set_bh_page(new_bh, new_page, new_offset); 468 set_bh_page(new_bh, new_page, new_offset);
455 new_jh->b_transaction = NULL; 469 new_bh->b_size = bh_in->b_size;
456 new_bh->b_size = jh2bh(jh_in)->b_size; 470 new_bh->b_bdev = journal->j_dev;
457 new_bh->b_bdev = transaction->t_journal->j_dev;
458 new_bh->b_blocknr = blocknr; 471 new_bh->b_blocknr = blocknr;
472 new_bh->b_private = bh_in;
459 set_buffer_mapped(new_bh); 473 set_buffer_mapped(new_bh);
460 set_buffer_dirty(new_bh); 474 set_buffer_dirty(new_bh);
461 475
462 *jh_out = new_jh; 476 *bh_out = new_bh;
463 477
464 /* 478 /*
465 * The to-be-written buffer needs to get moved to the io queue, 479 * The to-be-written buffer needs to get moved to the io queue,
@@ -470,11 +484,9 @@ repeat:
470 spin_lock(&journal->j_list_lock); 484 spin_lock(&journal->j_list_lock);
471 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 485 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
472 spin_unlock(&journal->j_list_lock); 486 spin_unlock(&journal->j_list_lock);
487 set_buffer_shadow(bh_in);
473 jbd_unlock_bh_state(bh_in); 488 jbd_unlock_bh_state(bh_in);
474 489
475 JBUFFER_TRACE(new_jh, "file as BJ_IO");
476 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
477
478 return do_escape | (done_copy_out << 1); 490 return do_escape | (done_copy_out << 1);
479} 491}
480 492
@@ -484,35 +496,6 @@ repeat:
484 */ 496 */
485 497
486/* 498/*
487 * __jbd2_log_space_left: Return the number of free blocks left in the journal.
488 *
489 * Called with the journal already locked.
490 *
491 * Called under j_state_lock
492 */
493
494int __jbd2_log_space_left(journal_t *journal)
495{
496 int left = journal->j_free;
497
498 /* assert_spin_locked(&journal->j_state_lock); */
499
500 /*
501 * Be pessimistic here about the number of those free blocks which
502 * might be required for log descriptor control blocks.
503 */
504
505#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
506
507 left -= MIN_LOG_RESERVED_BLOCKS;
508
509 if (left <= 0)
510 return 0;
511 left -= (left >> 3);
512 return left;
513}
514
515/*
516 * Called with j_state_lock locked for writing. 499 * Called with j_state_lock locked for writing.
517 * Returns true if a transaction commit was started. 500 * Returns true if a transaction commit was started.
518 */ 501 */
@@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
564} 547}
565 548
566/* 549/*
567 * Force and wait upon a commit if the calling process is not within 550 * Force and wait any uncommitted transactions. We can only force the running
568 * transaction. This is used for forcing out undo-protected data which contains 551 * transaction if we don't have an active handle, otherwise, we will deadlock.
569 * bitmaps, when the fs is running out of space. 552 * Returns: <0 in case of error,
570 * 553 * 0 if nothing to commit,
571 * We can only force the running transaction if we don't have an active handle; 554 * 1 if transaction was successfully committed.
572 * otherwise, we will deadlock.
573 *
574 * Returns true if a transaction was started.
575 */ 555 */
576int jbd2_journal_force_commit_nested(journal_t *journal) 556static int __jbd2_journal_force_commit(journal_t *journal)
577{ 557{
578 transaction_t *transaction = NULL; 558 transaction_t *transaction = NULL;
579 tid_t tid; 559 tid_t tid;
580 int need_to_start = 0; 560 int need_to_start = 0, ret = 0;
581 561
582 read_lock(&journal->j_state_lock); 562 read_lock(&journal->j_state_lock);
583 if (journal->j_running_transaction && !current->journal_info) { 563 if (journal->j_running_transaction && !current->journal_info) {
@@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
588 transaction = journal->j_committing_transaction; 568 transaction = journal->j_committing_transaction;
589 569
590 if (!transaction) { 570 if (!transaction) {
571 /* Nothing to commit */
591 read_unlock(&journal->j_state_lock); 572 read_unlock(&journal->j_state_lock);
592 return 0; /* Nothing to retry */ 573 return 0;
593 } 574 }
594
595 tid = transaction->t_tid; 575 tid = transaction->t_tid;
596 read_unlock(&journal->j_state_lock); 576 read_unlock(&journal->j_state_lock);
597 if (need_to_start) 577 if (need_to_start)
598 jbd2_log_start_commit(journal, tid); 578 jbd2_log_start_commit(journal, tid);
599 jbd2_log_wait_commit(journal, tid); 579 ret = jbd2_log_wait_commit(journal, tid);
600 return 1; 580 if (!ret)
581 ret = 1;
582
583 return ret;
584}
585
586/**
 587 * Force and wait upon a commit if the calling process is not within a
588 * transaction. This is used for forcing out undo-protected data which contains
589 * bitmaps, when the fs is running out of space.
590 *
591 * @journal: journal to force
592 * Returns true if progress was made.
593 */
594int jbd2_journal_force_commit_nested(journal_t *journal)
595{
596 int ret;
597
598 ret = __jbd2_journal_force_commit(journal);
599 return ret > 0;
600}
601
602/**
 603 * int jbd2_journal_force_commit() - force any uncommitted transactions
604 * @journal: journal to force
605 *
 606 * Caller wants an unconditional commit. We can only force the running transaction
607 * if we don't have an active handle, otherwise, we will deadlock.
608 */
609int jbd2_journal_force_commit(journal_t *journal)
610{
611 int ret;
612
613 J_ASSERT(!current->journal_info);
614 ret = __jbd2_journal_force_commit(journal);
615 if (ret > 0)
616 ret = 0;
617 return ret;
601} 618}
602 619
603/* 620/*
@@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
798 * But we don't bother doing that, so there will be coherency problems with 815 * But we don't bother doing that, so there will be coherency problems with
799 * mmaps of blockdevs which hold live JBD-controlled filesystems. 816 * mmaps of blockdevs which hold live JBD-controlled filesystems.
800 */ 817 */
801struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 818struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
802{ 819{
803 struct buffer_head *bh; 820 struct buffer_head *bh;
804 unsigned long long blocknr; 821 unsigned long long blocknr;
@@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
817 set_buffer_uptodate(bh); 834 set_buffer_uptodate(bh);
818 unlock_buffer(bh); 835 unlock_buffer(bh);
819 BUFFER_TRACE(bh, "return this buffer"); 836 BUFFER_TRACE(bh, "return this buffer");
820 return jbd2_journal_add_journal_head(bh); 837 return bh;
821} 838}
822 839
823/* 840/*
@@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void)
1062 return NULL; 1079 return NULL;
1063 1080
1064 init_waitqueue_head(&journal->j_wait_transaction_locked); 1081 init_waitqueue_head(&journal->j_wait_transaction_locked);
1065 init_waitqueue_head(&journal->j_wait_logspace);
1066 init_waitqueue_head(&journal->j_wait_done_commit); 1082 init_waitqueue_head(&journal->j_wait_done_commit);
1067 init_waitqueue_head(&journal->j_wait_checkpoint);
1068 init_waitqueue_head(&journal->j_wait_commit); 1083 init_waitqueue_head(&journal->j_wait_commit);
1069 init_waitqueue_head(&journal->j_wait_updates); 1084 init_waitqueue_head(&journal->j_wait_updates);
1085 init_waitqueue_head(&journal->j_wait_reserved);
1070 mutex_init(&journal->j_barrier); 1086 mutex_init(&journal->j_barrier);
1071 mutex_init(&journal->j_checkpoint_mutex); 1087 mutex_init(&journal->j_checkpoint_mutex);
1072 spin_lock_init(&journal->j_revoke_lock); 1088 spin_lock_init(&journal->j_revoke_lock);
@@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void)
1076 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 1092 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
1077 journal->j_min_batch_time = 0; 1093 journal->j_min_batch_time = 0;
1078 journal->j_max_batch_time = 15000; /* 15ms */ 1094 journal->j_max_batch_time = 15000; /* 15ms */
1095 atomic_set(&journal->j_reserved_credits, 0);
1079 1096
1080 /* The journal is marked for error until we succeed with recovery! */ 1097 /* The journal is marked for error until we succeed with recovery! */
1081 journal->j_flags = JBD2_ABORT; 1098 journal->j_flags = JBD2_ABORT;
@@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal)
1318static void jbd2_write_superblock(journal_t *journal, int write_op) 1335static void jbd2_write_superblock(journal_t *journal, int write_op)
1319{ 1336{
1320 struct buffer_head *bh = journal->j_sb_buffer; 1337 struct buffer_head *bh = journal->j_sb_buffer;
1338 journal_superblock_t *sb = journal->j_superblock;
1321 int ret; 1339 int ret;
1322 1340
1323 trace_jbd2_write_superblock(journal, write_op); 1341 trace_jbd2_write_superblock(journal, write_op);
@@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
1339 clear_buffer_write_io_error(bh); 1357 clear_buffer_write_io_error(bh);
1340 set_buffer_uptodate(bh); 1358 set_buffer_uptodate(bh);
1341 } 1359 }
1360 jbd2_superblock_csum_set(journal, sb);
1342 get_bh(bh); 1361 get_bh(bh);
1343 bh->b_end_io = end_buffer_write_sync; 1362 bh->b_end_io = end_buffer_write_sync;
1344 ret = submit_bh(write_op, bh); 1363 ret = submit_bh(write_op, bh);
@@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
1435 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", 1454 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1436 journal->j_errno); 1455 journal->j_errno);
1437 sb->s_errno = cpu_to_be32(journal->j_errno); 1456 sb->s_errno = cpu_to_be32(journal->j_errno);
1438 jbd2_superblock_csum_set(journal, sb);
1439 read_unlock(&journal->j_state_lock); 1457 read_unlock(&journal->j_state_lock);
1440 1458
1441 jbd2_write_superblock(journal, WRITE_SYNC); 1459 jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void)
2325#ifdef CONFIG_JBD2_DEBUG 2343#ifdef CONFIG_JBD2_DEBUG
2326 atomic_inc(&nr_journal_heads); 2344 atomic_inc(&nr_journal_heads);
2327#endif 2345#endif
2328 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2346 ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2329 if (!ret) { 2347 if (!ret) {
2330 jbd_debug(1, "out of memory for journal_head\n"); 2348 jbd_debug(1, "out of memory for journal_head\n");
2331 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); 2349 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
2332 while (!ret) { 2350 while (!ret) {
2333 yield(); 2351 yield();
2334 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2352 ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2335 } 2353 }
2336 } 2354 }
2337 return ret; 2355 return ret;
@@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
2393 struct journal_head *new_jh = NULL; 2411 struct journal_head *new_jh = NULL;
2394 2412
2395repeat: 2413repeat:
2396 if (!buffer_jbd(bh)) { 2414 if (!buffer_jbd(bh))
2397 new_jh = journal_alloc_journal_head(); 2415 new_jh = journal_alloc_journal_head();
2398 memset(new_jh, 0, sizeof(*new_jh));
2399 }
2400 2416
2401 jbd_lock_bh_journal_head(bh); 2417 jbd_lock_bh_journal_head(bh);
2402 if (buffer_jbd(bh)) { 2418 if (buffer_jbd(bh)) {
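Switching to kmem_cache_zalloc() folds the separate memset() (deleted from jbd2_journal_add_journal_head() above) into the allocation. The userspace equivalent of the simplification:

#include <stdlib.h>
#include <string.h>

/* Before: allocate, then zero at the call site. */
static void *alloc_then_zero(size_t size)
{
	void *p = malloc(size);		/* kmem_cache_alloc() */

	if (p)
		memset(p, 0, size);	/* the memset the patch removes */
	return p;
}

/* After: one call that returns zeroed memory, like kmem_cache_zalloc(). */
static void *alloc_zeroed(size_t size)
{
	return calloc(1, size);
}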
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846bac32f..d4851464b57e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
400 void *buf, __u32 sequence) 400 void *buf, __u32 sequence)
401{ 401{
402 __u32 provided, calculated; 402 __u32 csum32;
403 403
404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
405 return 1; 405 return 1;
406 406
407 sequence = cpu_to_be32(sequence); 407 sequence = cpu_to_be32(sequence);
408 calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 408 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
409 sizeof(sequence)); 409 sizeof(sequence));
410 calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); 410 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
411 provided = be32_to_cpu(tag->t_checksum);
412 411
413 return provided == cpu_to_be32(calculated); 412 return tag->t_checksum == cpu_to_be16(csum32);
414} 413}
415 414
416static int do_one_pass(journal_t *journal, 415static int do_one_pass(journal_t *journal,
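The tag checksum is now stored truncated: the writer keeps only the low 16 bits of the crc32c (t_checksum becomes a __be16), and recovery recomputes the full 32-bit value and compares the low half. A runnable sketch of that truncate-and-compare, with a stand-in checksum function (byte-order handling via cpu_to_be16() is elided):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for jbd2_chksum(); any 32-bit checksum illustrates the point. */
static uint32_t toy_csum(uint32_t seed, const void *buf, size_t len)
{
	const unsigned char *p = buf;
	uint32_t c = seed;

	while (len--)
		c = (c << 5) + c + *p++;
	return c;
}

int main(void)
{
	unsigned char block[32];
	uint32_t csum32;
	uint16_t t_checksum;

	memset(block, 0xab, sizeof(block));
	csum32 = toy_csum(0, block, sizeof(block));
	t_checksum = (uint16_t)csum32;	/* writer: keep the low 16 bits */

	/* reader: recompute the full value, compare the low half */
	printf("tag %s\n",
	       t_checksum == (uint16_t)toy_csum(0, block, sizeof(block)) ?
	       "verified" : "corrupt");
	return 0;
}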
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b4ce8b..198c9c10276d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
122 122
123#ifdef __KERNEL__ 123#ifdef __KERNEL__
124static void write_one_revoke_record(journal_t *, transaction_t *, 124static void write_one_revoke_record(journal_t *, transaction_t *,
125 struct journal_head **, int *, 125 struct list_head *,
126 struct buffer_head **, int *,
126 struct jbd2_revoke_record_s *, int); 127 struct jbd2_revoke_record_s *, int);
127static void flush_descriptor(journal_t *, struct journal_head *, int, int); 128static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
128#endif 129#endif
129 130
130/* Utility functions to maintain the revoke table */ 131/* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
531 */ 532 */
532void jbd2_journal_write_revoke_records(journal_t *journal, 533void jbd2_journal_write_revoke_records(journal_t *journal,
533 transaction_t *transaction, 534 transaction_t *transaction,
535 struct list_head *log_bufs,
534 int write_op) 536 int write_op)
535{ 537{
536 struct journal_head *descriptor; 538 struct buffer_head *descriptor;
537 struct jbd2_revoke_record_s *record; 539 struct jbd2_revoke_record_s *record;
538 struct jbd2_revoke_table_s *revoke; 540 struct jbd2_revoke_table_s *revoke;
539 struct list_head *hash_list; 541 struct list_head *hash_list;
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
553 while (!list_empty(hash_list)) { 555 while (!list_empty(hash_list)) {
554 record = (struct jbd2_revoke_record_s *) 556 record = (struct jbd2_revoke_record_s *)
555 hash_list->next; 557 hash_list->next;
556 write_one_revoke_record(journal, transaction, 558 write_one_revoke_record(journal, transaction, log_bufs,
557 &descriptor, &offset, 559 &descriptor, &offset,
558 record, write_op); 560 record, write_op);
559 count++; 561 count++;
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
573 575
574static void write_one_revoke_record(journal_t *journal, 576static void write_one_revoke_record(journal_t *journal,
575 transaction_t *transaction, 577 transaction_t *transaction,
576 struct journal_head **descriptorp, 578 struct list_head *log_bufs,
579 struct buffer_head **descriptorp,
577 int *offsetp, 580 int *offsetp,
578 struct jbd2_revoke_record_s *record, 581 struct jbd2_revoke_record_s *record,
579 int write_op) 582 int write_op)
580{ 583{
581 int csum_size = 0; 584 int csum_size = 0;
582 struct journal_head *descriptor; 585 struct buffer_head *descriptor;
583 int offset; 586 int offset;
584 journal_header_t *header; 587 journal_header_t *header;
585 588
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
609 descriptor = jbd2_journal_get_descriptor_buffer(journal); 612 descriptor = jbd2_journal_get_descriptor_buffer(journal);
610 if (!descriptor) 613 if (!descriptor)
611 return; 614 return;
612 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; 615 header = (journal_header_t *)descriptor->b_data;
613 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 616 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
614 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); 617 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
615 header->h_sequence = cpu_to_be32(transaction->t_tid); 618 header->h_sequence = cpu_to_be32(transaction->t_tid);
616 619
617 /* Record it so that we can wait for IO completion later */ 620 /* Record it so that we can wait for IO completion later */
618 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); 621 BUFFER_TRACE(descriptor, "file in log_bufs");
619 jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); 622 jbd2_file_log_bh(log_bufs, descriptor);
620 623
621 offset = sizeof(jbd2_journal_revoke_header_t); 624 offset = sizeof(jbd2_journal_revoke_header_t);
622 *descriptorp = descriptor; 625 *descriptorp = descriptor;
623 } 626 }
624 627
625 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { 628 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
626 * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = 629 * ((__be64 *)(&descriptor->b_data[offset])) =
627 cpu_to_be64(record->blocknr); 630 cpu_to_be64(record->blocknr);
628 offset += 8; 631 offset += 8;
629 632
630 } else { 633 } else {
631 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = 634 * ((__be32 *)(&descriptor->b_data[offset])) =
632 cpu_to_be32(record->blocknr); 635 cpu_to_be32(record->blocknr);
633 offset += 4; 636 offset += 4;
634 } 637 }
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal,
636 *offsetp = offset; 639 *offsetp = offset;
637} 640}
638 641
639static void jbd2_revoke_csum_set(journal_t *j, 642static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
640 struct journal_head *descriptor)
641{ 643{
642 struct jbd2_journal_revoke_tail *tail; 644 struct jbd2_journal_revoke_tail *tail;
643 __u32 csum; 645 __u32 csum;
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j,
645 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 647 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
646 return; 648 return;
647 649
648 tail = (struct jbd2_journal_revoke_tail *) 650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
649 (jh2bh(descriptor)->b_data + j->j_blocksize -
650 sizeof(struct jbd2_journal_revoke_tail)); 651 sizeof(struct jbd2_journal_revoke_tail));
651 tail->r_checksum = 0; 652 tail->r_checksum = 0;
652 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 653 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
653 j->j_blocksize);
654 tail->r_checksum = cpu_to_be32(csum); 654 tail->r_checksum = cpu_to_be32(csum);
655} 655}
656 656
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j,
662 */ 662 */
663 663
664static void flush_descriptor(journal_t *journal, 664static void flush_descriptor(journal_t *journal,
665 struct journal_head *descriptor, 665 struct buffer_head *descriptor,
666 int offset, int write_op) 666 int offset, int write_op)
667{ 667{
668 jbd2_journal_revoke_header_t *header; 668 jbd2_journal_revoke_header_t *header;
669 struct buffer_head *bh = jh2bh(descriptor);
670 669
671 if (is_journal_aborted(journal)) { 670 if (is_journal_aborted(journal)) {
672 put_bh(bh); 671 put_bh(descriptor);
673 return; 672 return;
674 } 673 }
675 674
676 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; 675 header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
677 header->r_count = cpu_to_be32(offset); 676 header->r_count = cpu_to_be32(offset);
678 jbd2_revoke_csum_set(journal, descriptor); 677 jbd2_revoke_csum_set(journal, descriptor);
679 678
680 set_buffer_jwrite(bh); 679 set_buffer_jwrite(descriptor);
681 BUFFER_TRACE(bh, "write"); 680 BUFFER_TRACE(descriptor, "write");
682 set_buffer_dirty(bh); 681 set_buffer_dirty(descriptor);
683 write_dirty_buffer(bh, write_op); 682 write_dirty_buffer(descriptor, write_op);
684} 683}
685#endif 684#endif
686 685
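A user-space sketch of the record layout written by write_one_revoke_record() above: revoked block numbers are appended big-endian after the revoke header, 4 or 8 bytes each. htonl() stands in for the kernel's cpu_to_be32(), and the header size is invented for the demo:

    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    /* Append one 32-bit revoked block number; returns the new offset,
     * which the caller stores back through *offsetp in the real code. */
    static size_t pack_revoke32(uint8_t *data, size_t offset, uint32_t blocknr)
    {
            uint32_t be = htonl(blocknr);

            memcpy(data + offset, &be, sizeof(be));
            return offset + 4;
    }

    int main(void)
    {
            uint8_t block[64] = { 0 };
            size_t off = 16;                /* pretend revoke header size */

            off = pack_revoke32(block, off, 12345);
            return off == 20 ? 0 : 1;
    }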
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c59ea8..7aa9a32573bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
89 transaction->t_expires = jiffies + journal->j_commit_interval; 89 transaction->t_expires = jiffies + journal->j_commit_interval;
90 spin_lock_init(&transaction->t_handle_lock); 90 spin_lock_init(&transaction->t_handle_lock);
91 atomic_set(&transaction->t_updates, 0); 91 atomic_set(&transaction->t_updates, 0);
92 atomic_set(&transaction->t_outstanding_credits, 0); 92 atomic_set(&transaction->t_outstanding_credits,
93 atomic_read(&journal->j_reserved_credits));
93 atomic_set(&transaction->t_handle_count, 0); 94 atomic_set(&transaction->t_handle_count, 0);
94 INIT_LIST_HEAD(&transaction->t_inode_list); 95 INIT_LIST_HEAD(&transaction->t_inode_list);
95 INIT_LIST_HEAD(&transaction->t_private_list); 96 INIT_LIST_HEAD(&transaction->t_private_list);
@@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
141} 142}
142 143
143/* 144/*
 145 * Wait until the running transaction passes T_LOCKED state. Also starts the
 146 * commit if needed. The function expects the running transaction to exist and
 147 * releases j_state_lock.
148 */
149static void wait_transaction_locked(journal_t *journal)
150 __releases(journal->j_state_lock)
151{
152 DEFINE_WAIT(wait);
153 int need_to_start;
154 tid_t tid = journal->j_running_transaction->t_tid;
155
156 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
157 TASK_UNINTERRUPTIBLE);
158 need_to_start = !tid_geq(journal->j_commit_request, tid);
159 read_unlock(&journal->j_state_lock);
160 if (need_to_start)
161 jbd2_log_start_commit(journal, tid);
162 schedule();
163 finish_wait(&journal->j_wait_transaction_locked, &wait);
164}
165
166static void sub_reserved_credits(journal_t *journal, int blocks)
167{
168 atomic_sub(blocks, &journal->j_reserved_credits);
169 wake_up(&journal->j_wait_reserved);
170}
171
172/*
 173 * Wait until we can add credits for the handle to the running transaction.
 174 * Called with j_state_lock held for reading. Returns 0 if the handle joined
 175 * the running transaction. Returns 1 if we had to wait; j_state_lock is then
 176 * dropped and the caller must retry.
177 */
178static int add_transaction_credits(journal_t *journal, int blocks,
179 int rsv_blocks)
180{
181 transaction_t *t = journal->j_running_transaction;
182 int needed;
183 int total = blocks + rsv_blocks;
184
185 /*
186 * If the current transaction is locked down for commit, wait
187 * for the lock to be released.
188 */
189 if (t->t_state == T_LOCKED) {
190 wait_transaction_locked(journal);
191 return 1;
192 }
193
194 /*
195 * If there is not enough space left in the log to write all
196 * potential buffers requested by this operation, we need to
197 * stall pending a log checkpoint to free some more log space.
198 */
199 needed = atomic_add_return(total, &t->t_outstanding_credits);
200 if (needed > journal->j_max_transaction_buffers) {
201 /*
202 * If the current transaction is already too large,
203 * then start to commit it: we can then go back and
204 * attach this handle to a new transaction.
205 */
206 atomic_sub(total, &t->t_outstanding_credits);
207 wait_transaction_locked(journal);
208 return 1;
209 }
210
211 /*
212 * The commit code assumes that it can get enough log space
213 * without forcing a checkpoint. This is *critical* for
214 * correctness: a checkpoint of a buffer which is also
215 * associated with a committing transaction creates a deadlock,
216 * so commit simply cannot force through checkpoints.
217 *
218 * We must therefore ensure the necessary space in the journal
219 * *before* starting to dirty potentially checkpointed buffers
220 * in the new transaction.
221 */
222 if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
223 atomic_sub(total, &t->t_outstanding_credits);
224 read_unlock(&journal->j_state_lock);
225 write_lock(&journal->j_state_lock);
226 if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
227 __jbd2_log_wait_for_space(journal);
228 write_unlock(&journal->j_state_lock);
229 return 1;
230 }
231
232 /* No reservation? We are done... */
233 if (!rsv_blocks)
234 return 0;
235
236 needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
237 /* We allow at most half of a transaction to be reserved */
238 if (needed > journal->j_max_transaction_buffers / 2) {
239 sub_reserved_credits(journal, rsv_blocks);
240 atomic_sub(total, &t->t_outstanding_credits);
241 read_unlock(&journal->j_state_lock);
242 wait_event(journal->j_wait_reserved,
243 atomic_read(&journal->j_reserved_credits) + rsv_blocks
244 <= journal->j_max_transaction_buffers / 2);
245 return 1;
246 }
247 return 0;
248}
249
250/*
144 * start_this_handle: Given a handle, deal with any locking or stalling 251 * start_this_handle: Given a handle, deal with any locking or stalling
145 * needed to make sure that there is enough journal space for the handle 252 * needed to make sure that there is enough journal space for the handle
146 * to begin. Attach the handle to a transaction and set up the 253 * to begin. Attach the handle to a transaction and set up the
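The last check in add_transaction_credits() caps reserved credits at half of the maximum transaction size. A small user-space model of that invariant (the numbers are arbitrary; the kernel tracks the running total in journal->j_reserved_credits):

    #include <stdio.h>

    /* Model of the reservation cap: reserved credits, including the new
     * request, may never exceed half of the maximum transaction size. */
    static int can_reserve(int reserved, int rsv_blocks, int max_buffers)
    {
            return reserved + rsv_blocks <= max_buffers / 2;
    }

    int main(void)
    {
            printf("%d\n", can_reserve(90, 30, 256));   /* 1: 120 <= 128 */
            printf("%d\n", can_reserve(110, 30, 256));  /* 0: 140 > 128 */
            return 0;
    }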
@@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
151 gfp_t gfp_mask) 258 gfp_t gfp_mask)
152{ 259{
153 transaction_t *transaction, *new_transaction = NULL; 260 transaction_t *transaction, *new_transaction = NULL;
154 tid_t tid; 261 int blocks = handle->h_buffer_credits;
155 int needed, need_to_start; 262 int rsv_blocks = 0;
156 int nblocks = handle->h_buffer_credits;
157 unsigned long ts = jiffies; 263 unsigned long ts = jiffies;
158 264
159 if (nblocks > journal->j_max_transaction_buffers) { 265 /*
 266 * 1/2 of a transaction can be reserved, so in practice we can handle
 267 * only 1/2 of the maximum transaction size per operation
268 */
269 if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
160 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", 270 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
161 current->comm, nblocks, 271 current->comm, blocks,
162 journal->j_max_transaction_buffers); 272 journal->j_max_transaction_buffers / 2);
163 return -ENOSPC; 273 return -ENOSPC;
164 } 274 }
165 275
276 if (handle->h_rsv_handle)
277 rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
278
166alloc_transaction: 279alloc_transaction:
167 if (!journal->j_running_transaction) { 280 if (!journal->j_running_transaction) {
168 new_transaction = kmem_cache_zalloc(transaction_cache, 281 new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -199,8 +312,12 @@ repeat:
199 return -EROFS; 312 return -EROFS;
200 } 313 }
201 314
202 /* Wait on the journal's transaction barrier if necessary */ 315 /*
 203 if (journal->j_barrier_count) { 316 * Wait on the journal's transaction barrier if necessary. Specifically,
317 * we allow reserved handles to proceed because otherwise commit could
318 * deadlock on page writeback not being able to complete.
319 */
320 if (!handle->h_reserved && journal->j_barrier_count) {
204 read_unlock(&journal->j_state_lock); 321 read_unlock(&journal->j_state_lock);
205 wait_event(journal->j_wait_transaction_locked, 322 wait_event(journal->j_wait_transaction_locked,
206 journal->j_barrier_count == 0); 323 journal->j_barrier_count == 0);
@@ -213,7 +330,7 @@ repeat:
213 goto alloc_transaction; 330 goto alloc_transaction;
214 write_lock(&journal->j_state_lock); 331 write_lock(&journal->j_state_lock);
215 if (!journal->j_running_transaction && 332 if (!journal->j_running_transaction &&
216 !journal->j_barrier_count) { 333 (handle->h_reserved || !journal->j_barrier_count)) {
217 jbd2_get_transaction(journal, new_transaction); 334 jbd2_get_transaction(journal, new_transaction);
218 new_transaction = NULL; 335 new_transaction = NULL;
219 } 336 }
@@ -223,85 +340,18 @@ repeat:
223 340
224 transaction = journal->j_running_transaction; 341 transaction = journal->j_running_transaction;
225 342
226 /* 343 if (!handle->h_reserved) {
227 * If the current transaction is locked down for commit, wait for the 344 /* We may have dropped j_state_lock - restart in that case */
228 * lock to be released. 345 if (add_transaction_credits(journal, blocks, rsv_blocks))
229 */ 346 goto repeat;
230 if (transaction->t_state == T_LOCKED) { 347 } else {
231 DEFINE_WAIT(wait);
232
233 prepare_to_wait(&journal->j_wait_transaction_locked,
234 &wait, TASK_UNINTERRUPTIBLE);
235 read_unlock(&journal->j_state_lock);
236 schedule();
237 finish_wait(&journal->j_wait_transaction_locked, &wait);
238 goto repeat;
239 }
240
241 /*
242 * If there is not enough space left in the log to write all potential
243 * buffers requested by this operation, we need to stall pending a log
244 * checkpoint to free some more log space.
245 */
246 needed = atomic_add_return(nblocks,
247 &transaction->t_outstanding_credits);
248
249 if (needed > journal->j_max_transaction_buffers) {
250 /* 348 /*
 251 * If the current transaction is already too large, then start 349 * We have a reserved handle so we are allowed to join the T_LOCKED
 252 * to commit it: we can then go back and attach this handle to 350 * transaction and we don't have to check for transaction size
 253 * a new transaction. 351 * or journal space.
254 */ 352 */
255 DEFINE_WAIT(wait); 353 sub_reserved_credits(journal, blocks);
256 354 handle->h_reserved = 0;
257 jbd_debug(2, "Handle %p starting new commit...\n", handle);
258 atomic_sub(nblocks, &transaction->t_outstanding_credits);
259 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
260 TASK_UNINTERRUPTIBLE);
261 tid = transaction->t_tid;
262 need_to_start = !tid_geq(journal->j_commit_request, tid);
263 read_unlock(&journal->j_state_lock);
264 if (need_to_start)
265 jbd2_log_start_commit(journal, tid);
266 schedule();
267 finish_wait(&journal->j_wait_transaction_locked, &wait);
268 goto repeat;
269 }
270
271 /*
272 * The commit code assumes that it can get enough log space
273 * without forcing a checkpoint. This is *critical* for
274 * correctness: a checkpoint of a buffer which is also
275 * associated with a committing transaction creates a deadlock,
276 * so commit simply cannot force through checkpoints.
277 *
278 * We must therefore ensure the necessary space in the journal
279 * *before* starting to dirty potentially checkpointed buffers
280 * in the new transaction.
281 *
282 * The worst part is, any transaction currently committing can
283 * reduce the free space arbitrarily. Be careful to account for
284 * those buffers when checkpointing.
285 */
286
287 /*
288 * @@@ AKPM: This seems rather over-defensive. We're giving commit
289 * a _lot_ of headroom: 1/4 of the journal plus the size of
290 * the committing transaction. Really, we only need to give it
291 * committing_transaction->t_outstanding_credits plus "enough" for
292 * the log control blocks.
293 * Also, this test is inconsistent with the matching one in
294 * jbd2_journal_extend().
295 */
296 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
297 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
298 atomic_sub(nblocks, &transaction->t_outstanding_credits);
299 read_unlock(&journal->j_state_lock);
300 write_lock(&journal->j_state_lock);
301 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
302 __jbd2_log_wait_for_space(journal);
303 write_unlock(&journal->j_state_lock);
304 goto repeat;
305 } 355 }
306 356
307 /* OK, account for the buffers that this operation expects to 357 /* OK, account for the buffers that this operation expects to
@@ -309,15 +359,16 @@ repeat:
309 */ 359 */
310 update_t_max_wait(transaction, ts); 360 update_t_max_wait(transaction, ts);
311 handle->h_transaction = transaction; 361 handle->h_transaction = transaction;
312 handle->h_requested_credits = nblocks; 362 handle->h_requested_credits = blocks;
313 handle->h_start_jiffies = jiffies; 363 handle->h_start_jiffies = jiffies;
314 atomic_inc(&transaction->t_updates); 364 atomic_inc(&transaction->t_updates);
315 atomic_inc(&transaction->t_handle_count); 365 atomic_inc(&transaction->t_handle_count);
316 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", 366 jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
317 handle, nblocks, 367 handle, blocks,
318 atomic_read(&transaction->t_outstanding_credits), 368 atomic_read(&transaction->t_outstanding_credits),
319 __jbd2_log_space_left(journal)); 369 jbd2_log_space_left(journal));
320 read_unlock(&journal->j_state_lock); 370 read_unlock(&journal->j_state_lock);
371 current->journal_info = handle;
321 372
322 lock_map_acquire(&handle->h_lockdep_map); 373 lock_map_acquire(&handle->h_lockdep_map);
323 jbd2_journal_free_transaction(new_transaction); 374 jbd2_journal_free_transaction(new_transaction);
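Moving the current->journal_info assignment into start_this_handle() means both normal and reserved starts record the task's live handle. The accessor that relies on this invariant is, as in include/linux/jbd2.h:

    static inline handle_t *journal_current_handle(void)
    {
            return current->journal_info;   /* NULL when no handle is live */
    }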
@@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks)
348 * 399 *
349 * We make sure that the transaction can guarantee at least nblocks of 400 * We make sure that the transaction can guarantee at least nblocks of
350 * modified buffers in the log. We block until the log can guarantee 401 * modified buffers in the log. We block until the log can guarantee
351 * that much space. 402 * that much space. Additionally, if rsv_blocks > 0, we also create another
352 * 403 * handle with rsv_blocks reserved blocks in the journal. This handle is
 353 * This function is visible to journal users (like ext3fs), so is not 404 * stored in h_rsv_handle. It is not attached to any particular transaction
354 * called with the journal already locked. 405 * and thus doesn't block transaction commit. If the caller uses this reserved
406 * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
 407 * on the parent handle will dispose of the reserved one. A reserved handle
 408 * has to be converted to a normal handle using jbd2_journal_start_reserved()
 409 * before it can be used.
355 * 410 *
356 * Return a pointer to a newly allocated handle, or an ERR_PTR() value 411 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
357 * on failure. 412 * on failure.
358 */ 413 */
359handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, 414handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
360 unsigned int type, unsigned int line_no) 415 gfp_t gfp_mask, unsigned int type,
416 unsigned int line_no)
361{ 417{
362 handle_t *handle = journal_current_handle(); 418 handle_t *handle = journal_current_handle();
363 int err; 419 int err;
@@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
374 handle = new_handle(nblocks); 430 handle = new_handle(nblocks);
375 if (!handle) 431 if (!handle)
376 return ERR_PTR(-ENOMEM); 432 return ERR_PTR(-ENOMEM);
433 if (rsv_blocks) {
434 handle_t *rsv_handle;
377 435
378 current->journal_info = handle; 436 rsv_handle = new_handle(rsv_blocks);
437 if (!rsv_handle) {
438 jbd2_free_handle(handle);
439 return ERR_PTR(-ENOMEM);
440 }
441 rsv_handle->h_reserved = 1;
442 rsv_handle->h_journal = journal;
443 handle->h_rsv_handle = rsv_handle;
444 }
379 445
380 err = start_this_handle(journal, handle, gfp_mask); 446 err = start_this_handle(journal, handle, gfp_mask);
381 if (err < 0) { 447 if (err < 0) {
448 if (handle->h_rsv_handle)
449 jbd2_free_handle(handle->h_rsv_handle);
382 jbd2_free_handle(handle); 450 jbd2_free_handle(handle);
383 current->journal_info = NULL;
384 return ERR_PTR(err); 451 return ERR_PTR(err);
385 } 452 }
386 handle->h_type = type; 453 handle->h_type = type;
@@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start);
395 462
396handle_t *jbd2_journal_start(journal_t *journal, int nblocks) 463handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
397{ 464{
398 return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); 465 return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
399} 466}
400EXPORT_SYMBOL(jbd2_journal_start); 467EXPORT_SYMBOL(jbd2_journal_start);
401 468
469void jbd2_journal_free_reserved(handle_t *handle)
470{
471 journal_t *journal = handle->h_journal;
472
473 WARN_ON(!handle->h_reserved);
474 sub_reserved_credits(journal, handle->h_buffer_credits);
475 jbd2_free_handle(handle);
476}
477EXPORT_SYMBOL(jbd2_journal_free_reserved);
478
479/**
480 * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
481 * @handle: handle to start
482 *
 483 * Start a handle that has been previously reserved via jbd2__journal_start().
484 * This attaches @handle to the running transaction (or creates one if there's
 485 * no transaction running). Unlike jbd2_journal_start() this function cannot
 486 * block on journal commit, checkpointing, or similar operations. It can block
 487 * on memory allocation or a frozen journal, though.
488 *
489 * Return 0 on success, non-zero on error - handle is freed in that case.
490 */
491int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
492 unsigned int line_no)
493{
494 journal_t *journal = handle->h_journal;
495 int ret = -EIO;
496
497 if (WARN_ON(!handle->h_reserved)) {
 498 /* Someone passed in a normal handle? Just stop it. */
499 jbd2_journal_stop(handle);
500 return ret;
501 }
502 /*
 503 * The usefulness of mixing reserved and unreserved handles is
 504 * questionable. So far nobody seems to need it, so just error out.
505 */
506 if (WARN_ON(current->journal_info)) {
507 jbd2_journal_free_reserved(handle);
508 return ret;
509 }
510
511 handle->h_journal = NULL;
512 /*
513 * GFP_NOFS is here because callers are likely from writeback or
 514 * similarly constrained call sites.
515 */
516 ret = start_this_handle(journal, handle, GFP_NOFS);
517 if (ret < 0)
518 jbd2_journal_free_reserved(handle);
519 handle->h_type = type;
520 handle->h_line_no = line_no;
521 return ret;
522}
523EXPORT_SYMBOL(jbd2_journal_start_reserved);
402 524
403/** 525/**
404 * int jbd2_journal_extend() - extend buffer credits. 526 * int jbd2_journal_extend() - extend buffer credits.
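A hedged usage sketch of the reserved-handle flow introduced above; the function, the credit counts, and the handle type (0) are invented for illustration, and only the jbd2 calls come from this patch:

    static int example_reserved_flow(journal_t *journal)
    {
            handle_t *handle, *rsv;
            int err;

            /* Start an 8-credit handle and reserve 4 more for later. */
            handle = jbd2__journal_start(journal, 8, 4, GFP_NOFS, 0, 0);
            if (IS_ERR(handle))
                    return PTR_ERR(handle);

            /* ... modify metadata under "handle" ... */

            /* Detach the reserved handle so jbd2_journal_stop() below
             * does not dispose of it. */
            rsv = handle->h_rsv_handle;
            handle->h_rsv_handle = NULL;
            err = jbd2_journal_stop(handle);
            if (err)
                    return err;

            /* Later, e.g. from writeback: bind the reserved handle to
             * the running transaction; it cannot block on commit. */
            err = jbd2_journal_start_reserved(rsv, 0, 0);
            if (err)
                    return err;     /* rsv is already freed on error */

            /* ... do the deferred work under "rsv" ... */
            return jbd2_journal_stop(rsv);
    }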
@@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
423int jbd2_journal_extend(handle_t *handle, int nblocks) 545int jbd2_journal_extend(handle_t *handle, int nblocks)
424{ 546{
425 transaction_t *transaction = handle->h_transaction; 547 transaction_t *transaction = handle->h_transaction;
426 journal_t *journal = transaction->t_journal; 548 journal_t *journal;
427 int result; 549 int result;
428 int wanted; 550 int wanted;
429 551
430 result = -EIO; 552 WARN_ON(!transaction);
431 if (is_handle_aborted(handle)) 553 if (is_handle_aborted(handle))
432 goto out; 554 return -EROFS;
555 journal = transaction->t_journal;
433 556
434 result = 1; 557 result = 1;
435 558
436 read_lock(&journal->j_state_lock); 559 read_lock(&journal->j_state_lock);
437 560
438 /* Don't extend a locked-down transaction! */ 561 /* Don't extend a locked-down transaction! */
439 if (handle->h_transaction->t_state != T_RUNNING) { 562 if (transaction->t_state != T_RUNNING) {
440 jbd_debug(3, "denied handle %p %d blocks: " 563 jbd_debug(3, "denied handle %p %d blocks: "
441 "transaction not running\n", handle, nblocks); 564 "transaction not running\n", handle, nblocks);
442 goto error_out; 565 goto error_out;
443 } 566 }
444 567
445 spin_lock(&transaction->t_handle_lock); 568 spin_lock(&transaction->t_handle_lock);
446 wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; 569 wanted = atomic_add_return(nblocks,
570 &transaction->t_outstanding_credits);
447 571
448 if (wanted > journal->j_max_transaction_buffers) { 572 if (wanted > journal->j_max_transaction_buffers) {
449 jbd_debug(3, "denied handle %p %d blocks: " 573 jbd_debug(3, "denied handle %p %d blocks: "
450 "transaction too large\n", handle, nblocks); 574 "transaction too large\n", handle, nblocks);
575 atomic_sub(nblocks, &transaction->t_outstanding_credits);
451 goto unlock; 576 goto unlock;
452 } 577 }
453 578
454 if (wanted > __jbd2_log_space_left(journal)) { 579 if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
580 jbd2_log_space_left(journal)) {
455 jbd_debug(3, "denied handle %p %d blocks: " 581 jbd_debug(3, "denied handle %p %d blocks: "
456 "insufficient log space\n", handle, nblocks); 582 "insufficient log space\n", handle, nblocks);
583 atomic_sub(nblocks, &transaction->t_outstanding_credits);
457 goto unlock; 584 goto unlock;
458 } 585 }
459 586
460 trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, 587 trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
461 handle->h_transaction->t_tid, 588 transaction->t_tid,
462 handle->h_type, handle->h_line_no, 589 handle->h_type, handle->h_line_no,
463 handle->h_buffer_credits, 590 handle->h_buffer_credits,
464 nblocks); 591 nblocks);
465 592
466 handle->h_buffer_credits += nblocks; 593 handle->h_buffer_credits += nblocks;
467 handle->h_requested_credits += nblocks; 594 handle->h_requested_credits += nblocks;
468 atomic_add(nblocks, &transaction->t_outstanding_credits);
469 result = 0; 595 result = 0;
470 596
471 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); 597 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
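The extend path now reserves headroom for journal control blocks on top of the requested data blocks. A user-space model of that check; the shift value is an assumption standing in for JBD2_CONTROL_BLOCKS_SHIFT:

    #include <stdio.h>

    #define CONTROL_BLOCKS_SHIFT 5  /* assumed value for the demo */

    /* Wanted data blocks plus a control-block estimate must fit. */
    static int fits_in_log(unsigned int wanted, unsigned int space_left)
    {
            return wanted + (wanted >> CONTROL_BLOCKS_SHIFT) <= space_left;
    }

    int main(void)
    {
            printf("%d\n", fits_in_log(64, 66));    /* 1: 64 + 2 <= 66 */
            printf("%d\n", fits_in_log(64, 65));    /* 0: 64 + 2 > 65 */
            return 0;
    }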
@@ -473,7 +599,6 @@ unlock:
473 spin_unlock(&transaction->t_handle_lock); 599 spin_unlock(&transaction->t_handle_lock);
474error_out: 600error_out:
475 read_unlock(&journal->j_state_lock); 601 read_unlock(&journal->j_state_lock);
476out:
477 return result; 602 return result;
478} 603}
479 604
@@ -490,19 +615,22 @@ out:
490 * to a running handle, a call to jbd2_journal_restart will commit the 615 * to a running handle, a call to jbd2_journal_restart will commit the
491 * handle's transaction so far and reattach the handle to a new 616 * handle's transaction so far and reattach the handle to a new
 492 * transaction capable of guaranteeing the requested number of 617 * transaction capable of guaranteeing the requested number of
 493 * credits. 618 * credits. We preserve the reserved handle if one is attached to the
 619 * passed-in handle.
494 */ 620 */
495int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) 621int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
496{ 622{
497 transaction_t *transaction = handle->h_transaction; 623 transaction_t *transaction = handle->h_transaction;
498 journal_t *journal = transaction->t_journal; 624 journal_t *journal;
499 tid_t tid; 625 tid_t tid;
500 int need_to_start, ret; 626 int need_to_start, ret;
501 627
628 WARN_ON(!transaction);
502 /* If we've had an abort of any type, don't even think about 629 /* If we've had an abort of any type, don't even think about
503 * actually doing the restart! */ 630 * actually doing the restart! */
504 if (is_handle_aborted(handle)) 631 if (is_handle_aborted(handle))
505 return 0; 632 return 0;
633 journal = transaction->t_journal;
506 634
507 /* 635 /*
508 * First unlink the handle from its current transaction, and start the 636 * First unlink the handle from its current transaction, and start the
@@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
515 spin_lock(&transaction->t_handle_lock); 643 spin_lock(&transaction->t_handle_lock);
516 atomic_sub(handle->h_buffer_credits, 644 atomic_sub(handle->h_buffer_credits,
517 &transaction->t_outstanding_credits); 645 &transaction->t_outstanding_credits);
646 if (handle->h_rsv_handle) {
647 sub_reserved_credits(journal,
648 handle->h_rsv_handle->h_buffer_credits);
649 }
518 if (atomic_dec_and_test(&transaction->t_updates)) 650 if (atomic_dec_and_test(&transaction->t_updates))
519 wake_up(&journal->j_wait_updates); 651 wake_up(&journal->j_wait_updates);
652 tid = transaction->t_tid;
520 spin_unlock(&transaction->t_handle_lock); 653 spin_unlock(&transaction->t_handle_lock);
654 handle->h_transaction = NULL;
655 current->journal_info = NULL;
521 656
522 jbd_debug(2, "restarting handle %p\n", handle); 657 jbd_debug(2, "restarting handle %p\n", handle);
523 tid = transaction->t_tid;
524 need_to_start = !tid_geq(journal->j_commit_request, tid); 658 need_to_start = !tid_geq(journal->j_commit_request, tid);
525 read_unlock(&journal->j_state_lock); 659 read_unlock(&journal->j_state_lock);
526 if (need_to_start) 660 if (need_to_start)
@@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
557 write_lock(&journal->j_state_lock); 691 write_lock(&journal->j_state_lock);
558 ++journal->j_barrier_count; 692 ++journal->j_barrier_count;
559 693
694 /* Wait until there are no reserved handles */
695 if (atomic_read(&journal->j_reserved_credits)) {
696 write_unlock(&journal->j_state_lock);
697 wait_event(journal->j_wait_reserved,
698 atomic_read(&journal->j_reserved_credits) == 0);
699 write_lock(&journal->j_state_lock);
700 }
701
560 /* Wait until there are no running updates */ 702 /* Wait until there are no running updates */
561 while (1) { 703 while (1) {
562 transaction_t *transaction = journal->j_running_transaction; 704 transaction_t *transaction = journal->j_running_transaction;
@@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
619 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 761 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
620} 762}
621 763
764static int sleep_on_shadow_bh(void *word)
765{
766 io_schedule();
767 return 0;
768}
769
622/* 770/*
623 * If the buffer is already part of the current transaction, then there 771 * If the buffer is already part of the current transaction, then there
624 * is nothing we need to do. If it is already part of a prior 772 * is nothing we need to do. If it is already part of a prior
@@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
634 int force_copy) 782 int force_copy)
635{ 783{
636 struct buffer_head *bh; 784 struct buffer_head *bh;
637 transaction_t *transaction; 785 transaction_t *transaction = handle->h_transaction;
638 journal_t *journal; 786 journal_t *journal;
639 int error; 787 int error;
640 char *frozen_buffer = NULL; 788 char *frozen_buffer = NULL;
641 int need_copy = 0; 789 int need_copy = 0;
642 unsigned long start_lock, time_lock; 790 unsigned long start_lock, time_lock;
643 791
792 WARN_ON(!transaction);
644 if (is_handle_aborted(handle)) 793 if (is_handle_aborted(handle))
645 return -EROFS; 794 return -EROFS;
646
647 transaction = handle->h_transaction;
648 journal = transaction->t_journal; 795 journal = transaction->t_journal;
649 796
650 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); 797 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -754,41 +901,29 @@ repeat:
754 * journaled. If the primary copy is already going to 901 * journaled. If the primary copy is already going to
755 * disk then we cannot do copy-out here. */ 902 * disk then we cannot do copy-out here. */
756 903
757 if (jh->b_jlist == BJ_Shadow) { 904 if (buffer_shadow(bh)) {
758 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
759 wait_queue_head_t *wqh;
760
761 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
762
763 JBUFFER_TRACE(jh, "on shadow: sleep"); 905 JBUFFER_TRACE(jh, "on shadow: sleep");
764 jbd_unlock_bh_state(bh); 906 jbd_unlock_bh_state(bh);
765 /* commit wakes up all shadow buffers after IO */ 907 wait_on_bit(&bh->b_state, BH_Shadow,
766 for ( ; ; ) { 908 sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
767 prepare_to_wait(wqh, &wait.wait,
768 TASK_UNINTERRUPTIBLE);
769 if (jh->b_jlist != BJ_Shadow)
770 break;
771 schedule();
772 }
773 finish_wait(wqh, &wait.wait);
774 goto repeat; 909 goto repeat;
775 } 910 }
776 911
777 /* Only do the copy if the currently-owning transaction 912 /*
778 * still needs it. If it is on the Forget list, the 913 * Only do the copy if the currently-owning transaction still
779 * committing transaction is past that stage. The 914 * needs it. If buffer isn't on BJ_Metadata list, the
780 * buffer had better remain locked during the kmalloc, 915 * committing transaction is past that stage (here we use the
781 * but that should be true --- we hold the journal lock 916 * fact that BH_Shadow is set under bh_state lock together with
782 * still and the buffer is already on the BUF_JOURNAL 917 * refiling to BJ_Shadow list and at this point we know the
783 * list so won't be flushed. 918 * buffer doesn't have BH_Shadow set).
784 * 919 *
785 * Subtle point, though: if this is a get_undo_access, 920 * Subtle point, though: if this is a get_undo_access,
786 * then we will be relying on the frozen_data to contain 921 * then we will be relying on the frozen_data to contain
787 * the new value of the committed_data record after the 922 * the new value of the committed_data record after the
788 * transaction, so we HAVE to force the frozen_data copy 923 * transaction, so we HAVE to force the frozen_data copy
789 * in that case. */ 924 * in that case.
790 925 */
791 if (jh->b_jlist != BJ_Forget || force_copy) { 926 if (jh->b_jlist == BJ_Metadata || force_copy) {
792 JBUFFER_TRACE(jh, "generate frozen data"); 927 JBUFFER_TRACE(jh, "generate frozen data");
793 if (!frozen_buffer) { 928 if (!frozen_buffer) {
794 JBUFFER_TRACE(jh, "allocate memory for buffer"); 929 JBUFFER_TRACE(jh, "allocate memory for buffer");
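The wait_on_bit() call above replaces the open-coded waitqueue loop. Its contract in this era of the kernel: the action callback sleeps and returns 0 to keep waiting (the bit is then re-checked), or non-zero to abort the wait. A sketch of the pairing, using the names from the patch:

    static int sleep_on_shadow_bh(void *word)
    {
            io_schedule();          /* block until commit-side I/O wakes us */
            return 0;               /* not interrupted: re-check BH_Shadow */
    }

    /* caller side, as in do_get_write_access() above: */
    wait_on_bit(&bh->b_state, BH_Shadow, sleep_on_shadow_bh,
                TASK_UNINTERRUPTIBLE);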
@@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
915int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) 1050int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
916{ 1051{
917 transaction_t *transaction = handle->h_transaction; 1052 transaction_t *transaction = handle->h_transaction;
918 journal_t *journal = transaction->t_journal; 1053 journal_t *journal;
919 struct journal_head *jh = jbd2_journal_add_journal_head(bh); 1054 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
920 int err; 1055 int err;
921 1056
922 jbd_debug(5, "journal_head %p\n", jh); 1057 jbd_debug(5, "journal_head %p\n", jh);
1058 WARN_ON(!transaction);
923 err = -EROFS; 1059 err = -EROFS;
924 if (is_handle_aborted(handle)) 1060 if (is_handle_aborted(handle))
925 goto out; 1061 goto out;
1062 journal = transaction->t_journal;
926 err = 0; 1063 err = 0;
927 1064
928 JBUFFER_TRACE(jh, "entry"); 1065 JBUFFER_TRACE(jh, "entry");
@@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
1128int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) 1265int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1129{ 1266{
1130 transaction_t *transaction = handle->h_transaction; 1267 transaction_t *transaction = handle->h_transaction;
1131 journal_t *journal = transaction->t_journal; 1268 journal_t *journal;
1132 struct journal_head *jh; 1269 struct journal_head *jh;
1133 int ret = 0; 1270 int ret = 0;
1134 1271
1272 WARN_ON(!transaction);
1135 if (is_handle_aborted(handle)) 1273 if (is_handle_aborted(handle))
1136 goto out; 1274 return -EROFS;
1275 journal = transaction->t_journal;
1137 jh = jbd2_journal_grab_journal_head(bh); 1276 jh = jbd2_journal_grab_journal_head(bh);
1138 if (!jh) { 1277 if (!jh) {
1139 ret = -EUCLEAN; 1278 ret = -EUCLEAN;
@@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1227 1366
1228 JBUFFER_TRACE(jh, "file as BJ_Metadata"); 1367 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1229 spin_lock(&journal->j_list_lock); 1368 spin_lock(&journal->j_list_lock);
1230 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); 1369 __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
1231 spin_unlock(&journal->j_list_lock); 1370 spin_unlock(&journal->j_list_lock);
1232out_unlock_bh: 1371out_unlock_bh:
1233 jbd_unlock_bh_state(bh); 1372 jbd_unlock_bh_state(bh);
@@ -1258,12 +1397,17 @@ out:
1258int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) 1397int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1259{ 1398{
1260 transaction_t *transaction = handle->h_transaction; 1399 transaction_t *transaction = handle->h_transaction;
1261 journal_t *journal = transaction->t_journal; 1400 journal_t *journal;
1262 struct journal_head *jh; 1401 struct journal_head *jh;
1263 int drop_reserve = 0; 1402 int drop_reserve = 0;
1264 int err = 0; 1403 int err = 0;
1265 int was_modified = 0; 1404 int was_modified = 0;
1266 1405
1406 WARN_ON(!transaction);
1407 if (is_handle_aborted(handle))
1408 return -EROFS;
1409 journal = transaction->t_journal;
1410
1267 BUFFER_TRACE(bh, "entry"); 1411 BUFFER_TRACE(bh, "entry");
1268 1412
1269 jbd_lock_bh_state(bh); 1413 jbd_lock_bh_state(bh);
@@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1290 */ 1434 */
1291 jh->b_modified = 0; 1435 jh->b_modified = 0;
1292 1436
1293 if (jh->b_transaction == handle->h_transaction) { 1437 if (jh->b_transaction == transaction) {
1294 J_ASSERT_JH(jh, !jh->b_frozen_data); 1438 J_ASSERT_JH(jh, !jh->b_frozen_data);
1295 1439
1296 /* If we are forgetting a buffer which is already part 1440 /* If we are forgetting a buffer which is already part
@@ -1385,19 +1529,21 @@ drop:
1385int jbd2_journal_stop(handle_t *handle) 1529int jbd2_journal_stop(handle_t *handle)
1386{ 1530{
1387 transaction_t *transaction = handle->h_transaction; 1531 transaction_t *transaction = handle->h_transaction;
1388 journal_t *journal = transaction->t_journal; 1532 journal_t *journal;
1389 int err, wait_for_commit = 0; 1533 int err = 0, wait_for_commit = 0;
1390 tid_t tid; 1534 tid_t tid;
1391 pid_t pid; 1535 pid_t pid;
1392 1536
1537 if (!transaction)
1538 goto free_and_exit;
1539 journal = transaction->t_journal;
1540
1393 J_ASSERT(journal_current_handle() == handle); 1541 J_ASSERT(journal_current_handle() == handle);
1394 1542
1395 if (is_handle_aborted(handle)) 1543 if (is_handle_aborted(handle))
1396 err = -EIO; 1544 err = -EIO;
1397 else { 1545 else
1398 J_ASSERT(atomic_read(&transaction->t_updates) > 0); 1546 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
1399 err = 0;
1400 }
1401 1547
1402 if (--handle->h_ref > 0) { 1548 if (--handle->h_ref > 0) {
1403 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, 1549 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle)
1407 1553
1408 jbd_debug(4, "Handle %p going down\n", handle); 1554 jbd_debug(4, "Handle %p going down\n", handle);
1409 trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, 1555 trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
1410 handle->h_transaction->t_tid, 1556 transaction->t_tid,
1411 handle->h_type, handle->h_line_no, 1557 handle->h_type, handle->h_line_no,
1412 jiffies - handle->h_start_jiffies, 1558 jiffies - handle->h_start_jiffies,
1413 handle->h_sync, handle->h_requested_credits, 1559 handle->h_sync, handle->h_requested_credits,
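Because jbd2_journal_stop() now tolerates handle->h_transaction == NULL, a handle left detached by a failed restart can still be disposed of safely. A hedged caller sketch (the variable names are illustrative):

    err = jbd2__journal_restart(handle, nblocks, GFP_NOFS);
    if (err) {
            /* handle has no transaction now; stop just frees it via
             * the free_and_exit path added above. */
            jbd2_journal_stop(handle);
            return err;
    }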
@@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle)
1518 1664
1519 lock_map_release(&handle->h_lockdep_map); 1665 lock_map_release(&handle->h_lockdep_map);
1520 1666
1667 if (handle->h_rsv_handle)
1668 jbd2_journal_free_reserved(handle->h_rsv_handle);
1669free_and_exit:
1521 jbd2_free_handle(handle); 1670 jbd2_free_handle(handle);
1522 return err; 1671 return err;
1523} 1672}
1524 1673
1525/**
1526 * int jbd2_journal_force_commit() - force any uncommitted transactions
1527 * @journal: journal to force
1528 *
1529 * For synchronous operations: force any uncommitted transactions
1530 * to disk. May seem kludgy, but it reuses all the handle batching
1531 * code in a very simple manner.
1532 */
1533int jbd2_journal_force_commit(journal_t *journal)
1534{
1535 handle_t *handle;
1536 int ret;
1537
1538 handle = jbd2_journal_start(journal, 1);
1539 if (IS_ERR(handle)) {
1540 ret = PTR_ERR(handle);
1541 } else {
1542 handle->h_sync = 1;
1543 ret = jbd2_journal_stop(handle);
1544 }
1545 return ret;
1546}
1547
1548/* 1674/*
1549 * 1675 *
1550 * List management code snippets: various functions for manipulating the 1676 * List management code snippets: various functions for manipulating the
@@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1601 * Remove a buffer from the appropriate transaction list. 1727 * Remove a buffer from the appropriate transaction list.
1602 * 1728 *
1603 * Note that this function can *change* the value of 1729 * Note that this function can *change* the value of
1604 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, 1730 * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
1605 * t_log_list or t_reserved_list. If the caller is holding onto a copy of one 1731 * t_reserved_list. If the caller is holding onto a copy of one of these
1606 * of these pointers, it could go bad. Generally the caller needs to re-read 1732 * pointers, it could go bad. Generally the caller needs to re-read the
1607 * the pointer from the transaction_t. 1733 * pointer from the transaction_t.
1608 * 1734 *
1609 * Called under j_list_lock. 1735 * Called under j_list_lock.
1610 */ 1736 */
@@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1634 case BJ_Forget: 1760 case BJ_Forget:
1635 list = &transaction->t_forget; 1761 list = &transaction->t_forget;
1636 break; 1762 break;
1637 case BJ_IO:
1638 list = &transaction->t_iobuf_list;
1639 break;
1640 case BJ_Shadow: 1763 case BJ_Shadow:
1641 list = &transaction->t_shadow_list; 1764 list = &transaction->t_shadow_list;
1642 break; 1765 break;
1643 case BJ_LogCtl:
1644 list = &transaction->t_log_list;
1645 break;
1646 case BJ_Reserved: 1766 case BJ_Reserved:
1647 list = &transaction->t_reserved_list; 1767 list = &transaction->t_reserved_list;
1648 break; 1768 break;
@@ -2034,18 +2154,23 @@ zap_buffer_unlocked:
2034 * void jbd2_journal_invalidatepage() 2154 * void jbd2_journal_invalidatepage()
2035 * @journal: journal to use for flush... 2155 * @journal: journal to use for flush...
2036 * @page: page to flush 2156 * @page: page to flush
2037 * @offset: length of page to invalidate. 2157 * @offset: start of the range to invalidate
2158 * @length: length of the range to invalidate
2038 * 2159 *
 2039 * Reap page buffers containing data after offset in page. Can return -EBUSY 2160 * Reap page buffers containing data in the specified range of the page.
2040 * if buffers are part of the committing transaction and the page is straddling 2161 * Can return -EBUSY if buffers are part of the committing transaction and
2041 * i_size. Caller then has to wait for current commit and try again. 2162 * the page is straddling i_size. Caller then has to wait for current commit
2163 * and try again.
2042 */ 2164 */
2043int jbd2_journal_invalidatepage(journal_t *journal, 2165int jbd2_journal_invalidatepage(journal_t *journal,
2044 struct page *page, 2166 struct page *page,
2045 unsigned long offset) 2167 unsigned int offset,
2168 unsigned int length)
2046{ 2169{
2047 struct buffer_head *head, *bh, *next; 2170 struct buffer_head *head, *bh, *next;
2171 unsigned int stop = offset + length;
2048 unsigned int curr_off = 0; 2172 unsigned int curr_off = 0;
2173 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2049 int may_free = 1; 2174 int may_free = 1;
2050 int ret = 0; 2175 int ret = 0;
2051 2176
@@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
2054 if (!page_has_buffers(page)) 2179 if (!page_has_buffers(page))
2055 return 0; 2180 return 0;
2056 2181
2182 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
2183
2057 /* We will potentially be playing with lists other than just the 2184 /* We will potentially be playing with lists other than just the
2058 * data lists (especially for journaled data mode), so be 2185 * data lists (especially for journaled data mode), so be
2059 * cautious in our locking. */ 2186 * cautious in our locking. */
@@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal,
2063 unsigned int next_off = curr_off + bh->b_size; 2190 unsigned int next_off = curr_off + bh->b_size;
2064 next = bh->b_this_page; 2191 next = bh->b_this_page;
2065 2192
2193 if (next_off > stop)
2194 return 0;
2195
2066 if (offset <= curr_off) { 2196 if (offset <= curr_off) {
2067 /* This block is wholly outside the truncation point */ 2197 /* This block is wholly outside the truncation point */
2068 lock_buffer(bh); 2198 lock_buffer(bh);
2069 ret = journal_unmap_buffer(journal, bh, offset > 0); 2199 ret = journal_unmap_buffer(journal, bh, partial_page);
2070 unlock_buffer(bh); 2200 unlock_buffer(bh);
2071 if (ret < 0) 2201 if (ret < 0)
2072 return ret; 2202 return ret;
@@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
2077 2207
2078 } while (bh != head); 2208 } while (bh != head);
2079 2209
2080 if (!offset) { 2210 if (!partial_page) {
2081 if (may_free && try_to_free_buffers(page)) 2211 if (may_free && try_to_free_buffers(page))
2082 J_ASSERT(!page_has_buffers(page)); 2212 J_ASSERT(!page_has_buffers(page));
2083 } 2213 }
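A hedged sketch of what a buffer_head-backed filesystem's ->invalidatepage looks like against the new range-based prototype; "myfs" and the PageChecked handling are illustrative, not from this patch:

    static void myfs_invalidatepage(struct page *page, unsigned int offset,
                                    unsigned int length)
    {
            /* offset == 0 && length == PAGE_CACHE_SIZE means the whole
             * page is going away; anything else is partial. */
            if (offset == 0 && length == PAGE_CACHE_SIZE)
                    ClearPageChecked(page);

            block_invalidatepage(page, offset, length);
    }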
@@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
2138 case BJ_Forget: 2268 case BJ_Forget:
2139 list = &transaction->t_forget; 2269 list = &transaction->t_forget;
2140 break; 2270 break;
2141 case BJ_IO:
2142 list = &transaction->t_iobuf_list;
2143 break;
2144 case BJ_Shadow: 2271 case BJ_Shadow:
2145 list = &transaction->t_shadow_list; 2272 list = &transaction->t_shadow_list;
2146 break; 2273 break;
2147 case BJ_LogCtl:
2148 list = &transaction->t_log_list;
2149 break;
2150 case BJ_Reserved: 2274 case BJ_Reserved:
2151 list = &transaction->t_reserved_list; 2275 list = &transaction->t_reserved_list;
2152 break; 2276 break;
@@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2248int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) 2372int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2249{ 2373{
2250 transaction_t *transaction = handle->h_transaction; 2374 transaction_t *transaction = handle->h_transaction;
2251 journal_t *journal = transaction->t_journal; 2375 journal_t *journal;
2252 2376
2377 WARN_ON(!transaction);
2253 if (is_handle_aborted(handle)) 2378 if (is_handle_aborted(handle))
2254 return -EIO; 2379 return -EROFS;
2380 journal = transaction->t_journal;
2255 2381
2256 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, 2382 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2257 transaction->t_tid); 2383 transaction->t_tid);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82b..9e3aaff11f89 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
571 return ret; 571 return ret;
572} 572}
573 573
574static void metapage_invalidatepage(struct page *page, unsigned long offset) 574static void metapage_invalidatepage(struct page *page, unsigned int offset,
575 unsigned int length)
575{ 576{
576 BUG_ON(offset); 577 BUG_ON(offset || length < PAGE_CACHE_SIZE);
577 578
578 BUG_ON(PageWriteback(page)); 579 BUG_ON(PageWriteback(page));
579 580
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2219a6dd3c8..57914fc32b62 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
159 return __logfs_writepage(page); 159 return __logfs_writepage(page);
160} 160}
161 161
162static void logfs_invalidatepage(struct page *page, unsigned long offset) 162static void logfs_invalidatepage(struct page *page, unsigned int offset,
163 unsigned int length)
163{ 164{
164 struct logfs_block *block = logfs_block(page); 165 struct logfs_block *block = logfs_block(page);
165 166
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da0991794..d448a777166b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb)
884 return area; 884 return area;
885} 885}
886 886
887static void map_invalidatepage(struct page *page, unsigned long l) 887static void map_invalidatepage(struct page *page, unsigned int o,
888 unsigned int l)
888{ 889{
889 return; 890 return;
890} 891}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a87a44f84113..6b4a79f4ad1d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
451 * - Called if either PG_private or PG_fscache is set on the page 451 * - Called if either PG_private or PG_fscache is set on the page
452 * - Caller holds page lock 452 * - Caller holds page lock
453 */ 453 */
454static void nfs_invalidate_page(struct page *page, unsigned long offset) 454static void nfs_invalidate_page(struct page *page, unsigned int offset,
455 unsigned int length)
455{ 456{
456 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 457 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
458 page, offset, length);
457 459
458 if (offset != 0) 460 if (offset != 0 || length < PAGE_CACHE_SIZE)
459 return; 461 return;
460 /* Cancel any unstarted writes on this page */ 462 /* Cancel any unstarted writes on this page */
461 nfs_wb_page_cancel(page_file_mapping(page)->host, page); 463 nfs_wb_page_cancel(page_file_mapping(page)->host, page);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f97af4..d267ea6aa1a0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1372,7 +1372,7 @@ retry_writepage:
1372 * The page may have dirty, unmapped buffers. Make them 1372 * The page may have dirty, unmapped buffers. Make them
1373 * freeable here, so the page does not leak. 1373 * freeable here, so the page does not leak.
1374 */ 1374 */
1375 block_invalidatepage(page, 0); 1375 block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
1376 unlock_page(page); 1376 unlock_page(page);
1377 ntfs_debug("Write outside i_size - truncated?"); 1377 ntfs_debug("Write outside i_size - truncated?");
1378 return 0; 1378 return 0;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec72e903..79736a28d84f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
603 * from ext3. PageChecked() bits have been removed as OCFS2 does not 603 * from ext3. PageChecked() bits have been removed as OCFS2 does not
604 * do journalled data. 604 * do journalled data.
605 */ 605 */
606static void ocfs2_invalidatepage(struct page *page, unsigned long offset) 606static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
607 unsigned int length)
607{ 608{
608 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 609 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
609 610
610 jbd2_journal_invalidatepage(journal, page, offset); 611 jbd2_journal_invalidatepage(journal, page, offset, length);
611} 612}
612 613
613static int ocfs2_releasepage(struct page *page, gfp_t wait) 614static int ocfs2_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f844533792ee..0048cc16a6a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2975,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2975} 2975}
2976 2976
2977/* clm -- taken from fs/buffer.c:block_invalidate_page */ 2977/* clm -- taken from fs/buffer.c:block_invalidate_page */
2978static void reiserfs_invalidatepage(struct page *page, unsigned long offset) 2978static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
2979 unsigned int length)
2979{ 2980{
2980 struct buffer_head *head, *bh, *next; 2981 struct buffer_head *head, *bh, *next;
2981 struct inode *inode = page->mapping->host; 2982 struct inode *inode = page->mapping->host;
2982 unsigned int curr_off = 0; 2983 unsigned int curr_off = 0;
2984 unsigned int stop = offset + length;
2985 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2983 int ret = 1; 2986 int ret = 1;
2984 2987
2985 BUG_ON(!PageLocked(page)); 2988 BUG_ON(!PageLocked(page));
2986 2989
2987 if (offset == 0) 2990 if (!partial_page)
2988 ClearPageChecked(page); 2991 ClearPageChecked(page);
2989 2992
2990 if (!page_has_buffers(page)) 2993 if (!page_has_buffers(page))
@@ -2996,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
2996 unsigned int next_off = curr_off + bh->b_size; 2999 unsigned int next_off = curr_off + bh->b_size;
2997 next = bh->b_this_page; 3000 next = bh->b_this_page;
2998 3001
3002 if (next_off > stop)
3003 goto out;
3004
2999 /* 3005 /*
3000 * is this block fully invalidated? 3006 * is this block fully invalidated?
3001 */ 3007 */
@@ -3014,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
3014 * The get_block cached value has been unconditionally invalidated, 3020 * The get_block cached value has been unconditionally invalidated,
3015 * so real IO is not possible anymore. 3021 * so real IO is not possible anymore.
3016 */ 3022 */
3017 if (!offset && ret) { 3023 if (!partial_page && ret) {
3018 ret = try_to_release_page(page, 0); 3024 ret = try_to_release_page(page, 0);
3019 /* maybe should BUG_ON(!ret); - neilb */ 3025 /* maybe should BUG_ON(!ret); - neilb */
3020 } 3026 }
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 14374530784c..123c79b7261e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
1277 return err; 1277 return err;
1278} 1278}
1279 1279
1280static void ubifs_invalidatepage(struct page *page, unsigned long offset) 1280static void ubifs_invalidatepage(struct page *page, unsigned int offset,
1281 unsigned int length)
1281{ 1282{
1282 struct inode *inode = page->mapping->host; 1283 struct inode *inode = page->mapping->host;
1283 struct ubifs_info *c = inode->i_sb->s_fs_info; 1284 struct ubifs_info *c = inode->i_sb->s_fs_info;
1284 1285
1285 ubifs_assert(PagePrivate(page)); 1286 ubifs_assert(PagePrivate(page));
1286 if (offset) 1287 if (offset || length < PAGE_CACHE_SIZE)
1287 /* Partial page remains dirty */ 1288 /* Partial page remains dirty */
1288 return; 1289 return;
1289 1290
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 41a695048be7..596ec71da00e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -843,10 +843,12 @@ xfs_cluster_write(
843STATIC void 843STATIC void
844xfs_vm_invalidatepage( 844xfs_vm_invalidatepage(
845 struct page *page, 845 struct page *page,
846 unsigned long offset) 846 unsigned int offset,
847 unsigned int length)
847{ 848{
848 trace_xfs_invalidatepage(page->mapping->host, page, offset); 849 trace_xfs_invalidatepage(page->mapping->host, page, offset,
849 block_invalidatepage(page, offset); 850 length);
851 block_invalidatepage(page, offset, length);
850} 852}
851 853
852/* 854/*
@@ -910,7 +912,7 @@ next_buffer:
910 912
911 xfs_iunlock(ip, XFS_ILOCK_EXCL); 913 xfs_iunlock(ip, XFS_ILOCK_EXCL);
912out_invalidate: 914out_invalidate:
913 xfs_vm_invalidatepage(page, 0); 915 xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
914 return; 916 return;
915} 917}
916 918
@@ -940,7 +942,7 @@ xfs_vm_writepage(
940 int count = 0; 942 int count = 0;
941 int nonblocking = 0; 943 int nonblocking = 0;
942 944
943 trace_xfs_writepage(inode, page, 0); 945 trace_xfs_writepage(inode, page, 0, 0);
944 946
945 ASSERT(page_has_buffers(page)); 947 ASSERT(page_has_buffers(page));
946 948
@@ -1171,7 +1173,7 @@ xfs_vm_releasepage(
1171{ 1173{
1172 int delalloc, unwritten; 1174 int delalloc, unwritten;
1173 1175
1174 trace_xfs_releasepage(page->mapping->host, page, 0); 1176 trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1175 1177
1176 xfs_count_page_state(page, &delalloc, &unwritten); 1178 xfs_count_page_state(page, &delalloc, &unwritten);
1177 1179
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aa4db3307d36..a04701de6bbd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -974,14 +974,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read);
 DEFINE_RW_EVENT(xfs_file_splice_write);
 
 DECLARE_EVENT_CLASS(xfs_page_class,
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
-	TP_ARGS(inode, page, off),
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+		 unsigned int len),
+	TP_ARGS(inode, page, off, len),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(pgoff_t, pgoff)
 		__field(loff_t, size)
 		__field(unsigned long, offset)
+		__field(unsigned int, length)
 		__field(int, delalloc)
 		__field(int, unwritten)
 	),
@@ -995,24 +997,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
 		__entry->pgoff = page_offset(page);
 		__entry->size = i_size_read(inode);
 		__entry->offset = off;
+		__entry->length = len;
 		__entry->delalloc = delalloc;
 		__entry->unwritten = unwritten;
 	),
 	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-		  "delalloc %d unwritten %d",
+		  "length %x delalloc %d unwritten %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->pgoff,
 		  __entry->size,
 		  __entry->offset,
+		  __entry->length,
 		  __entry->delalloc,
 		  __entry->unwritten)
 )
 
 #define DEFINE_PAGE_EVENT(name) \
 DEFINE_EVENT(xfs_page_class, name, \
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
-	TP_ARGS(inode, page, off))
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+		 unsigned int len), \
+	TP_ARGS(inode, page, off, len))
 DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 9e52b0626b39..f5a3b838ddb0 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -198,7 +198,8 @@ extern int buffer_heads_over_limit;
  * Generic address_space_operations implementations for buffer_head-backed
  * address_spaces.
  */
-void block_invalidatepage(struct page *page, unsigned long offset);
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length);
 int block_write_full_page(struct page *page, get_block_t *get_block,
 			struct writeback_control *wbc);
 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7c30e3a62baf..f8a5240541b7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -364,7 +364,7 @@ struct address_space_operations {
 
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
-	void (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, gfp_t);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
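
The widened ->invalidatepage signature above hands the filesystem both the
byte offset and the length of the range being invalidated, so a partial-page
punch can be told apart from a full-page truncate.  A minimal sketch of a
conforming implementation (hypothetical "examplefs", modelled on the ubifs
and xfs hunks in this series -- not code from this patch set):

static void examplefs_invalidatepage(struct page *page, unsigned int offset,
				     unsigned int length)
{
	/* Partial invalidation: per-page private state must survive. */
	if (offset || length < PAGE_CACHE_SIZE)
		return;

	/* The whole page is gone: let the generic helper drop the buffers. */
	block_invalidatepage(page, offset, length);
}
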
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 7e0b622503c4..8685d1be12c7 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -27,7 +27,6 @@
 #include <linux/buffer_head.h>
 #include <linux/journal-head.h>
 #include <linux/stddef.h>
-#include <linux/bit_spinlock.h>
 #include <linux/mutex.h>
 #include <linux/timer.h>
 #include <linux/lockdep.h>
@@ -244,6 +243,31 @@ typedef struct journal_superblock_s
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+	BH_JBD			/* Has an attached ext3 journal_head */
+	  = BH_PrivateStart,
+	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
+	BH_Freed,		/* Has been freed (truncated) */
+	BH_Revoked,		/* Has been revoked from the log */
+	BH_RevokeValid,		/* Revoked flag is valid */
+	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
+	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
+	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
+	BH_JBDPrivateStart,	/* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+
 #include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)	BUG_ON(!(assert))
@@ -840,7 +864,7 @@ extern void journal_release_buffer (handle_t *, struct buffer_head *);
 extern int	 journal_forget (handle_t *, struct buffer_head *);
 extern void	 journal_sync_buffer (struct buffer_head *);
 extern void	 journal_invalidatepage(journal_t *,
-				struct page *, unsigned long);
+				struct page *, unsigned int, unsigned int);
 extern int	 journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
 extern int	 journal_stop(handle_t *);
 extern int	 journal_flush (journal_t *);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 6e051f472edb..d5b50a19463c 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -26,7 +26,6 @@
 #include <linux/buffer_head.h>
 #include <linux/journal-head.h>
 #include <linux/stddef.h>
-#include <linux/bit_spinlock.h>
 #include <linux/mutex.h>
 #include <linux/timer.h>
 #include <linux/slab.h>
@@ -57,17 +56,13 @@
  */
 #define JBD2_EXPENSIVE_CHECKING
 extern ushort jbd2_journal_enable_debug;
+void __jbd2_debug(int level, const char *file, const char *func,
+		  unsigned int line, const char *fmt, ...);
 
-#define jbd_debug(n, f, a...)					\
-	do {							\
-		if ((n) <= jbd2_journal_enable_debug) {		\
-			printk (KERN_DEBUG "(%s, %d): %s: ",	\
-				__FILE__, __LINE__, __func__);	\
-			printk (f, ## a);			\
-		}						\
-	} while (0)
+#define jbd_debug(n, fmt, a...) \
+	__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
 #else
-#define jbd_debug(f, a...)	/**/
+#define jbd_debug(n, fmt, a...)	/**/
 #endif
 
 extern void *jbd2_alloc(size_t size, gfp_t flags);
@@ -302,6 +297,34 @@ typedef struct journal_superblock_s
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+	BH_JBD			/* Has an attached ext3 journal_head */
+	  = BH_PrivateStart,
+	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
+	BH_Freed,		/* Has been freed (truncated) */
+	BH_Revoked,		/* Has been revoked from the log */
+	BH_RevokeValid,		/* Revoked flag is valid */
+	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
+	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
+	BH_Shadow,		/* IO on shadow buffer is running */
+	BH_Verified,		/* Metadata block has been verified ok */
+	BH_JBDPrivateStart,	/* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+BUFFER_FNS(Shadow, shadow)
+BUFFER_FNS(Verified, verified)
+
 #include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)	BUG_ON(!(assert))
@@ -382,8 +405,15 @@ struct jbd2_revoke_table_s;
 
 struct jbd2_journal_handle
 {
-	/* Which compound transaction is this update a part of? */
-	transaction_t		*h_transaction;
+	union {
+		/* Which compound transaction is this update a part of? */
+		transaction_t	*h_transaction;
+		/* Which journal handle belongs to - used iff h_reserved set */
+		journal_t	*h_journal;
+	};
+
+	/* Handle reserved for finishing the logical operation */
+	handle_t		*h_rsv_handle;
 
 	/* Number of remaining buffers we are allowed to dirty: */
 	int			h_buffer_credits;
@@ -398,6 +428,7 @@ struct jbd2_journal_handle
 
 	/* Flags [no locking] */
 	unsigned int	h_sync:		1;	/* sync-on-close */
 	unsigned int	h_jdata:	1;	/* force data journaling */
+	unsigned int	h_reserved:	1;	/* handle with reserved credits */
 	unsigned int	h_aborted:	1;	/* fatal error on handle */
 	unsigned int	h_type:		8;	/* for handle statistics */
 	unsigned int	h_line_no:	16;	/* for handle statistics */
@@ -524,12 +555,6 @@ struct transaction_s
 	struct journal_head	*t_checkpoint_io_list;
 
 	/*
-	 * Doubly-linked circular list of temporary buffers currently undergoing
-	 * IO in the log [j_list_lock]
-	 */
-	struct journal_head	*t_iobuf_list;
-
-	/*
 	 * Doubly-linked circular list of metadata buffers being shadowed by log
 	 * IO. The IO buffers on the iobuf list and the shadow buffers on this
 	 * list match each other one for one at all times. [j_list_lock]
@@ -537,12 +562,6 @@ struct transaction_s
 	struct journal_head	*t_shadow_list;
 
 	/*
-	 * Doubly-linked circular list of control buffers being written to the
-	 * log. [j_list_lock]
-	 */
-	struct journal_head	*t_log_list;
-
-	/*
 	 * List of inodes whose data we've modified in data=ordered mode.
 	 * [j_list_lock]
 	 */
@@ -671,11 +690,10 @@ jbd2_time_diff(unsigned long start, unsigned long end)
  *     waiting for checkpointing
  * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction
  *     to start committing, or for a barrier lock to be released
- * @j_wait_logspace: Wait queue for waiting for checkpointing to complete
  * @j_wait_done_commit: Wait queue for waiting for commit to complete
- * @j_wait_checkpoint: Wait queue to trigger checkpointing
  * @j_wait_commit: Wait queue to trigger commit
  * @j_wait_updates: Wait queue to wait for updates to complete
+ * @j_wait_reserved: Wait queue to wait for reserved buffer credits to drop
  * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
  * @j_head: Journal head - identifies the first unused block in the journal
  * @j_tail: Journal tail - identifies the oldest still-used block in the
@@ -689,6 +707,7 @@ jbd2_time_diff(unsigned long start, unsigned long end)
  *     journal
  * @j_fs_dev: Device which holds the client fs. For internal journal this will
  *     be equal to j_dev
+ * @j_reserved_credits: Number of buffers reserved from the running transaction
  * @j_maxlen: Total maximum capacity of the journal region on disk.
  * @j_list_lock: Protects the buffer lists and internal buffer state.
  * @j_inode: Optional inode where we store the journal. If present, all journal
@@ -778,21 +797,18 @@ struct journal_s
 	 */
 	wait_queue_head_t	j_wait_transaction_locked;
 
-	/* Wait queue for waiting for checkpointing to complete */
-	wait_queue_head_t	j_wait_logspace;
-
 	/* Wait queue for waiting for commit to complete */
 	wait_queue_head_t	j_wait_done_commit;
 
-	/* Wait queue to trigger checkpointing */
-	wait_queue_head_t	j_wait_checkpoint;
-
 	/* Wait queue to trigger commit */
 	wait_queue_head_t	j_wait_commit;
 
 	/* Wait queue to wait for updates to complete */
 	wait_queue_head_t	j_wait_updates;
 
+	/* Wait queue to wait for reserved buffer credits to drop */
+	wait_queue_head_t	j_wait_reserved;
+
 	/* Semaphore for locking against concurrent checkpoints */
 	struct mutex		j_checkpoint_mutex;
 
@@ -847,6 +863,9 @@ struct journal_s
 	/* Total maximum capacity of the journal region on disk. */
 	unsigned int		j_maxlen;
 
+	/* Number of buffers reserved from the running transaction */
+	atomic_t		j_reserved_credits;
+
 	/*
 	 * Protects the buffer lists and internal buffer state.
 	 */
@@ -991,9 +1010,17 @@ extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, i
 extern void __journal_free_buffer(struct journal_head *bh);
 extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
 extern void __journal_clean_data_list(transaction_t *transaction);
+static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh)
+{
+	list_add_tail(&bh->b_assoc_buffers, head);
+}
+static inline void jbd2_unfile_log_bh(struct buffer_head *bh)
+{
+	list_del_init(&bh->b_assoc_buffers);
+}
 
 /* Log buffer allocation */
-extern struct journal_head * jbd2_journal_get_descriptor_buffer(journal_t *);
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal);
 int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
 int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
 			      unsigned long *block);
@@ -1039,11 +1066,10 @@ extern void jbd2_buffer_abort_trigger(struct journal_head *jh,
 				      struct jbd2_buffer_trigger_type *triggers);
 
 /* Buffer IO */
-extern int
-jbd2_journal_write_metadata_buffer(transaction_t *transaction,
-				  struct journal_head *jh_in,
-				  struct journal_head **jh_out,
-				  unsigned long long blocknr);
+extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
+					      struct journal_head *jh_in,
+					      struct buffer_head **bh_out,
+					      sector_t blocknr);
 
 /* Transaction locking */
 extern void		__wait_on_journal (journal_t *);
@@ -1076,10 +1102,14 @@ static inline handle_t *journal_current_handle(void)
  */
 
 extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
-extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask,
-				     unsigned int type, unsigned int line_no);
+extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks,
+				     gfp_t gfp_mask, unsigned int type,
+				     unsigned int line_no);
 extern int	 jbd2_journal_restart(handle_t *, int nblocks);
 extern int	 jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask);
+extern int	 jbd2_journal_start_reserved(handle_t *handle,
+				unsigned int type, unsigned int line_no);
+extern void	 jbd2_journal_free_reserved(handle_t *handle);
 extern int	 jbd2_journal_extend (handle_t *, int nblocks);
 extern int	 jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
@@ -1090,7 +1120,7 @@ extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_forget (handle_t *, struct buffer_head *);
 extern void	 journal_sync_buffer (struct buffer_head *);
 extern int	 jbd2_journal_invalidatepage(journal_t *,
-				struct page *, unsigned long);
+				struct page *, unsigned int, unsigned int);
 extern int	 jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
 extern int	 jbd2_journal_stop(handle_t *);
 extern int	 jbd2_journal_flush (journal_t *);
@@ -1125,6 +1155,7 @@ extern void jbd2_journal_ack_err (journal_t *);
 extern int	   jbd2_journal_clear_err  (journal_t *);
 extern int	   jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int	   jbd2_journal_force_commit(journal_t *);
+extern int	   jbd2_journal_force_commit_nested(journal_t *);
 extern int	   jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
 extern int	   jbd2_journal_begin_ordered_truncate(journal_t *journal,
 				struct jbd2_inode *inode, loff_t new_size);
@@ -1178,8 +1209,10 @@ extern int jbd2_journal_init_revoke_caches(void);
 extern void	   jbd2_journal_destroy_revoke(journal_t *);
 extern int	   jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
 extern int	   jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
-extern void	   jbd2_journal_write_revoke_records(journal_t *,
-						     transaction_t *, int);
+extern void	   jbd2_journal_write_revoke_records(journal_t *journal,
+						     transaction_t *transaction,
+						     struct list_head *log_bufs,
+						     int write_op);
 
 /* Recovery revoke support */
 extern int	jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t);
@@ -1195,11 +1228,9 @@ extern void jbd2_clear_buffer_revoked_flags(journal_t *journal);
  * transitions on demand.
  */
 
-int __jbd2_log_space_left(journal_t *); /* Called with journal locked */
 int jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
-int jbd2_journal_force_commit_nested(journal_t *journal);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_complete_transaction(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
@@ -1235,7 +1266,7 @@ static inline int is_journal_aborted(journal_t *journal)
 
 static inline int is_handle_aborted(handle_t *handle)
 {
-	if (handle->h_aborted)
+	if (handle->h_aborted || !handle->h_transaction)
 		return 1;
 	return is_journal_aborted(handle->h_transaction->t_journal);
 }
@@ -1266,16 +1297,37 @@ extern int jbd2_journal_blocks_per_page(struct inode *inode);
 extern size_t journal_tag_bytes(journal_t *journal);
 
 /*
+ * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for
+ * transaction control blocks.
+ */
+#define JBD2_CONTROL_BLOCKS_SHIFT 5
+
+/*
  * Return the minimum number of blocks which must be free in the journal
  * before a new transaction may be started.  Must be called under j_state_lock.
  */
-static inline int jbd_space_needed(journal_t *journal)
+static inline int jbd2_space_needed(journal_t *journal)
 {
 	int nblocks = journal->j_max_transaction_buffers;
-	if (journal->j_committing_transaction)
-		nblocks += atomic_read(&journal->j_committing_transaction->
-				       t_outstanding_credits);
-	return nblocks;
+	return nblocks + (nblocks >> JBD2_CONTROL_BLOCKS_SHIFT);
+}
+
+/*
+ * Return number of free blocks in the log. Must be called under j_state_lock.
+ */
+static inline unsigned long jbd2_log_space_left(journal_t *journal)
+{
+	/* Allow for rounding errors */
+	unsigned long free = journal->j_free - 32;
+
+	if (journal->j_committing_transaction) {
+		unsigned long committing = atomic_read(&journal->
+			j_committing_transaction->t_outstanding_credits);
+
+		/* Transaction + control blocks */
+		free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT);
+	}
+	return free;
 }
 
 /*
@@ -1286,11 +1338,9 @@ static inline int jbd_space_needed(journal_t *journal)
 #define BJ_None		0	/* Not journaled */
 #define BJ_Metadata	1	/* Normal journaled metadata */
 #define BJ_Forget	2	/* Buffer superseded by this transaction */
-#define BJ_IO		3	/* Buffer is for temporary IO use */
-#define BJ_Shadow	4	/* Buffer contents being shadowed to the log */
-#define BJ_LogCtl	5	/* Buffer contains log descriptors */
-#define BJ_Reserved	6	/* Buffer is reserved for access by journal */
-#define BJ_Types	7
+#define BJ_Shadow	3	/* Buffer contents being shadowed to the log */
+#define BJ_Reserved	4	/* Buffer is reserved for access by journal */
+#define BJ_Types	5
 
 extern int jbd_blocks_per_page(struct inode *inode);
 
@@ -1319,6 +1369,19 @@ static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
 	return *(u32 *)desc.ctx;
 }
 
+/* Return most recent uncommitted transaction */
+static inline tid_t  jbd2_get_latest_transaction(journal_t *journal)
+{
+	tid_t tid;
+
+	read_lock(&journal->j_state_lock);
+	tid = journal->j_commit_request;
+	if (journal->j_running_transaction)
+		tid = journal->j_running_transaction->t_tid;
+	read_unlock(&journal->j_state_lock);
+	return tid;
+}
+
 #ifdef __KERNEL__
 
 #define buffer_trace_init(bh)	do {} while (0)
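
Taken together, the handle changes above add a reservation mechanism: a
handle can be started with extra credits parked in h_rsv_handle and bound to
a transaction later, or dropped via jbd2_journal_free_reserved() if unused.
A sketch of one plausible calling sequence (illustrative only; everything
except the jbd2 functions is made up, the credit counts are arbitrary, and
error handling of the reserved handle is elided):

static int example_two_phase_op(journal_t *journal)
{
	handle_t *handle, *rsv;
	int err;

	/* Phase 1: 8 credits now, 4 more reserved for later. */
	handle = jbd2__journal_start(journal, 8, 4, GFP_NOFS, 0, 0);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	rsv = handle->h_rsv_handle;	/* carries the reserved credits */

	/* ... dirty up to 8 buffers under "handle" ... */
	err = jbd2_journal_stop(handle);
	if (err || !rsv)
		return err;

	/* Phase 2: bind the reserved credits to a running transaction. */
	err = jbd2_journal_start_reserved(rsv, 0, 0);
	if (err)
		return err;
	/* ... dirty up to 4 more buffers under "rsv" ... */
	return jbd2_journal_stop(rsv);
}
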
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
index 6133679bc4c0..3dc53432355f 100644
--- a/include/linux/jbd_common.h
+++ b/include/linux/jbd_common.h
@@ -1,31 +1,7 @@
 #ifndef _LINUX_JBD_STATE_H
 #define _LINUX_JBD_STATE_H
 
-enum jbd_state_bits {
-	BH_JBD			/* Has an attached ext3 journal_head */
-	  = BH_PrivateStart,
-	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
-	BH_Freed,		/* Has been freed (truncated) */
-	BH_Revoked,		/* Has been revoked from the log */
-	BH_RevokeValid,		/* Revoked flag is valid */
-	BH_JBDDirty,		/* Is dirty but journaled */
-	BH_State,		/* Pins most journal_head state */
-	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
-	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
-	BH_Verified,		/* Metadata block has been verified ok */
-	BH_JBDPrivateStart,	/* First bit available for private use by FS */
-};
-
-BUFFER_FNS(JBD, jbd)
-BUFFER_FNS(JWrite, jwrite)
-BUFFER_FNS(JBDDirty, jbddirty)
-TAS_BUFFER_FNS(JBDDirty, jbddirty)
-BUFFER_FNS(Revoked, revoked)
-TAS_BUFFER_FNS(Revoked, revoked)
-BUFFER_FNS(RevokeValid, revokevalid)
-TAS_BUFFER_FNS(RevokeValid, revokevalid)
-BUFFER_FNS(Freed, freed)
-BUFFER_FNS(Verified, verified)
+#include <linux/bit_spinlock.h>
 
 static inline struct buffer_head *jh2bh(struct journal_head *jh)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e0c8528a41a4..66d881f1d576 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1041,7 +1041,8 @@ int get_kernel_page(unsigned long start, int write, struct page **pages);
 struct page *get_dump_page(unsigned long addr);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
-extern void do_invalidatepage(struct page *page, unsigned long offset);
+extern void do_invalidatepage(struct page *page, unsigned int offset,
+			      unsigned int length);
 
 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
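
With offset and length both available to do_invalidatepage(), a caller that
punches a hole can invalidate just the affected bytes of each page.  A sketch
of the arithmetic (illustrative only; example_punch_in_page is not a function
from this patch series):

static void example_punch_in_page(struct page *page, loff_t start, loff_t len)
{
	unsigned int offset = start & (PAGE_CACHE_SIZE - 1);
	unsigned int length = min_t(loff_t, len, PAGE_CACHE_SIZE - offset);

	do_invalidatepage(page, offset, length);
}
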
diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h
index 15d11a39be47..6797b9de90ed 100644
--- a/include/trace/events/ext3.h
+++ b/include/trace/events/ext3.h
@@ -290,13 +290,14 @@ DEFINE_EVENT(ext3__page_op, ext3_releasepage,
 );
 
 TRACE_EVENT(ext3_invalidatepage,
-	TP_PROTO(struct page *page, unsigned long offset),
+	TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
 
-	TP_ARGS(page, offset),
+	TP_ARGS(page, offset, length),
 
 	TP_STRUCT__entry(
 		__field( pgoff_t, index			)
-		__field( unsigned long, offset		)
+		__field( unsigned int, offset		)
+		__field( unsigned int, length		)
 		__field( ino_t, ino			)
 		__field( dev_t, dev			)
 
@@ -305,14 +306,15 @@ TRACE_EVENT(ext3_invalidatepage,
 	TP_fast_assign(
 		__entry->index	= page->index;
 		__entry->offset	= offset;
+		__entry->length	= length;
 		__entry->ino	= page->mapping->host->i_ino;
 		__entry->dev	= page->mapping->host->i_sb->s_dev;
 	),
 
-	TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
+	TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
-		  __entry->index, __entry->offset)
+		  __entry->index, __entry->offset, __entry->length)
 );
 
 TRACE_EVENT(ext3_discard_blocks,
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 8ee15b97cd38..2068db241f22 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -19,6 +19,57 @@ struct extent_status;
 
 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
 
+#define show_mballoc_flags(flags) __print_flags(flags, "|",	\
+	{ EXT4_MB_HINT_MERGE,		"HINT_MERGE" },		\
+	{ EXT4_MB_HINT_RESERVED,	"HINT_RESV" },		\
+	{ EXT4_MB_HINT_METADATA,	"HINT_MDATA" },		\
+	{ EXT4_MB_HINT_FIRST,		"HINT_FIRST" },		\
+	{ EXT4_MB_HINT_BEST,		"HINT_BEST" },		\
+	{ EXT4_MB_HINT_DATA,		"HINT_DATA" },		\
+	{ EXT4_MB_HINT_NOPREALLOC,	"HINT_NOPREALLOC" },	\
+	{ EXT4_MB_HINT_GROUP_ALLOC,	"HINT_GRP_ALLOC" },	\
+	{ EXT4_MB_HINT_GOAL_ONLY,	"HINT_GOAL_ONLY" },	\
+	{ EXT4_MB_HINT_TRY_GOAL,	"HINT_TRY_GOAL" },	\
+	{ EXT4_MB_DELALLOC_RESERVED,	"DELALLOC_RESV" },	\
+	{ EXT4_MB_STREAM_ALLOC,		"STREAM_ALLOC" },	\
+	{ EXT4_MB_USE_ROOT_BLOCKS,	"USE_ROOT_BLKS" },	\
+	{ EXT4_MB_USE_RESERVED,		"USE_RESV" })
+
+#define show_map_flags(flags) __print_flags(flags, "|",			\
+	{ EXT4_GET_BLOCKS_CREATE,		"CREATE" },		\
+	{ EXT4_GET_BLOCKS_UNINIT_EXT,		"UNINIT" },		\
+	{ EXT4_GET_BLOCKS_DELALLOC_RESERVE,	"DELALLOC" },		\
+	{ EXT4_GET_BLOCKS_PRE_IO,		"PRE_IO" },		\
+	{ EXT4_GET_BLOCKS_CONVERT,		"CONVERT" },		\
+	{ EXT4_GET_BLOCKS_METADATA_NOFAIL,	"METADATA_NOFAIL" },	\
+	{ EXT4_GET_BLOCKS_NO_NORMALIZE,		"NO_NORMALIZE" },	\
+	{ EXT4_GET_BLOCKS_KEEP_SIZE,		"KEEP_SIZE" },		\
+	{ EXT4_GET_BLOCKS_NO_LOCK,		"NO_LOCK" },		\
+	{ EXT4_GET_BLOCKS_NO_PUT_HOLE,		"NO_PUT_HOLE" })
+
+#define show_mflags(flags) __print_flags(flags, "",	\
+	{ EXT4_MAP_NEW,		"N" },			\
+	{ EXT4_MAP_MAPPED,	"M" },			\
+	{ EXT4_MAP_UNWRITTEN,	"U" },			\
+	{ EXT4_MAP_BOUNDARY,	"B" },			\
+	{ EXT4_MAP_UNINIT,	"u" },			\
+	{ EXT4_MAP_FROM_CLUSTER, "C" })
+
+#define show_free_flags(flags) __print_flags(flags, "|",	\
+	{ EXT4_FREE_BLOCKS_METADATA,		"METADATA" },	\
+	{ EXT4_FREE_BLOCKS_FORGET,		"FORGET" },	\
+	{ EXT4_FREE_BLOCKS_VALIDATED,		"VALIDATED" },	\
+	{ EXT4_FREE_BLOCKS_NO_QUOT_UPDATE,	"NO_QUOTA" },	\
+	{ EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\
+	{ EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER,	"LAST_CLUSTER" })
+
+#define show_extent_status(status) __print_flags(status, "",	\
+	{ (1 << 3),	"W" },					\
+	{ (1 << 2),	"U" },					\
+	{ (1 << 1),	"D" },					\
+	{ (1 << 0),	"H" })
+
+
 TRACE_EVENT(ext4_free_inode,
 	TP_PROTO(struct inode *inode),
 
@@ -281,7 +332,7 @@ DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
 	TP_ARGS(inode, pos, len, copied)
 );
 
-TRACE_EVENT(ext4_da_writepages,
+TRACE_EVENT(ext4_writepages,
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
 
 	TP_ARGS(inode, wbc),
@@ -324,46 +375,62 @@ TRACE_EVENT(ext4_da_writepages,
 );
 
 TRACE_EVENT(ext4_da_write_pages,
-	TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
+	TP_PROTO(struct inode *inode, pgoff_t first_page,
+		 struct writeback_control *wbc),
 
-	TP_ARGS(inode, mpd),
+	TP_ARGS(inode, first_page, wbc),
 
 	TP_STRUCT__entry(
 		__field( dev_t, dev )
 		__field( ino_t, ino )
-		__field( __u64, b_blocknr )
-		__field( __u32, b_size )
-		__field( __u32, b_state )
-		__field( unsigned long, first_page )
-		__field( int, io_done )
-		__field( int, pages_written )
-		__field( int, sync_mode )
+		__field( pgoff_t, first_page )
+		__field( long, nr_to_write )
+		__field( int, sync_mode )
 	),
 
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->b_blocknr = mpd->b_blocknr;
-		__entry->b_size = mpd->b_size;
-		__entry->b_state = mpd->b_state;
-		__entry->first_page = mpd->first_page;
-		__entry->io_done = mpd->io_done;
-		__entry->pages_written = mpd->pages_written;
-		__entry->sync_mode = mpd->wbc->sync_mode;
+		__entry->first_page = first_page;
+		__entry->nr_to_write = wbc->nr_to_write;
+		__entry->sync_mode = wbc->sync_mode;
 	),
 
-	TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
-		  "first_page %lu io_done %d pages_written %d sync_mode %d",
+	TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
+		  "sync_mode %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->b_blocknr, __entry->b_size,
-		  __entry->b_state, __entry->first_page,
-		  __entry->io_done, __entry->pages_written,
-		  __entry->sync_mode
-		  )
+		  (unsigned long) __entry->ino, __entry->first_page,
+		  __entry->nr_to_write, __entry->sync_mode)
 );
 
-TRACE_EVENT(ext4_da_writepages_result,
+TRACE_EVENT(ext4_da_write_pages_extent,
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
+
+	TP_ARGS(inode, map),
+
+	TP_STRUCT__entry(
+		__field( dev_t, dev )
+		__field( ino_t, ino )
+		__field( __u64, lblk )
+		__field( __u32, len )
+		__field( __u32, flags )
+	),
+
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->lblk = map->m_lblk;
+		__entry->len = map->m_len;
+		__entry->flags = map->m_flags;
+	),
+
+	TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino, __entry->lblk, __entry->len,
+		  show_mflags(__entry->flags))
+);
+
+TRACE_EVENT(ext4_writepages_result,
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc,
 		 int ret, int pages_written),
 
@@ -444,16 +511,16 @@ DEFINE_EVENT(ext4__page_op, ext4_releasepage,
 );
 
 DECLARE_EVENT_CLASS(ext4_invalidatepage_op,
-	TP_PROTO(struct page *page, unsigned long offset),
+	TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
 
-	TP_ARGS(page, offset),
+	TP_ARGS(page, offset, length),
 
 	TP_STRUCT__entry(
 		__field( dev_t, dev )
 		__field( ino_t, ino )
 		__field( pgoff_t, index )
-		__field( unsigned long, offset )
-
+		__field( unsigned int, offset )
+		__field( unsigned int, length )
 	),
 
@@ -461,24 +528,26 @@ DECLARE_EVENT_CLASS(ext4_invalidatepage_op,
 		__entry->ino	= page->mapping->host->i_ino;
 		__entry->index	= page->index;
 		__entry->offset	= offset;
+		__entry->length	= length;
 	),
 
-	TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
+	TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
-		  (unsigned long) __entry->index, __entry->offset)
+		  (unsigned long) __entry->index,
+		  __entry->offset, __entry->length)
 );
 
 DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage,
-	TP_PROTO(struct page *page, unsigned long offset),
+	TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
 
-	TP_ARGS(page, offset)
+	TP_ARGS(page, offset, length)
 );
 
 DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage,
-	TP_PROTO(struct page *page, unsigned long offset),
+	TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
 
-	TP_ARGS(page, offset)
+	TP_ARGS(page, offset, length)
 );
 
 TRACE_EVENT(ext4_discard_blocks,
@@ -673,10 +742,10 @@ TRACE_EVENT(ext4_request_blocks,
 		__entry->flags	= ar->flags;
 	),
 
-	TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu "
+	TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu "
 		  "lleft %u lright %u pleft %llu pright %llu ",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, __entry->flags,
+		  (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags),
 		  __entry->len, __entry->logical, __entry->goal,
 		  __entry->lleft, __entry->lright, __entry->pleft,
 		  __entry->pright)
@@ -715,10 +784,10 @@ TRACE_EVENT(ext4_allocate_blocks,
 		__entry->flags	= ar->flags;
 	),
 
-	TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u "
+	TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u "
 		  "goal %llu lleft %u lright %u pleft %llu pright %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, __entry->flags,
+		  (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags),
 		  __entry->len, __entry->block, __entry->logical,
 		  __entry->goal, __entry->lleft, __entry->lright,
 		  __entry->pleft, __entry->pright)
@@ -748,11 +817,11 @@ TRACE_EVENT(ext4_free_blocks,
 		__entry->mode		= inode->i_mode;
 	),
 
-	TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d",
+	TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->mode, __entry->block, __entry->count,
-		  __entry->flags)
+		  show_free_flags(__entry->flags))
 );
 
 TRACE_EVENT(ext4_sync_file_enter,
@@ -903,7 +972,7 @@ TRACE_EVENT(ext4_mballoc_alloc,
 	),
 
 	TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
-		  "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x "
+		  "result %u/%d/%u@%u blks %u grps %u cr %u flags %s "
 		  "tail %u broken %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
@@ -914,7 +983,7 @@ TRACE_EVENT(ext4_mballoc_alloc,
 		  __entry->result_group, __entry->result_start,
 		  __entry->result_len, __entry->result_logical,
 		  __entry->found, __entry->groups, __entry->cr,
-		  __entry->flags, __entry->tail,
+		  show_mballoc_flags(__entry->flags), __entry->tail,
 		  __entry->buddy ? 1 << __entry->buddy : 0)
 );
 
@@ -1528,10 +1597,10 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
 		__entry->flags	= flags;
 	),
 
-	TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u",
+	TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
-		  __entry->lblk, __entry->len, __entry->flags)
+		  __entry->lblk, __entry->len, show_map_flags(__entry->flags))
 );
 
 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
@@ -1549,47 +1618,53 @@ DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter,
 );
 
 DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
-	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret),
+	TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map,
+		 int ret),
 
-	TP_ARGS(inode, map, ret),
+	TP_ARGS(inode, flags, map, ret),
 
 	TP_STRUCT__entry(
 		__field( dev_t, dev )
 		__field( ino_t, ino )
+		__field( unsigned int, flags )
 		__field( ext4_fsblk_t, pblk )
 		__field( ext4_lblk_t, lblk )
 		__field( unsigned int, len )
-		__field( unsigned int, flags )
+		__field( unsigned int, mflags )
 		__field( int, ret )
 	),
 
 	TP_fast_assign(
 		__entry->dev    = inode->i_sb->s_dev;
 		__entry->ino    = inode->i_ino;
+		__entry->flags	= flags;
 		__entry->pblk	= map->m_pblk;
 		__entry->lblk	= map->m_lblk;
 		__entry->len	= map->m_len;
-		__entry->flags	= map->m_flags;
+		__entry->mflags	= map->m_flags;
 		__entry->ret	= ret;
 	),
 
-	TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d",
+	TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u "
+		  "mflags %s ret %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
-		  __entry->lblk, __entry->pblk,
-		  __entry->len, __entry->flags, __entry->ret)
+		  show_map_flags(__entry->flags), __entry->lblk, __entry->pblk,
+		  __entry->len, show_mflags(__entry->mflags), __entry->ret)
 );
 
 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
-	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret),
+	TP_PROTO(struct inode *inode, unsigned flags,
+		 struct ext4_map_blocks *map, int ret),
 
-	TP_ARGS(inode, map, ret)
+	TP_ARGS(inode, flags, map, ret)
 );
 
 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
-	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret),
+	TP_PROTO(struct inode *inode, unsigned flags,
+		 struct ext4_map_blocks *map, int ret),
 
-	TP_ARGS(inode, map, ret)
+	TP_ARGS(inode, flags, map, ret)
 );
 
 TRACE_EVENT(ext4_ext_load_extent,
@@ -1638,25 +1713,50 @@ TRACE_EVENT(ext4_load_inode,
 );
 
 TRACE_EVENT(ext4_journal_start,
-	TP_PROTO(struct super_block *sb, int nblocks, unsigned long IP),
+	TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks,
+		 unsigned long IP),
 
-	TP_ARGS(sb, nblocks, IP),
+	TP_ARGS(sb, blocks, rsv_blocks, IP),
 
 	TP_STRUCT__entry(
 		__field( dev_t, dev )
 		__field(unsigned long, ip )
-		__field( int, nblocks )
+		__field( int, blocks )
+		__field( int, rsv_blocks )
 	),
 
 	TP_fast_assign(
 		__entry->dev	 = sb->s_dev;
 		__entry->ip	 = IP;
-		__entry->nblocks = nblocks;
+		__entry->blocks	 = blocks;
+		__entry->rsv_blocks = rsv_blocks;
 	),
 
-	TP_printk("dev %d,%d nblocks %d caller %pF",
+	TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pF",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->nblocks, (void *)__entry->ip)
+		  __entry->blocks, __entry->rsv_blocks, (void *)__entry->ip)
+);
+
+TRACE_EVENT(ext4_journal_start_reserved,
+	TP_PROTO(struct super_block *sb, int blocks, unsigned long IP),
+
+	TP_ARGS(sb, blocks, IP),
+
+	TP_STRUCT__entry(
+		__field( dev_t, dev )
+		__field(unsigned long, ip )
+		__field( int, blocks )
+	),
+
+	TP_fast_assign(
+		__entry->dev	 = sb->s_dev;
+		__entry->ip	 = IP;
+		__entry->blocks	 = blocks;
+	),
+
+	TP_printk("dev %d,%d blocks, %d caller %pF",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->blocks, (void *)__entry->ip)
 );
 
 DECLARE_EVENT_CLASS(ext4__trim,
@@ -1736,12 +1836,12 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents,
 		__entry->newblk	= newblock;
 	),
 
-	TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x "
+	TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s "
 		  "allocated %d newblock %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
-		  __entry->len, __entry->flags,
+		  __entry->len, show_map_flags(__entry->flags),
 		  (unsigned int) __entry->allocated,
 		  (unsigned long long) __entry->newblk)
 );
@@ -1769,10 +1869,10 @@ TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
 		__entry->ret	= ret;
 	),
 
-	TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %u ret %d",
+	TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->lblk, (unsigned long long) __entry->pblk,
-		  __entry->len, __entry->flags, __entry->ret)
+		  __entry->len, show_mflags(__entry->flags), __entry->ret)
 );
 
 TRACE_EVENT(ext4_ext_put_in_cache,
@@ -1926,7 +2026,7 @@ TRACE_EVENT(ext4_ext_show_extent,
 TRACE_EVENT(ext4_remove_blocks,
 	    TP_PROTO(struct inode *inode, struct ext4_extent *ex,
 		     ext4_lblk_t from, ext4_fsblk_t to,
-		     ext4_fsblk_t partial_cluster),
+		     long long partial_cluster),
 
 	TP_ARGS(inode, ex, from, to, partial_cluster),
 
@@ -1935,7 +2035,7 @@ TRACE_EVENT(ext4_remove_blocks,
 		__field( ino_t, ino )
 		__field( ext4_lblk_t, from )
 		__field( ext4_lblk_t, to )
-		__field( ext4_fsblk_t, partial )
+		__field( long long, partial )
 		__field( ext4_fsblk_t, ee_pblk )
 		__field( ext4_lblk_t, ee_lblk )
 		__field( unsigned short, ee_len )
@@ -1953,7 +2053,7 @@ TRACE_EVENT(ext4_remove_blocks,
 	),
 
 	TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
-		  "from %u to %u partial_cluster %u",
+		  "from %u to %u partial_cluster %lld",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->ee_lblk,
@@ -1961,19 +2061,20 @@ TRACE_EVENT(ext4_remove_blocks,
 		  (unsigned short) __entry->ee_len,
 		  (unsigned) __entry->from,
 		  (unsigned) __entry->to,
-		  (unsigned) __entry->partial)
+		  (long long) __entry->partial)
 );
 
 TRACE_EVENT(ext4_ext_rm_leaf,
 	TP_PROTO(struct inode *inode, ext4_lblk_t start,
-		 struct ext4_extent *ex, ext4_fsblk_t partial_cluster),
+		 struct ext4_extent *ex,
+		 long long partial_cluster),
 
 	TP_ARGS(inode, start, ex, partial_cluster),
 
 	TP_STRUCT__entry(
 		__field( dev_t, dev )
 		__field( ino_t, ino )
-		__field( ext4_fsblk_t, partial )
+		__field( long long, partial )
 		__field( ext4_lblk_t, start )
 		__field( ext4_lblk_t, ee_lblk )
 		__field( ext4_fsblk_t, ee_pblk )
@@ -1991,14 +2092,14 @@ TRACE_EVENT(ext4_ext_rm_leaf,
 	),
 
 	TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
-		  "partial_cluster %u",
+		  "partial_cluster %lld",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
 		  (unsigned) __entry->ee_lblk,
 		  (unsigned long long) __entry->ee_pblk,
 		  (unsigned short) __entry->ee_len,
-		  (unsigned) __entry->partial)
+		  (long long) __entry->partial)
 );
 
 TRACE_EVENT(ext4_ext_rm_idx,
@@ -2025,14 +2126,16 @@ TRACE_EVENT(ext4_ext_rm_idx,
 );
 
 TRACE_EVENT(ext4_ext_remove_space,
-	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth),
+	TP_PROTO(struct inode *inode, ext4_lblk_t start,
+		 ext4_lblk_t end, int depth),
 
-	TP_ARGS(inode, start, depth),
+	TP_ARGS(inode, start, end, depth),
 
 	TP_STRUCT__entry(
 		__field( dev_t, dev )
 		__field( ino_t, ino )
 		__field( ext4_lblk_t, start )
+		__field( ext4_lblk_t, end )
 		__field( int, depth )
 	),
 
@@ -2040,28 +2143,31 @@ TRACE_EVENT(ext4_ext_remove_space,
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
 		__entry->start = start;
+		__entry->end = end;
 		__entry->depth = depth;
 	),
 
-	TP_printk("dev %d,%d ino %lu since %u depth %d",
+	TP_printk("dev %d,%d ino %lu since %u end %u depth %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
+		  (unsigned) __entry->end,
 		  __entry->depth)
 );
 
 TRACE_EVENT(ext4_ext_remove_space_done,
-	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
-		 ext4_lblk_t partial, __le16 eh_entries),
+	TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
+		 int depth, long long partial, __le16 eh_entries),
 
-	TP_ARGS(inode, start, depth, partial, eh_entries),
+	TP_ARGS(inode, start, end, depth, partial, eh_entries),
 
 	TP_STRUCT__entry(
 		__field( dev_t, dev )
 		__field( ino_t, ino )
 		__field( ext4_lblk_t, start )
+		__field( ext4_lblk_t, end )
 		__field( int, depth )
-		__field( ext4_lblk_t, partial )
+		__field( long long, partial )
 		__field( unsigned short, eh_entries )
 	),
 
@@ -2069,18 +2175,20 @@ TRACE_EVENT(ext4_ext_remove_space_done,
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
 		__entry->start = start;
+		__entry->end = end;
 		__entry->depth = depth;
 		__entry->partial = partial;
 		__entry->eh_entries = le16_to_cpu(eh_entries);
 	),
 
-	TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
+	TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld "
 		  "remaining_entries %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
+		  (unsigned) __entry->end,
 		  __entry->depth,
-		  (unsigned) __entry->partial,
+		  (long long) __entry->partial,
 		  (unsigned short) __entry->eh_entries)
 );
 
@@ -2095,7 +2203,7 @@ TRACE_EVENT(ext4_es_insert_extent,
 		__field( ext4_lblk_t, lblk )
 		__field( ext4_lblk_t, len )
 		__field( ext4_fsblk_t, pblk )
-		__field( unsigned long long, status )
+		__field( char, status )
 	),
 
 	TP_fast_assign(
@@ -2104,14 +2212,14 @@ TRACE_EVENT(ext4_es_insert_extent,
 		__entry->lblk = es->es_lblk;
 		__entry->len = es->es_len;
 		__entry->pblk = ext4_es_pblock(es);
-		__entry->status = ext4_es_status(es);
+		__entry->status = ext4_es_status(es) >> 60;
 	),
 
-	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx",
+	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->lblk, __entry->len,
-		  __entry->pblk, __entry->status)
+		  __entry->pblk, show_extent_status(__entry->status))
 );
 
 TRACE_EVENT(ext4_es_remove_extent,
@@ -2172,7 +2280,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_exit,
 		__field( ext4_lblk_t, lblk )
 		__field( ext4_lblk_t, len )
 		__field( ext4_fsblk_t, pblk )
-		__field( unsigned long long, status )
+		__field( char, status )
 	),
 
 	TP_fast_assign(
@@ -2181,14 +2289,14 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_exit,
 		__entry->lblk = es->es_lblk;
 		__entry->len = es->es_len;
 		__entry->pblk = ext4_es_pblock(es);
-		__entry->status = ext4_es_status(es);
+		__entry->status = ext4_es_status(es) >> 60;
 	),
 
-	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx",
+	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->lblk, __entry->len,
-		  __entry->pblk, __entry->status)
+		  __entry->pblk, show_extent_status(__entry->status))
 );
 
 TRACE_EVENT(ext4_es_lookup_extent_enter,
@@ -2225,7 +2333,7 @@ TRACE_EVENT(ext4_es_lookup_extent_exit,
 		__field( ext4_lblk_t, lblk )
 		__field( ext4_lblk_t, len )
 		__field( ext4_fsblk_t, pblk )
-		__field( unsigned long long, status )
+		__field( char, status )
 		__field( int, found )
 	),
 
@@ -2235,16 +2343,16 @@ TRACE_EVENT(ext4_es_lookup_extent_exit,
 		__entry->lblk = es->es_lblk;
 		__entry->len = es->es_len;
 		__entry->pblk = ext4_es_pblock(es);
-		__entry->status = ext4_es_status(es);
+		__entry->status = ext4_es_status(es) >> 60;
 		__entry->found = found;
 	),
 
-	TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %llx",
+	TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->found,
 		  __entry->lblk, __entry->len,
 		  __entry->found ? __entry->pblk : 0,
-		  __entry->found ? __entry->status : 0)
+		  show_extent_status(__entry->found ? __entry->status : 0))
 );
 
 TRACE_EVENT(ext4_es_shrink_enter,
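
The ">> 60" in the tracepoints above relies on the extent-status flags living in the top bits of the 64-bit es_pblk word, with the physical block number in the low bits: the shift turns ext4_es_status()'s masked value into a small integer that fits the new 'char' field and that show_extent_status() can pretty-print. A minimal userspace sketch of that packing follows; the constants and helper names are illustrative, not the kernel's.

#include <stdio.h>
#include <stdint.h>

#define ES_SHIFT 60				/* assumed: flags occupy bits 63..60 */
#define ES_MASK  (~((1ULL << ES_SHIFT) - 1))

/* low 60 bits: physical block number */
static uint64_t es_pblock(uint64_t es_pblk) { return es_pblk & ~ES_MASK; }
/* top 4 bits: status flags, still in place (cf. ext4_es_status()) */
static uint64_t es_status(uint64_t es_pblk) { return es_pblk & ES_MASK; }

int main(void)
{
	/* a "written"-style flag in bit 63 plus physical block 123456 */
	uint64_t es_pblk = (1ULL << 63) | 123456;

	/* the tracepoints store es_status() >> 60, i.e. 0x8 here */
	printf("pblk=%llu status=0x%llx\n",
	       (unsigned long long)es_pblock(es_pblk),
	       (unsigned long long)(es_status(es_pblk) >> ES_SHIFT));
	return 0;
}
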
diff --git a/mm/readahead.c b/mm/readahead.c
index daed28dd5830..829a77c62834 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -48,7 +48,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping,
 	if (!trylock_page(page))
 		BUG();
 	page->mapping = mapping;
-	do_invalidatepage(page, 0);
+	do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 	page->mapping = NULL;
 	unlock_page(page);
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index c75b736e54b7..e2e8a8a7eb9d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -26,7 +26,8 @@
 /**
  * do_invalidatepage - invalidate part or all of a page
  * @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
  *
  * do_invalidatepage() is called when all or part of the page has become
  * invalidated by a truncate operation.
@@ -37,24 +38,18 @@
  * point.  Because the caller is about to free (and possibly reuse) those
  * blocks on-disk.
  */
-void do_invalidatepage(struct page *page, unsigned long offset)
+void do_invalidatepage(struct page *page, unsigned int offset,
+		       unsigned int length)
 {
-	void (*invalidatepage)(struct page *, unsigned long);
+	void (*invalidatepage)(struct page *, unsigned int, unsigned int);
+
 	invalidatepage = page->mapping->a_ops->invalidatepage;
 #ifdef CONFIG_BLOCK
 	if (!invalidatepage)
 		invalidatepage = block_invalidatepage;
 #endif
 	if (invalidatepage)
-		(*invalidatepage)(page, offset);
-}
-
-static inline void truncate_partial_page(struct page *page, unsigned partial)
-{
-	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
-	cleancache_invalidate_page(page->mapping, page);
-	if (page_has_private(page))
-		do_invalidatepage(page, partial);
+		(*invalidatepage)(page, offset, length);
 }
 
 /*
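
With the extra length argument, an ->invalidatepage implementation can tell a whole-page invalidation (offset 0, length PAGE_CACHE_SIZE, as the updated callers above pass) from a partial one. A hedged sketch of the shape such a handler now takes; example_invalidatepage is hypothetical, only the signature comes from the patch.

static void example_invalidatepage(struct page *page, unsigned int offset,
				   unsigned int length)
{
	if (offset == 0 && length == PAGE_CACHE_SIZE) {
		/* the whole page is going away: drop all private state */
	} else {
		/* only bytes [offset, offset + length) are invalidated;
		 * private state outside that range presumably survives */
	}
}
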
@@ -103,7 +98,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 		return -EIO;
 
 	if (page_has_private(page))
-		do_invalidatepage(page, 0);
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 
 	cancel_dirty_page(page, PAGE_CACHE_SIZE);
 
@@ -185,11 +180,11 @@ int invalidate_inode_page(struct page *page)
  * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
- * @lend: offset to which to truncate
+ * @lend: offset to which to truncate (inclusive)
  *
  * Truncate the page cache, removing the pages that are between
- * specified offsets (and zeroing out partial page
- * (if lstart is not page aligned)).
+ * specified offsets (and zeroing out partial pages
+ * if lstart or lend + 1 is not page aligned).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
@@ -200,35 +195,58 @@ int invalidate_inode_page(struct page *page)
  * We pass down the cache-hot hint to the page freeing code.  Even if the
  * mapping is large, it is probably the case that the final pages are the most
  * recently touched, and freeing happens in ascending file offset order.
+ *
+ * Note that since ->invalidatepage() accepts range to invalidate
+ * truncate_inode_pages_range is able to handle cases where lend + 1 is not
+ * page aligned properly.
  */
 void truncate_inode_pages_range(struct address_space *mapping,
 				loff_t lstart, loff_t lend)
 {
-	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
-	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-	struct pagevec pvec;
-	pgoff_t index;
-	pgoff_t end;
-	int i;
+	pgoff_t		start;		/* inclusive */
+	pgoff_t		end;		/* exclusive */
+	unsigned int	partial_start;	/* inclusive */
+	unsigned int	partial_end;	/* exclusive */
+	struct pagevec	pvec;
+	pgoff_t		index;
+	int		i;
 
 	cleancache_invalidate_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
-	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
-	end = (lend >> PAGE_CACHE_SHIFT);
+	/* Offsets within partial pages */
+	partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+
+	/*
+	 * 'start' and 'end' always covers the range of pages to be fully
+	 * truncated. Partial pages are covered with 'partial_start' at the
+	 * start of the range and 'partial_end' at the end of the range.
+	 * Note that 'end' is exclusive while 'lend' is inclusive.
+	 */
+	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (lend == -1)
+		/*
+		 * lend == -1 indicates end-of-file so we have to set 'end'
+		 * to the highest possible pgoff_t and since the type is
+		 * unsigned we're using -1.
+		 */
+		end = -1;
+	else
+		end = (lend + 1) >> PAGE_CACHE_SHIFT;
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while (index <= end && pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+	while (index < end && pagevec_lookup(&pvec, mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
 			/* We rely upon deletion not changing page->index */
 			index = page->index;
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (!trylock_page(page))
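
To make the new bookkeeping concrete, here is a small userspace sketch of the start/end/partial computation above, assuming 4 KiB pages and the inclusive byte range 1536..10239; the local names mirror the kernel's, nothing else is taken from it.

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12			/* assumed 4 KiB pages */
#define PAGE_CACHE_SIZE  (1L << PAGE_CACHE_SHIFT)

int main(void)
{
	long long lstart = 1536, lend = 10239;	/* lend is inclusive */
	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned long end = (lend == -1) ? (unsigned long)-1
			: (unsigned long)((lend + 1) >> PAGE_CACHE_SHIFT);

	/* prints: start=1 end=2 partial_start=1536 partial_end=2048
	 * page 1 ([start, end)) is dropped whole; page 0 keeps bytes
	 * 0..1535 and page 2 keeps bytes 2048..4095, with the truncated
	 * part of each partial page zeroed rather than freed */
	printf("start=%lu end=%lu partial_start=%u partial_end=%u\n",
	       start, end, partial_start, partial_end);
	return 0;
}
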
@@ -247,27 +265,56 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			index++;
 	}
 
-	if (partial) {
+	if (partial_start) {
 		struct page *page = find_lock_page(mapping, start - 1);
 		if (page) {
+			unsigned int top = PAGE_CACHE_SIZE;
+			if (start > end) {
+				/* Truncation within a single page */
+				top = partial_end;
+				partial_end = 0;
+			}
 			wait_on_page_writeback(page);
-			truncate_partial_page(page, partial);
+			zero_user_segment(page, partial_start, top);
+			cleancache_invalidate_page(mapping, page);
+			if (page_has_private(page))
+				do_invalidatepage(page, partial_start,
+						  top - partial_start);
 			unlock_page(page);
 			page_cache_release(page);
 		}
 	}
+	if (partial_end) {
+		struct page *page = find_lock_page(mapping, end);
+		if (page) {
+			wait_on_page_writeback(page);
+			zero_user_segment(page, 0, partial_end);
+			cleancache_invalidate_page(mapping, page);
+			if (page_has_private(page))
+				do_invalidatepage(page, 0,
+						  partial_end);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	/*
+	 * If the truncation happened within a single page no pages
+	 * will be released, just zeroed, so we can bail out now.
+	 */
+	if (start >= end)
+		return;
 
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
 		if (!pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
 			if (index == start)
 				break;
 			index = start;
 			continue;
 		}
-		if (index == start && pvec.pages[0]->index > end) {
+		if (index == start && pvec.pages[0]->index >= end) {
 			pagevec_release(&pvec);
 			break;
 		}
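
The start > end branch in the hunk above covers a hole that lies entirely within one page. Working the arithmetic with the same 4 KiB assumption: for lstart = 100, lend = 199, partial_start = 100, partial_end = 200, start = 1 and end = 0, so start > end holds, top is set to 200, bytes [100, 200) of page 0 are zeroed in a single zero_user_segment() call, partial_end is cleared so the second partial-page block is skipped, and start >= end makes the function return before the page-freeing loop, since no whole page needs releasing.
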
@@ -277,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 
 		/* We rely upon deletion not changing page->index */
 		index = page->index;
-		if (index > end)
+		if (index >= end)
 			break;
 
 		lock_page(page);
@@ -598,10 +645,8 @@ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
 	 * This rounding is currently just for example: unmap_mapping_range
 	 * expands its hole outwards, whereas we want it to contract the hole
 	 * inwards.  However, existing callers of truncate_pagecache_range are
-	 * doing their own page rounding first; and truncate_inode_pages_range
-	 * currently BUGs if lend is not pagealigned-1 (it handles partial
-	 * page at start of hole, but not partial page at end of hole).  Note
-	 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
+	 * doing their own page rounding first.  Note that unmap_mapping_range
+	 * allows holelen 0 for all, and we allow lend -1 for end of file.
 	 */
 
 	/*
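
With the BUG_ON on unaligned lend removed, hole-punching paths can hand truncate_inode_pages_range() an arbitrary inclusive byte range instead of a page-aligned-minus-one one. A hedged sketch of the calling convention; the surrounding context is hypothetical, only the function and its (mapping, lstart, lend) signature come from this patch.

	/* punch bytes 100..199 of this inode out of the page cache */
	truncate_inode_pages_range(inode->i_mapping, 100, 199);

	/* lend == -1 still means "to end of file" */
	truncate_inode_pages_range(inode->i_mapping, 0, -1);
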