 63 files changed, 2652 insertions(+), 2170 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index bdd82b2339d9..9858f337529c 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -189,7 +189,7 @@ prototypes:
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
-	int (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
@@ -310,8 +310,8 @@ filesystems and by the swapper. The latter will eventually go away. Please,
 keep it that way and don't breed new callers.
 
 	->invalidatepage() is called when the filesystem must attempt to drop
-some or all of the buffers from the page when it is being truncated.  It
-returns zero on success.  If ->invalidatepage is zero, the kernel uses
+some or all of the buffers from the page when it is being truncated. It
+returns zero on success. If ->invalidatepage is zero, the kernel uses
 block_invalidatepage() instead.
 
 	->releasepage() is called when the kernel is about to try to drop the
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 4a35f6614a66..e6bd1ffd821e 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -549,7 +549,7 @@ struct address_space_operations
 -------------------------------
 
 This describes how the VFS can manipulate mapping of a file to page cache in
-your filesystem. As of kernel 2.6.22, the following members are defined:
+your filesystem. The following members are defined:
 
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -566,7 +566,7 @@ struct address_space_operations {
 		loff_t pos, unsigned len, unsigned copied,
 		struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
-	int (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
@@ -685,14 +685,14 @@ struct address_space_operations {
   invalidatepage: If a page has PagePrivate set, then invalidatepage
 	will be called when part or all of the page is to be removed
 	from the address space. This generally corresponds to either a
-	truncation or a complete invalidation of the address space
-	(in the latter case 'offset' will always be 0).
-	Any private data associated with the page should be updated
-	to reflect this truncation. If offset is 0, then
-	the private data should be released, because the page
-	must be able to be completely discarded. This may be done by
-	calling the ->releasepage function, but in this case the
+	truncation, punch hole or a complete invalidation of the address
+	space (in the latter case 'offset' will always be 0 and 'length'
+	will be PAGE_CACHE_SIZE). Any private data associated with the page
+	should be updated to reflect this truncation. If offset is 0 and
+	length is PAGE_CACHE_SIZE, then the private data should be released,
+	because the page must be able to be completely discarded. This may
+	be done by calling the ->releasepage function, but in this case the
 	release MUST succeed.
 
   releasepage: releasepage is called on PagePrivate pages to indicate
 	that the page should be freed if possible. ->releasepage
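Taken together with the prototype change above, the documented contract reduces to one pattern, which the filesystem callbacks converted in the rest of this patch all follow. As a hedged illustration only (myfs_invalidatepage() and myfs_drop_private() are invented names, not part of any real filesystem):

static void myfs_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	BUG_ON(!PageLocked(page));

	/*
	 * Only a full-page invalidation (offset 0, length PAGE_CACHE_SIZE)
	 * requires the private data to be released; a partial range from
	 * a hole punch only needs the private state brought up to date.
	 */
	if (offset == 0 && length == PAGE_CACHE_SIZE)
		myfs_drop_private(page);	/* hypothetical helper */
}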
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055562c580b4..9ff073f4090a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
  * @offset: offset in the page
  */
 
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	/*
 	 * If called with zero offset, we should release
 	 * the private state assocated with the page
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		v9fs_fscache_invalidate_page(page);
 }
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d565..66d50fe2ee45 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
 * - release a page and clean up its private data if offset is 0 (indicating
 *   the entire page)
 */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length)
 {
 	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter("{%lu},%u,%u", page->index, offset, length);
 
 	BUG_ON(!PageLocked(page));
 
 	/* we clean up only if the entire page is being invalidated */
-	if (offset == 0) {
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
 #ifdef CONFIG_AFS_FSCACHE
 	if (PageFsCache(page)) {
 		struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8b60b660c8f..b0292b3ead54 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	return try_release_extent_buffer(page);
 }
 
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e7e7afb4a872..6bca9472f313 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2957,7 +2957,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		return 0;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a46b656d08de..4f9d16b70d3d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7493,7 +7493,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 
-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *tree;
diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..f93392e2df12 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1454,7 +1454,8 @@ static void discard_buffer(struct buffer_head * bh)
  * block_invalidatepage - invalidate part or all of a buffer-backed page
  *
  * @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
  *
  * block_invalidatepage() is called when all or part of the page has become
  * invalidated by a truncate operation.
@@ -1465,15 +1466,22 @@ static void discard_buffer(struct buffer_head * bh)
  * point. Because the caller is about to free (and possibly reuse) those
  * blocks on-disk.
  */
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	unsigned int curr_off = 0;
+	unsigned int stop = length + offset;
 
 	BUG_ON(!PageLocked(page));
 	if (!page_has_buffers(page))
 		goto out;
 
+	/*
+	 * Check for overflow
+	 */
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
@@ -1481,6 +1489,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
 		next = bh->b_this_page;
 
 		/*
+		 * Are we still fully in range ?
+		 */
+		if (next_off > stop)
+			goto out;
+
+		/*
 		 * is this block fully invalidated?
 		 */
 		if (offset <= curr_off)
@@ -1501,6 +1515,7 @@ out:
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
+
 /*
 * We attach and possibly dirty the buffers atomically wrt
 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -2841,7 +2856,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 	 * they may have been added in ext3_writepage(). Make them
 	 * freeable here, so the page does not leak.
 	 */
-	do_invalidatepage(page, 0);
+	do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 	unlock_page(page);
 	return 0; /* don't care */
 }
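The new range check means block_invalidatepage() only discards buffers lying entirely inside [offset, offset+length). A standalone toy model of the loop above (plain userspace C, not kernel code) makes the arithmetic concrete: with four 1024-byte buffers in a 4096-byte page, invalidating offset=1024, length=2048 discards only the buffers at 1024 and 2048:

#include <stdio.h>

int main(void)
{
	unsigned int page_size = 4096, blocksize = 1024;
	unsigned int offset = 1024, length = 2048;
	unsigned int stop = offset + length;	/* 3072 */
	unsigned int curr_off = 0;

	while (curr_off < page_size) {
		unsigned int next_off = curr_off + blocksize;

		if (next_off > stop)		/* buffer extends past the range */
			break;
		if (offset <= curr_off)		/* buffer fully inside the range */
			printf("discard buffer at offset %u\n", curr_off);
		curr_off = next_off;
	}
	return 0;
}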
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac101040..38b5c1bc6776 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
 * dirty page counters appropriately. Only called if there is private
 * data on the page.
 */
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
 	if (!PageDirty(page))
 		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
 	ci = ceph_inode(inode);
-	if (offset == 0) {
-		dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
-		     inode, page, page->index, offset);
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
+		dout("%p invalidatepage %p idx %lu full dirty page\n",
+		     inode, page, page->index);
 		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 		ceph_put_snap_context(snapc);
 		page->private = 0;
 		ClearPagePrivate(page);
 	} else {
-		dout("%p invalidatepage %p idx %lu partial dirty page\n",
-		     inode, page, page->index);
+		dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+		     inode, page, page->index, offset, length);
 	}
 }
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 48b29d24c9f4..4d8ba8d491e5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3546,11 +3546,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
 	return cifs_fscache_release_page(page, gfp);
 }
 
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd8828..2ec8eb1ab269 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
 	return 0;
 }
 
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
-	EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+		     page->index, offset, length);
 	WARN_ON(1);
 }
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c712825640..f67668f724ba 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
-	trace_ext3_invalidatepage(page, offset);
+	trace_ext3_invalidatepage(page, offset, length);
 
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
-	journal_invalidatepage(journal, page, offset);
+	journal_invalidatepage(journal, page, offset, length);
 }
 
 static int ext3_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 692de13e3596..cea8ecf3e76e 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
 			(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
 				+((char *)de - bh->b_data))) {
-			/* On error, skip the f_pos to the next block. */
-			dir_file->f_pos = (dir_file->f_pos |
-				(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse (bh);
-			return count;
+			/* silently ignore the rest of the block */
+			break;
 		}
 		ext3fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..58339393fa6e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 
 static inline int test_root(ext4_group_t a, int b)
 {
-	int num = b;
-
-	while (a > num)
-		num *= b;
-	return num == a;
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
 }
 
 static int ext4_group_sparse(ext4_group_t group)
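The rewritten test_root() decides whether group number a is a power of b by dividing down instead of multiplying up. The old loop could overflow num for large group numbers and give a wrong answer, whereas repeated division is bounded by the number of base-b digits of a and always terminates. A standalone sketch of the new logic (plain userspace C; the main() is invented for illustration):

#include <stdio.h>

/* Divide-down power test, mirroring the new kernel helper: returns 1
 * iff a is a positive power of b with a >= b. Cannot overflow. */
static int test_root(unsigned int a, unsigned int b)
{
	while (1) {
		if (a < b)
			return 0;
		if (a == b)
			return 1;
		if ((a % b) != 0)
			return 0;
		a = a / b;
	}
}

int main(void)
{
	/* 343 = 7^3 passes; 342 fails at the divisibility check. */
	printf("%d %d\n", test_root(343, 7), test_root(342, 7));
	return 0;
}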
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4af03ea84aa3..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,38 +177,28 @@ struct ext4_map_blocks {
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
-/*
 * Flags for ext4_io_end->flags
 */
 #define EXT4_IO_END_UNWRITTEN	0x0001
-#define EXT4_IO_END_ERROR	0x0002
-#define EXT4_IO_END_DIRECT	0x0004
+#define EXT4_IO_END_DIRECT	0x0002
 
 /*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
 */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
+	handle_t		*handle;	/* handle reserved for extent
+						 * conversion */
 	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
 	unsigned int		flag;		/* unwritten or not */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -581,11 +571,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 
 /*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
-
-/*
 * ioctl commands
 */
 #define EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -879,6 +864,7 @@ struct ext4_inode_info {
 	rwlock_t i_es_lock;
 	struct list_head i_es_lru;
 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
+	unsigned long i_touch_when;	/* jiffies of last accessing */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -903,12 +889,22 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed IOs that might need unwritten extents handling */
-	struct list_head i_completed_io_list;
+	/* Lock protecting lists below */
 	spinlock_t i_completed_io_lock;
+	/*
+	 * Completed IOs that need unwritten extents handling and have
+	 * transaction reserved
+	 */
+	struct list_head i_rsv_conversion_list;
+	/*
+	 * Completed IOs that need unwritten extents handling and don't have
+	 * transaction reserved
+	 */
+	struct list_head i_unrsv_conversion_list;
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	atomic_t i_unwritten;	/* Nr. of inflight conversions pending */
-	struct work_struct i_unwritten_work;	/* deferred extent conversion */
+	struct work_struct i_rsv_conversion_work;
+	struct work_struct i_unrsv_conversion_work;
 
 	spinlock_t i_block_reservation_lock;
 
@@ -1245,7 +1241,6 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
-	unsigned int s_max_writeback_mb_bump;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1281,8 +1276,10 @@ struct ext4_sb_info {
 	struct flex_groups *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
-	/* workqueue for dio unwritten */
-	struct workqueue_struct *dio_unwritten_wq;
+	/* workqueue for unreserved extent convertions (dio) */
+	struct workqueue_struct *unrsv_conversion_wq;
+	/* workqueue for reserved extent conversions (buffered io) */
+	struct workqueue_struct *rsv_conversion_wq;
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
@@ -1307,6 +1304,7 @@ struct ext4_sb_info {
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
 	struct list_head s_es_lru;
+	unsigned long s_es_last_sorted;
 	struct percpu_counter s_extent_cache_cnt;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 };
@@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 					      struct ext4_io_end *io_end)
 {
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		/* Writeback has to have coversion transaction reserved */
+		WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+			!(io_end->flag & EXT4_IO_END_DIRECT));
 		io_end->flag |= EXT4_IO_END_UNWRITTEN;
 		atomic_inc(&EXT4_I(inode)->i_unwritten);
 	}
@@ -1999,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+		loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				  const struct iovec *iov, loff_t offset,
 				  unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
 extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
 				 ext4_lblk_t first, ext4_lblk_t stop);
@@ -2166,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
 				    ext4_group_t ngroup);
 extern const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16]);
+
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_error(sb, message...)	__ext4_error(sb, __func__, \
-						     __LINE__, ## message)
 extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
-		      const char *, ...);
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+		      const char *, ...);
 extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
-		     const char *, ...);
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+		     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern __printf(4, 5)
 void __ext4_abort(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \
-						     __LINE__, ## message)
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
 		    const char *, ...);
-#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, \
-						      __LINE__, ## message)
 extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
 			   const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
-						       __LINE__, msg)
 extern __printf(7, 8)
 void __ext4_grp_locked_error(const char *, unsigned int,
 			     struct super_block *, ext4_group_t,
 			     unsigned long, ext4_fsblk_t,
 			     const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
-	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...) \
+	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...) \
+	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...) \
+	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...) \
+	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg) \
+	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+				fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+do { \
+	no_printk(fmt, ##__VA_ARGS__); \
+	__ext4_error_inode(inode, "", 0, block, " "); \
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+do { \
+	no_printk(fmt, ##__VA_ARGS__); \
+	__ext4_error_file(file, "", 0, block, " "); \
+} while (0)
+#define ext4_error(sb, fmt, ...) \
+do { \
+	no_printk(fmt, ##__VA_ARGS__); \
+	__ext4_error(sb, "", 0, " "); \
+} while (0)
+#define ext4_abort(sb, fmt, ...) \
+do { \
+	no_printk(fmt, ##__VA_ARGS__); \
+	__ext4_abort(sb, "", 0, " "); \
+} while (0)
+#define ext4_warning(sb, fmt, ...) \
+do { \
+	no_printk(fmt, ##__VA_ARGS__); \
+	__ext4_warning(sb, "", 0, " "); \
+} while (0)
+#define ext4_msg(sb, level, fmt, ...) \
+do { \
+	no_printk(fmt, ##__VA_ARGS__); \
+	__ext4_msg(sb, "", " "); \
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg) \
+	__dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+do { \
+	no_printk(fmt, ##__VA_ARGS__); \
+	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \
+} while (0)
+
+#endif
+
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 				      __u32 compat);
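The CONFIG_PRINTK=n stubs above route the format string through the kernel's no_printk(), so the arguments still get printf-format type checking even though no message is emitted and the strings can be discarded. The same trick works outside the kernel; a userspace analog follows (a hedged sketch: the macro imitates, rather than reuses, the kernel's no_printk(), and relies on the GCC statement-expression extension):

#include <stdio.h>

#define no_printk(fmt, ...)			\
({						\
	if (0)					\
		printf(fmt, ##__VA_ARGS__);	\
	0;					\
})

#define debug_msg(fmt, ...) no_printk(fmt, ##__VA_ARGS__)

int main(void)
{
	/* Prints nothing, but a mismatched format (e.g. "%s" with 42)
	 * would still draw a compiler warning. */
	debug_msg("value: %d\n", 42);
	return 0;
}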
@@ -2312,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 {
 	struct ext4_group_info ***grp_info;
 	long indexv, indexh;
+	BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
 	grp_info = EXT4_SB(sb)->s_group_info;
 	indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
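The lookup guarded by the new BUG_ON is a two-level radix: the high bits of the group number pick a descriptor block, the low bits pick a slot within it. A worked example (standalone C; the 128-descriptors-per-block figure is an assumed value for illustration):

#include <stdio.h>

int main(void)
{
	unsigned int desc_per_block_bits = 7;	/* assume 2^7 = 128 per block */
	unsigned int group = 300;
	unsigned int indexv = group >> desc_per_block_bits;		  /* 2 */
	unsigned int indexh = group & ((1U << desc_per_block_bits) - 1);  /* 44 */

	printf("group %u -> block %u, slot %u\n", group, indexv, indexh);
	return 0;
}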
@@ -2598,8 +2656,7 @@ struct ext4_extent;
 
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
-				       int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2609,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			   loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-					  ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2650,12 +2707,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       struct page *page,
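The page-io interface now hands out counted references to an io_end (get/put, plus a deferred put usable from interrupt context), so the structure lives until the last holder (the submitter or a completed bio) drops it. A toy model of that lifetime rule (standalone userspace C; every name below is invented):

#include <stdio.h>
#include <stdlib.h>

struct toy_io_end {
	int count;	/* reference counter, like the new io_end->count */
};

static struct toy_io_end *toy_get(struct toy_io_end *io)
{
	io->count++;
	return io;
}

static void toy_put(struct toy_io_end *io)
{
	if (--io->count == 0) {
		printf("last reference dropped, freeing io_end\n");
		free(io);
	}
}

int main(void)
{
	struct toy_io_end *io = malloc(sizeof(*io));

	if (!io)
		return 1;
	io->count = 1;	/* submitter's reference */
	toy_get(io);	/* reference pinned by an in-flight bio */
	toy_put(io);	/* bio completes */
	toy_put(io);	/* submitter lets go: structure is freed */
	return 0;
}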
@@ -2668,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
 extern int ext4_mmp_csum_verify(struct super_block *sb,
 				struct mmp_struct *mmp);
 
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
 	 = BH_JBDPrivateStart,
 	BH_AllocFromCluster,	/* allocated blocks were part of already
-				 * allocated cluster. Note that this flag will
-				 * never, ever appear in a buffer_head's state
-				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
-				 * this is used. */
+				 * allocated cluster. */
 };
 
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
 /*
 * Add new method to test whether block and inode bitmaps are properly
 * initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
 /*
 * Wrappers for jbd2_journal_start/end.
 */
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
 {
 	journal_t *journal;
 
 	might_sleep();
-
-	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
+		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
 	journal = EXT4_SB(sb)->s_journal;
-	if (!journal)
-		return ext4_get_nojournal();
 	/*
 	 * Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly.
 	 */
-	if (is_journal_aborted(journal)) {
+	if (journal && is_journal_aborted(journal)) {
 		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+		return -EROFS;
 	}
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 	return err;
 }
 
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
+}
+
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn, struct buffer_head *bh,
 			       handle_t *handle, int err)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
 #define EXT4_HT_MIGRATE          8
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
-#define EXT4_HT_MAX             11
+#define EXT4_HT_EXT_CONVERT     11
+#define EXT4_HT_MAX             12
 
 /**
 * struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
 	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks);
+				  int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks) \
-	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
 
 #define ext4_journal_start(inode, type, nblocks) \
-	__ext4_journal_start((inode), __LINE__, (type), (nblocks))
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
 					     unsigned int line, int type,
-					     int nblocks)
+					     int blocks, int rsv_blocks)
 {
-	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
 	__ext4_journal_stop(__func__, __LINE__, (handle))
 
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
+
 static inline handle_t *ext4_journal_current_handle(void)
 {
 	return journal_current_handle();
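The ext4_journal_start_with_reserve() / ext4_journal_start_reserved() pair added above splits transaction start in two: credits are set aside when the handle is created and only activated later, so the activating context (extent conversion after I/O completion) cannot fail for lack of journal space. A toy model of that two-phase pattern (standalone userspace C; the types and names are invented):

#include <stdio.h>
#include <stdlib.h>

struct toy_handle {
	int rsv_blocks;		/* credits set aside at creation */
	int started;
};

static struct toy_handle *toy_start_with_reserve(int blocks, int rsv_blocks)
{
	struct toy_handle *h = malloc(sizeof(*h));

	if (!h)
		return NULL;
	h->rsv_blocks = rsv_blocks;
	h->started = 0;
	printf("started with %d blocks, %d reserved for later\n",
	       blocks, rsv_blocks);
	return h;
}

static void toy_start_reserved(struct toy_handle *h)
{
	/* Activation cannot run out of credits: they were reserved above. */
	h->started = 1;
	printf("reserved handle activated with %d credits\n", h->rsv_blocks);
}

int main(void)
{
	struct toy_handle *h = toy_start_with_reserve(10, 2);

	if (!h)
		return 1;
	toy_start_reserved(h);	/* e.g. at extent-conversion time */
	free(h);
	return 0;
}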
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc0f1910b9cf..7097b0f680e6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		next_del = ext4_find_delayed_extent(inode, &es);
 		if (!exists && next_del) {
 			exists = 1;
-			flags |= FIEMAP_EXTENT_DELALLOC;
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
 		}
 		up_read(&EXT4_I(inode)->i_data_sem);
 
@@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
 *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
 *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
 */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
 	int index;
 	int depth;
@@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 
 	depth = ext_depth(inode);
 
-	if (chunk)
+	if (extents <= 1)
 		index = depth * 2;
 	else
 		index = depth * 3;
@@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
2357 | return index; | 2356 | return index; |
2358 | } | 2357 | } |
2359 | 2358 | ||
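The estimate is thus linear in the tree depth, with a larger per-level constant once more than one extent may be inserted. A minimal restatement of the rule above (plain C, for illustration):

        /* Worst-case index/leaf blocks touched when inserting extents
         * into an extent tree of the given depth (mirrors the code above). */
        static int index_trans_blocks(int depth, int extents)
        {
                return (extents <= 1) ? depth * 2 : depth * 3;
        }

For depth 2, a single-extent insert is budgeted at 4 blocks, a multi-extent insert at 6.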
2359 | static inline int get_default_free_blocks_flags(struct inode *inode) | ||
2360 | { | ||
2361 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
2362 | return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; | ||
2363 | else if (ext4_should_journal_data(inode)) | ||
2364 | return EXT4_FREE_BLOCKS_FORGET; | ||
2365 | return 0; | ||
2366 | } | ||
2367 | |||
2360 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | 2368 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, |
2361 | struct ext4_extent *ex, | 2369 | struct ext4_extent *ex, |
2362 | ext4_fsblk_t *partial_cluster, | 2370 | long long *partial_cluster, |
2363 | ext4_lblk_t from, ext4_lblk_t to) | 2371 | ext4_lblk_t from, ext4_lblk_t to) |
2364 | { | 2372 | { |
2365 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2373 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
2366 | unsigned short ee_len = ext4_ext_get_actual_len(ex); | 2374 | unsigned short ee_len = ext4_ext_get_actual_len(ex); |
2367 | ext4_fsblk_t pblk; | 2375 | ext4_fsblk_t pblk; |
2368 | int flags = 0; | 2376 | int flags = get_default_free_blocks_flags(inode); |
2369 | |||
2370 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
2371 | flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; | ||
2372 | else if (ext4_should_journal_data(inode)) | ||
2373 | flags |= EXT4_FREE_BLOCKS_FORGET; | ||
2374 | 2377 | ||
2375 | /* | 2378 | /* |
2376 | * For bigalloc file systems, we never free a partial cluster | 2379 | * For bigalloc file systems, we never free a partial cluster |
@@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |||
2388 | * partial cluster here. | 2391 | * partial cluster here. |
2389 | */ | 2392 | */ |
2390 | pblk = ext4_ext_pblock(ex) + ee_len - 1; | 2393 | pblk = ext4_ext_pblock(ex) + ee_len - 1; |
2391 | if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { | 2394 | if ((*partial_cluster > 0) && |
2395 | (EXT4_B2C(sbi, pblk) != *partial_cluster)) { | ||
2392 | ext4_free_blocks(handle, inode, NULL, | 2396 | ext4_free_blocks(handle, inode, NULL, |
2393 | EXT4_C2B(sbi, *partial_cluster), | 2397 | EXT4_C2B(sbi, *partial_cluster), |
2394 | sbi->s_cluster_ratio, flags); | 2398 | sbi->s_cluster_ratio, flags); |
@@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |||
2414 | && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { | 2418 | && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { |
2415 | /* tail removal */ | 2419 | /* tail removal */ |
2416 | ext4_lblk_t num; | 2420 | ext4_lblk_t num; |
2421 | unsigned int unaligned; | ||
2417 | 2422 | ||
2418 | num = le32_to_cpu(ex->ee_block) + ee_len - from; | 2423 | num = le32_to_cpu(ex->ee_block) + ee_len - from; |
2419 | pblk = ext4_ext_pblock(ex) + ee_len - num; | 2424 | pblk = ext4_ext_pblock(ex) + ee_len - num; |
2420 | ext_debug("free last %u blocks starting %llu\n", num, pblk); | 2425 | /* |
2426 | * Usually we want to free the partial cluster at the end of the ||
2427 | * extent, except when the cluster is still used by another ||
2428 | * extent (partial_cluster is negative). ||
2429 | */ | ||
2430 | if (*partial_cluster < 0 && | ||
2431 | -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) | ||
2432 | flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; | ||
2433 | |||
2434 | ext_debug("free last %u blocks starting %llu partial %lld\n", | ||
2435 | num, pblk, *partial_cluster); | ||
2421 | ext4_free_blocks(handle, inode, NULL, pblk, num, flags); | 2436 | ext4_free_blocks(handle, inode, NULL, pblk, num, flags); |
2422 | /* | 2437 | /* |
2423 | * If the block range to be freed didn't start at the | 2438 | * If the block range to be freed didn't start at the |
2424 | * beginning of a cluster, and we removed the entire | 2439 | * beginning of a cluster, and we removed the entire |
2425 | * extent, save the partial cluster here, since we | 2440 | * extent and the cluster is not used by any other extent, |
2426 | * might need to delete if we determine that the | 2441 | * save the partial cluster here, since we might need to |
2427 | * truncate operation has removed all of the blocks in | 2442 | * delete if we determine that the truncate operation has |
2428 | * the cluster. | 2443 | * removed all of the blocks in the cluster. |
2444 | * | ||
2445 | * On the other hand, if we did not manage to free the whole | ||
2446 | * extent, we have to mark the cluster as used (store a negative ||
2447 | * cluster number in partial_cluster). | ||
2429 | */ | 2448 | */ |
2430 | if (pblk & (sbi->s_cluster_ratio - 1) && | 2449 | unaligned = pblk & (sbi->s_cluster_ratio - 1); |
2431 | (ee_len == num)) | 2450 | if (unaligned && (ee_len == num) && |
2451 | (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) | ||
2432 | *partial_cluster = EXT4_B2C(sbi, pblk); | 2452 | *partial_cluster = EXT4_B2C(sbi, pblk); |
2433 | else | 2453 | else if (unaligned) |
2454 | *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); | ||
2455 | else if (*partial_cluster > 0) | ||
2434 | *partial_cluster = 0; | 2456 | *partial_cluster = 0; |
2435 | } else if (from == le32_to_cpu(ex->ee_block) | 2457 | } else |
2436 | && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { | 2458 | ext4_error(sbi->s_sb, "strange request: removal(2) " |
2437 | /* head removal */ | 2459 | "%u-%u from %u:%u\n", |
2438 | ext4_lblk_t num; | 2460 | from, to, le32_to_cpu(ex->ee_block), ee_len); |
2439 | ext4_fsblk_t start; | ||
2440 | |||
2441 | num = to - from; | ||
2442 | start = ext4_ext_pblock(ex); | ||
2443 | |||
2444 | ext_debug("free first %u blocks starting %llu\n", num, start); | ||
2445 | ext4_free_blocks(handle, inode, NULL, start, num, flags); | ||
2446 | |||
2447 | } else { | ||
2448 | printk(KERN_INFO "strange request: removal(2) " | ||
2449 | "%u-%u from %u:%u\n", | ||
2450 | from, to, le32_to_cpu(ex->ee_block), ee_len); | ||
2451 | } | ||
2452 | return 0; | 2461 | return 0; |
2453 | } | 2462 | } |
2454 | 2463 | ||
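Widening partial_cluster to long long gives it a three-state convention: zero means no partial cluster is tracked, a positive value c marks cluster c as a candidate for freeing, and a negative value -c marks cluster c as still in use. A sketch of the resulting test (hypothetical helper, mirroring the check near the top of ext4_remove_blocks()):

        static bool may_free_partial_cluster(long long partial_cluster,
                                             unsigned long long this_cluster)
        {
                /* Only a positive (candidate) value that is not the
                 * cluster we are currently working in may be freed. */
                return partial_cluster > 0 &&
                       (unsigned long long)partial_cluster != this_cluster;
        }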
@@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |||
2461 | * @handle: The journal handle | 2470 | * @handle: The journal handle |
2462 | * @inode: The files inode | 2471 | * @inode: The files inode |
2463 | * @path: The path to the leaf | 2472 | * @path: The path to the leaf |
2473 | * @partial_cluster: The cluster which we'll have to free if all extents | ||
2474 | * have been released from it. It becomes negative if ||
2475 | * the cluster is still in use. ||
2464 | * @start: The first block to remove | 2476 | * @start: The first block to remove |
2465 | * @end: The last block to remove | 2477 | * @end: The last block to remove |
2466 | */ | 2478 | */ |
2467 | static int | 2479 | static int |
2468 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | 2480 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, |
2469 | struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, | 2481 | struct ext4_ext_path *path, |
2482 | long long *partial_cluster, | ||
2470 | ext4_lblk_t start, ext4_lblk_t end) | 2483 | ext4_lblk_t start, ext4_lblk_t end) |
2471 | { | 2484 | { |
2472 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2485 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
@@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2479 | unsigned short ex_ee_len; | 2492 | unsigned short ex_ee_len; |
2480 | unsigned uninitialized = 0; | 2493 | unsigned uninitialized = 0; |
2481 | struct ext4_extent *ex; | 2494 | struct ext4_extent *ex; |
2495 | ext4_fsblk_t pblk; | ||
2482 | 2496 | ||
2483 | /* the header must be checked already in ext4_ext_remove_space() */ | 2497 | /* the header must be checked already in ext4_ext_remove_space() */ |
2484 | ext_debug("truncate since %u in leaf to %u\n", start, end); | 2498 | ext_debug("truncate since %u in leaf to %u\n", start, end); |
@@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2490 | return -EIO; | 2504 | return -EIO; |
2491 | } | 2505 | } |
2492 | /* find where to start removing */ | 2506 | /* find where to start removing */ |
2493 | ex = EXT_LAST_EXTENT(eh); | 2507 | ex = path[depth].p_ext; |
2508 | if (!ex) | ||
2509 | ex = EXT_LAST_EXTENT(eh); | ||
2494 | 2510 | ||
2495 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2511 | ex_ee_block = le32_to_cpu(ex->ee_block); |
2496 | ex_ee_len = ext4_ext_get_actual_len(ex); | 2512 | ex_ee_len = ext4_ext_get_actual_len(ex); |
@@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2517 | 2533 | ||
2518 | /* If this extent is beyond the end of the hole, skip it */ | 2534 | /* If this extent is beyond the end of the hole, skip it */ |
2519 | if (end < ex_ee_block) { | 2535 | if (end < ex_ee_block) { |
2536 | /* | ||
2537 | * We're going to skip this extent and move to another, | ||
2538 | * so if this extent is not cluster aligned, we have ||
2539 | * to mark the current cluster as used to avoid ||
2540 | * accidentally freeing it later on. ||
2541 | */ | ||
2542 | pblk = ext4_ext_pblock(ex); | ||
2543 | if (pblk & (sbi->s_cluster_ratio - 1)) | ||
2544 | *partial_cluster = | ||
2545 | -((long long)EXT4_B2C(sbi, pblk)); | ||
2520 | ex--; | 2546 | ex--; |
2521 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2547 | ex_ee_block = le32_to_cpu(ex->ee_block); |
2522 | ex_ee_len = ext4_ext_get_actual_len(ex); | 2548 | ex_ee_len = ext4_ext_get_actual_len(ex); |
@@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2592 | sizeof(struct ext4_extent)); | 2618 | sizeof(struct ext4_extent)); |
2593 | } | 2619 | } |
2594 | le16_add_cpu(&eh->eh_entries, -1); | 2620 | le16_add_cpu(&eh->eh_entries, -1); |
2595 | } else | 2621 | } else if (*partial_cluster > 0) |
2596 | *partial_cluster = 0; | 2622 | *partial_cluster = 0; |
2597 | 2623 | ||
2598 | err = ext4_ext_dirty(handle, inode, path + depth); | 2624 | err = ext4_ext_dirty(handle, inode, path + depth); |
@@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2610 | err = ext4_ext_correct_indexes(handle, inode, path); | 2636 | err = ext4_ext_correct_indexes(handle, inode, path); |
2611 | 2637 | ||
2612 | /* | 2638 | /* |
2613 | * If there is still a entry in the leaf node, check to see if | 2639 | * Free the partial cluster only if the current extent does not |
2614 | * it references the partial cluster. This is the only place | 2640 | * reference it. Otherwise we might free a used cluster. |
2615 | * where it could; if it doesn't, we can free the cluster. | ||
2616 | */ | 2641 | */ |
2617 | if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && | 2642 | if (*partial_cluster > 0 && |
2618 | (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != | 2643 | (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != |
2619 | *partial_cluster)) { | 2644 | *partial_cluster)) { |
2620 | int flags = EXT4_FREE_BLOCKS_FORGET; | 2645 | int flags = get_default_free_blocks_flags(inode); |
2621 | |||
2622 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
2623 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
2624 | 2646 | ||
2625 | ext4_free_blocks(handle, inode, NULL, | 2647 | ext4_free_blocks(handle, inode, NULL, |
2626 | EXT4_C2B(sbi, *partial_cluster), | 2648 | EXT4_C2B(sbi, *partial_cluster), |
@@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, | |||
2664 | struct super_block *sb = inode->i_sb; | 2686 | struct super_block *sb = inode->i_sb; |
2665 | int depth = ext_depth(inode); | 2687 | int depth = ext_depth(inode); |
2666 | struct ext4_ext_path *path = NULL; | 2688 | struct ext4_ext_path *path = NULL; |
2667 | ext4_fsblk_t partial_cluster = 0; | 2689 | long long partial_cluster = 0; |
2668 | handle_t *handle; | 2690 | handle_t *handle; |
2669 | int i = 0, err = 0; | 2691 | int i = 0, err = 0; |
2670 | 2692 | ||
@@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, | |||
2676 | return PTR_ERR(handle); | 2698 | return PTR_ERR(handle); |
2677 | 2699 | ||
2678 | again: | 2700 | again: |
2679 | trace_ext4_ext_remove_space(inode, start, depth); | 2701 | trace_ext4_ext_remove_space(inode, start, end, depth); |
2680 | 2702 | ||
2681 | /* | 2703 | /* |
2682 | * Check if we are removing extents inside the extent tree. If that | 2704 | * Check if we are removing extents inside the extent tree. If that |
@@ -2844,17 +2866,14 @@ again: | |||
2844 | } | 2866 | } |
2845 | } | 2867 | } |
2846 | 2868 | ||
2847 | trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, | 2869 | trace_ext4_ext_remove_space_done(inode, start, end, depth, |
2848 | path->p_hdr->eh_entries); | 2870 | partial_cluster, path->p_hdr->eh_entries); |
2849 | 2871 | ||
2850 | /* If we still have something in the partial cluster and we have removed | 2872 | /* If we still have something in the partial cluster and we have removed |
2851 | * even the first extent, then we should free the blocks in the partial | 2873 | * even the first extent, then we should free the blocks in the partial |
2852 | * cluster as well. */ | 2874 | * cluster as well. */ |
2853 | if (partial_cluster && path->p_hdr->eh_entries == 0) { | 2875 | if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { |
2854 | int flags = EXT4_FREE_BLOCKS_FORGET; | 2876 | int flags = get_default_free_blocks_flags(inode); |
2855 | |||
2856 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
2857 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
2858 | 2877 | ||
2859 | ext4_free_blocks(handle, inode, NULL, | 2878 | ext4_free_blocks(handle, inode, NULL, |
2860 | EXT4_C2B(EXT4_SB(sb), partial_cluster), | 2879 | EXT4_C2B(EXT4_SB(sb), partial_cluster), |
@@ -4363,7 +4382,7 @@ out2: | |||
4363 | } | 4382 | } |
4364 | 4383 | ||
4365 | out3: | 4384 | out3: |
4366 | trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); | 4385 | trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); |
4367 | 4386 | ||
4368 | return err ? err : allocated; | 4387 | return err ? err : allocated; |
4369 | } | 4388 | } |
@@ -4446,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) | |||
4446 | return -EOPNOTSUPP; | 4465 | return -EOPNOTSUPP; |
4447 | 4466 | ||
4448 | if (mode & FALLOC_FL_PUNCH_HOLE) | 4467 | if (mode & FALLOC_FL_PUNCH_HOLE) |
4449 | return ext4_punch_hole(file, offset, len); | 4468 | return ext4_punch_hole(inode, offset, len); |
4450 | 4469 | ||
4451 | ret = ext4_convert_inline_data(inode); | 4470 | ret = ext4_convert_inline_data(inode); |
4452 | if (ret) | 4471 | if (ret) |
@@ -4548,10 +4567,9 @@ retry: | |||
4548 | * function, to convert the fallocated extents after IO is completed. | 4567 | * function, to convert the fallocated extents after IO is completed. |
4549 | * Returns 0 on success. | 4568 | * Returns 0 on success. |
4550 | */ | 4569 | */ |
4551 | int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | 4570 | int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, |
4552 | ssize_t len) | 4571 | loff_t offset, ssize_t len) |
4553 | { | 4572 | { |
4554 | handle_t *handle; | ||
4555 | unsigned int max_blocks; | 4573 | unsigned int max_blocks; |
4556 | int ret = 0; | 4574 | int ret = 0; |
4557 | int ret2 = 0; | 4575 | int ret2 = 0; |
@@ -4566,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | |||
4566 | max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - | 4584 | max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - |
4567 | map.m_lblk); | 4585 | map.m_lblk); |
4568 | /* | 4586 | /* |
4569 | * credits to insert 1 extent into extent tree | 4587 | * This is somewhat ugly but the idea is clear: when a transaction is |
4588 | * reserved, everything goes into it. Otherwise we start several ||
4589 | * smaller transactions, one for the conversion of each extent. ||
4570 | */ | 4590 | */ |
4571 | credits = ext4_chunk_trans_blocks(inode, max_blocks); | 4591 | if (handle) { |
4592 | handle = ext4_journal_start_reserved(handle, | ||
4593 | EXT4_HT_EXT_CONVERT); | ||
4594 | if (IS_ERR(handle)) | ||
4595 | return PTR_ERR(handle); | ||
4596 | credits = 0; | ||
4597 | } else { | ||
4598 | /* | ||
4599 | * credits to insert one extent into the extent tree ||
4600 | */ | ||
4601 | credits = ext4_chunk_trans_blocks(inode, max_blocks); | ||
4602 | } | ||
4572 | while (ret >= 0 && ret < max_blocks) { | 4603 | while (ret >= 0 && ret < max_blocks) { |
4573 | map.m_lblk += ret; | 4604 | map.m_lblk += ret; |
4574 | map.m_len = (max_blocks -= ret); | 4605 | map.m_len = (max_blocks -= ret); |
4575 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); | 4606 | if (credits) { |
4576 | if (IS_ERR(handle)) { | 4607 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, |
4577 | ret = PTR_ERR(handle); | 4608 | credits); |
4578 | break; | 4609 | if (IS_ERR(handle)) { |
4610 | ret = PTR_ERR(handle); | ||
4611 | break; | ||
4612 | } | ||
4579 | } | 4613 | } |
4580 | ret = ext4_map_blocks(handle, inode, &map, | 4614 | ret = ext4_map_blocks(handle, inode, &map, |
4581 | EXT4_GET_BLOCKS_IO_CONVERT_EXT); | 4615 | EXT4_GET_BLOCKS_IO_CONVERT_EXT); |
@@ -4586,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | |||
4586 | inode->i_ino, map.m_lblk, | 4620 | inode->i_ino, map.m_lblk, |
4587 | map.m_len, ret); | 4621 | map.m_len, ret); |
4588 | ext4_mark_inode_dirty(handle, inode); | 4622 | ext4_mark_inode_dirty(handle, inode); |
4589 | ret2 = ext4_journal_stop(handle); | 4623 | if (credits) |
4590 | if (ret <= 0 || ret2 ) | 4624 | ret2 = ext4_journal_stop(handle); |
4625 | if (ret <= 0 || ret2) | ||
4591 | break; | 4626 | break; |
4592 | } | 4627 | } |
4628 | if (!credits) | ||
4629 | ret2 = ext4_journal_stop(handle); | ||
4593 | return ret > 0 ? ret2 : ret; | 4630 | return ret > 0 ? ret2 : ret; |
4594 | } | 4631 | } |
4595 | 4632 | ||
@@ -4659,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode, | |||
4659 | error = ext4_get_inode_loc(inode, &iloc); | 4696 | error = ext4_get_inode_loc(inode, &iloc); |
4660 | if (error) | 4697 | if (error) |
4661 | return error; | 4698 | return error; |
4662 | physical = iloc.bh->b_blocknr << blockbits; | 4699 | physical = (__u64)iloc.bh->b_blocknr << blockbits; |
4663 | offset = EXT4_GOOD_OLD_INODE_SIZE + | 4700 | offset = EXT4_GOOD_OLD_INODE_SIZE + |
4664 | EXT4_I(inode)->i_extra_isize; | 4701 | EXT4_I(inode)->i_extra_isize; |
4665 | physical += offset; | 4702 | physical += offset; |
@@ -4667,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode, | |||
4667 | flags |= FIEMAP_EXTENT_DATA_INLINE; | 4704 | flags |= FIEMAP_EXTENT_DATA_INLINE; |
4668 | brelse(iloc.bh); | 4705 | brelse(iloc.bh); |
4669 | } else { /* external block */ | 4706 | } else { /* external block */ |
4670 | physical = EXT4_I(inode)->i_file_acl << blockbits; | 4707 | physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; |
4671 | length = inode->i_sb->s_blocksize; | 4708 | length = inode->i_sb->s_blocksize; |
4672 | } | 4709 | } |
4673 | 4710 | ||
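The two (__u64) casts above fix a shift done in a 32-bit type: on configurations where the block number is 32 bits wide, blocknr << blockbits silently truncates before the widening assignment. A standalone demonstration of the failure mode (plain C, illustrative values):

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                uint32_t blocknr = 0x500000;  /* block 5M of a 4K-block fs */
                unsigned int blockbits = 12;

                uint64_t wrong = blocknr << blockbits;  /* 32-bit shift: 0 */
                uint64_t right = (uint64_t)blocknr << blockbits;
                                                        /* 0x500000000 */
                printf("%#llx vs %#llx\n", (unsigned long long)wrong,
                       (unsigned long long)right);
                return 0;
        }

The same pattern recurs below in fs/ext4/inline.c and, via (loff_t) casts, in fs/ext4/file.c, where the 32-bit ext4_lblk_t overflows for offsets of 4 GiB and beyond on 4 KiB blocks.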
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e6941e622d31..ee018d5f397e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c | |||
@@ -10,6 +10,7 @@ | |||
10 | * Ext4 extents status tree core functions. | 10 | * Ext4 extents status tree core functions. |
11 | */ | 11 | */ |
12 | #include <linux/rbtree.h> | 12 | #include <linux/rbtree.h> |
13 | #include <linux/list_sort.h> | ||
13 | #include "ext4.h" | 14 | #include "ext4.h" |
14 | #include "extents_status.h" | 15 | #include "extents_status.h" |
15 | #include "ext4_extents.h" | 16 | #include "ext4_extents.h" |
@@ -291,7 +292,6 @@ out: | |||
291 | 292 | ||
292 | read_unlock(&EXT4_I(inode)->i_es_lock); | 293 | read_unlock(&EXT4_I(inode)->i_es_lock); |
293 | 294 | ||
294 | ext4_es_lru_add(inode); | ||
295 | trace_ext4_es_find_delayed_extent_range_exit(inode, es); | 295 | trace_ext4_es_find_delayed_extent_range_exit(inode, es); |
296 | } | 296 | } |
297 | 297 | ||
@@ -672,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, | |||
672 | error: | 672 | error: |
673 | write_unlock(&EXT4_I(inode)->i_es_lock); | 673 | write_unlock(&EXT4_I(inode)->i_es_lock); |
674 | 674 | ||
675 | ext4_es_lru_add(inode); | ||
676 | ext4_es_print_tree(inode); | 675 | ext4_es_print_tree(inode); |
677 | 676 | ||
678 | return err; | 677 | return err; |
@@ -734,7 +733,6 @@ out: | |||
734 | 733 | ||
735 | read_unlock(&EXT4_I(inode)->i_es_lock); | 734 | read_unlock(&EXT4_I(inode)->i_es_lock); |
736 | 735 | ||
737 | ext4_es_lru_add(inode); | ||
738 | trace_ext4_es_lookup_extent_exit(inode, es, found); | 736 | trace_ext4_es_lookup_extent_exit(inode, es, found); |
739 | return found; | 737 | return found; |
740 | } | 738 | } |
@@ -878,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) | |||
878 | EXTENT_STATUS_WRITTEN); | 876 | EXTENT_STATUS_WRITTEN); |
879 | } | 877 | } |
880 | 878 | ||
879 | static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, | ||
880 | struct list_head *b) | ||
881 | { | ||
882 | struct ext4_inode_info *eia, *eib; | ||
883 | eia = list_entry(a, struct ext4_inode_info, i_es_lru); | ||
884 | eib = list_entry(b, struct ext4_inode_info, i_es_lru); | ||
885 | |||
886 | if (eia->i_touch_when == eib->i_touch_when) | ||
887 | return 0; | ||
888 | if (time_after(eia->i_touch_when, eib->i_touch_when)) | ||
889 | return 1; | ||
890 | else | ||
891 | return -1; | ||
892 | } | ||
893 | |||
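The comparator follows the list_sort() contract: negative, zero, or positive for less-than, equal, and greater-than, with time_after() providing a jiffies-wraparound-safe comparison. Its only caller is the shrinker below; the invocation looks like this (priv is unused here, hence NULL):

        list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
        sbi->s_es_last_sorted = jiffies;  /* remember when we last sorted */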
881 | static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) | 894 | static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) |
882 | { | 895 | { |
883 | struct ext4_sb_info *sbi = container_of(shrink, | 896 | struct ext4_sb_info *sbi = container_of(shrink, |
884 | struct ext4_sb_info, s_es_shrinker); | 897 | struct ext4_sb_info, s_es_shrinker); |
885 | struct ext4_inode_info *ei; | 898 | struct ext4_inode_info *ei; |
886 | struct list_head *cur, *tmp, scanned; | 899 | struct list_head *cur, *tmp; |
900 | LIST_HEAD(skipped); ||
887 | int nr_to_scan = sc->nr_to_scan; | 901 | int nr_to_scan = sc->nr_to_scan; |
888 | int ret, nr_shrunk = 0; | 902 | int ret, nr_shrunk = 0; |
889 | 903 | ||
@@ -893,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
893 | if (!nr_to_scan) | 907 | if (!nr_to_scan) |
894 | return ret; | 908 | return ret; |
895 | 909 | ||
896 | INIT_LIST_HEAD(&scanned); | ||
897 | |||
898 | spin_lock(&sbi->s_es_lru_lock); | 910 | spin_lock(&sbi->s_es_lru_lock); |
911 | |||
912 | /* | ||
913 | * If the inode at the head of the LRU list is newer than the ||
914 | * last_sorted time, the list needs to be sorted again. ||
915 | */ | ||
916 | ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); | ||
917 | if (sbi->s_es_last_sorted < ei->i_touch_when) { | ||
918 | list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); | ||
919 | sbi->s_es_last_sorted = jiffies; | ||
920 | } | ||
921 | |||
899 | list_for_each_safe(cur, tmp, &sbi->s_es_lru) { | 922 | list_for_each_safe(cur, tmp, &sbi->s_es_lru) { |
900 | list_move_tail(cur, &scanned); | 923 | /* |
924 | * If we have already reclaimed all extents from the extent ||
925 | * status tree, just stop the loop immediately. | ||
926 | */ | ||
927 | if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) | ||
928 | break; | ||
901 | 929 | ||
902 | ei = list_entry(cur, struct ext4_inode_info, i_es_lru); | 930 | ei = list_entry(cur, struct ext4_inode_info, i_es_lru); |
903 | 931 | ||
904 | read_lock(&ei->i_es_lock); | 932 | /* Skip the inode that is newer than the last_sorted time */ |
905 | if (ei->i_es_lru_nr == 0) { | 933 | if (sbi->s_es_last_sorted < ei->i_touch_when) { |
906 | read_unlock(&ei->i_es_lock); | 934 | list_move_tail(cur, &skipped); |
907 | continue; | 935 | continue; |
908 | } | 936 | } |
909 | read_unlock(&ei->i_es_lock); | 937 | |
938 | if (ei->i_es_lru_nr == 0) | ||
939 | continue; | ||
910 | 940 | ||
911 | write_lock(&ei->i_es_lock); | 941 | write_lock(&ei->i_es_lock); |
912 | ret = __es_try_to_reclaim_extents(ei, nr_to_scan); | 942 | ret = __es_try_to_reclaim_extents(ei, nr_to_scan); |
943 | if (ei->i_es_lru_nr == 0) | ||
944 | list_del_init(&ei->i_es_lru); | ||
913 | write_unlock(&ei->i_es_lock); | 945 | write_unlock(&ei->i_es_lock); |
914 | 946 | ||
915 | nr_shrunk += ret; | 947 | nr_shrunk += ret; |
@@ -917,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
917 | if (nr_to_scan == 0) | 949 | if (nr_to_scan == 0) |
918 | break; | 950 | break; |
919 | } | 951 | } |
920 | list_splice_tail(&scanned, &sbi->s_es_lru); | 952 | |
953 | /* Move the newer inodes to the tail of the LRU list. */ ||
920 | list_splice_tail(&scanned, &sbi->s_es_lru); | 954 | list_splice_tail(&skipped, &sbi->s_es_lru); |
921 | spin_unlock(&sbi->s_es_lru_lock); | 955 | spin_unlock(&sbi->s_es_lru_lock); |
922 | 956 | ||
923 | ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); | 957 | ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); |
@@ -925,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
925 | return ret; | 959 | return ret; |
926 | } | 960 | } |
927 | 961 | ||
928 | void ext4_es_register_shrinker(struct super_block *sb) | 962 | void ext4_es_register_shrinker(struct ext4_sb_info *sbi) |
929 | { | 963 | { |
930 | struct ext4_sb_info *sbi; | ||
931 | |||
932 | sbi = EXT4_SB(sb); | ||
933 | INIT_LIST_HEAD(&sbi->s_es_lru); | 964 | INIT_LIST_HEAD(&sbi->s_es_lru); |
934 | spin_lock_init(&sbi->s_es_lru_lock); | 965 | spin_lock_init(&sbi->s_es_lru_lock); |
966 | sbi->s_es_last_sorted = 0; | ||
935 | sbi->s_es_shrinker.shrink = ext4_es_shrink; | 967 | sbi->s_es_shrinker.shrink = ext4_es_shrink; |
936 | sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; | 968 | sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; |
937 | register_shrinker(&sbi->s_es_shrinker); | 969 | register_shrinker(&sbi->s_es_shrinker); |
938 | } | 970 | } |
939 | 971 | ||
940 | void ext4_es_unregister_shrinker(struct super_block *sb) | 972 | void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) |
941 | { | 973 | { |
942 | unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); | 974 | unregister_shrinker(&sbi->s_es_shrinker); |
943 | } | 975 | } |
944 | 976 | ||
945 | void ext4_es_lru_add(struct inode *inode) | 977 | void ext4_es_lru_add(struct inode *inode) |
@@ -947,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode) | |||
947 | struct ext4_inode_info *ei = EXT4_I(inode); | 979 | struct ext4_inode_info *ei = EXT4_I(inode); |
948 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 980 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
949 | 981 | ||
982 | ei->i_touch_when = jiffies; | ||
983 | |||
984 | if (!list_empty(&ei->i_es_lru)) | ||
985 | return; | ||
986 | |||
950 | spin_lock(&sbi->s_es_lru_lock); | 987 | spin_lock(&sbi->s_es_lru_lock); |
951 | if (list_empty(&ei->i_es_lru)) | 988 | if (list_empty(&ei->i_es_lru)) |
952 | list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); | 989 | list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); |
953 | else | ||
954 | list_move_tail(&ei->i_es_lru, &sbi->s_es_lru); | ||
955 | spin_unlock(&sbi->s_es_lru_lock); | 990 | spin_unlock(&sbi->s_es_lru_lock); |
956 | } | 991 | } |
957 | 992 | ||
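ext4_es_lru_add() now follows a double-checked pattern: the timestamp is always refreshed, an unlocked list_empty() test skips the spinlock on the hot path, and the test is repeated under the lock because another CPU may have linked the inode in the meantime. The generic shape (hypothetical names, kernel list/spinlock primitives assumed):

        struct lru {
                spinlock_t lock;
                struct list_head list;
        };

        struct entry {
                struct list_head node;
                unsigned long touch_when;
        };

        static void lru_touch(struct lru *lru, struct entry *e)
        {
                e->touch_when = jiffies;        /* always refresh */

                if (!list_empty(&e->node))      /* unlocked fast path */
                        return;

                spin_lock(&lru->lock);
                if (list_empty(&e->node))       /* re-check under the lock */
                        list_add_tail(&e->node, &lru->list);
                spin_unlock(&lru->lock);
        }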
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f740eb03b707..e936730cc5b0 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h | |||
@@ -39,6 +39,7 @@ | |||
39 | EXTENT_STATUS_DELAYED | \ | 39 | EXTENT_STATUS_DELAYED | \ |
40 | EXTENT_STATUS_HOLE) | 40 | EXTENT_STATUS_HOLE) |
41 | 41 | ||
42 | struct ext4_sb_info; | ||
42 | struct ext4_extent; | 43 | struct ext4_extent; |
43 | 44 | ||
44 | struct extent_status { | 45 | struct extent_status { |
@@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es, | |||
119 | es->es_pblk = block; | 120 | es->es_pblk = block; |
120 | } | 121 | } |
121 | 122 | ||
122 | extern void ext4_es_register_shrinker(struct super_block *sb); | 123 | extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); |
123 | extern void ext4_es_unregister_shrinker(struct super_block *sb); | 124 | extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); |
124 | extern void ext4_es_lru_add(struct inode *inode); | 125 | extern void ext4_es_lru_add(struct inode *inode); |
125 | extern void ext4_es_lru_del(struct inode *inode); | 126 | extern void ext4_es_lru_del(struct inode *inode); |
126 | 127 | ||
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b1b4d51b5d86..b19f0a457f32 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, | |||
312 | blkbits = inode->i_sb->s_blocksize_bits; | 312 | blkbits = inode->i_sb->s_blocksize_bits; |
313 | startoff = *offset; | 313 | startoff = *offset; |
314 | lastoff = startoff; | 314 | lastoff = startoff; |
315 | endoff = (map->m_lblk + map->m_len) << blkbits; | 315 | endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; |
316 | 316 | ||
317 | index = startoff >> PAGE_CACHE_SHIFT; | 317 | index = startoff >> PAGE_CACHE_SHIFT; |
318 | end = endoff >> PAGE_CACHE_SHIFT; | 318 | end = endoff >> PAGE_CACHE_SHIFT; |
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) | |||
457 | ret = ext4_map_blocks(NULL, inode, &map, 0); | 457 | ret = ext4_map_blocks(NULL, inode, &map, 0); |
458 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { | 458 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { |
459 | if (last != start) | 459 | if (last != start) |
460 | dataoff = last << blkbits; | 460 | dataoff = (loff_t)last << blkbits; |
461 | break; | 461 | break; |
462 | } | 462 | } |
463 | 463 | ||
@@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) | |||
468 | ext4_es_find_delayed_extent_range(inode, last, last, &es); | 468 | ext4_es_find_delayed_extent_range(inode, last, last, &es); |
469 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { | 469 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { |
470 | if (last != start) | 470 | if (last != start) |
471 | dataoff = last << blkbits; | 471 | dataoff = (loff_t)last << blkbits; |
472 | break; | 472 | break; |
473 | } | 473 | } |
474 | 474 | ||
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) | |||
486 | } | 486 | } |
487 | 487 | ||
488 | last++; | 488 | last++; |
489 | dataoff = last << blkbits; | 489 | dataoff = (loff_t)last << blkbits; |
490 | } while (last <= end); | 490 | } while (last <= end); |
491 | 491 | ||
492 | mutex_unlock(&inode->i_mutex); | 492 | mutex_unlock(&inode->i_mutex); |
@@ -540,7 +540,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) | |||
540 | ret = ext4_map_blocks(NULL, inode, &map, 0); | 540 | ret = ext4_map_blocks(NULL, inode, &map, 0); |
541 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { | 541 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { |
542 | last += ret; | 542 | last += ret; |
543 | holeoff = last << blkbits; | 543 | holeoff = (loff_t)last << blkbits; |
544 | continue; | 544 | continue; |
545 | } | 545 | } |
546 | 546 | ||
@@ -551,7 +551,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) | |||
551 | ext4_es_find_delayed_extent_range(inode, last, last, &es); | 551 | ext4_es_find_delayed_extent_range(inode, last, last, &es); |
552 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { | 552 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { |
553 | last = es.es_lblk + es.es_len; | 553 | last = es.es_lblk + es.es_len; |
554 | holeoff = last << blkbits; | 554 | holeoff = (loff_t)last << blkbits; |
555 | continue; | 555 | continue; |
556 | } | 556 | } |
557 | 557 | ||
@@ -566,7 +566,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) | |||
566 | &map, &holeoff); | 566 | &map, &holeoff); |
567 | if (!unwritten) { | 567 | if (!unwritten) { |
568 | last += ret; | 568 | last += ret; |
569 | holeoff = last << blkbits; | 569 | holeoff = (loff_t)last << blkbits; |
570 | continue; | 570 | continue; |
571 | } | 571 | } |
572 | } | 572 | } |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e0ba8a408def..a8bc47f75fa0 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode) | |||
73 | return ret; | 73 | return ret; |
74 | } | 74 | } |
75 | 75 | ||
76 | /** | ||
77 | * __sync_file - generic_file_fsync without the locking and filemap_write | ||
78 | * @inode: inode to sync | ||
79 | * @datasync: only sync essential metadata if true | ||
80 | * | ||
81 | * This is just generic_file_fsync without the locking. This is needed for | ||
82 | * nojournal mode to make sure this inodes data/metadata makes it to disk | ||
83 | * properly. The i_mutex should be held already. | ||
84 | */ | ||
85 | static int __sync_inode(struct inode *inode, int datasync) | ||
86 | { | ||
87 | int err; | ||
88 | int ret; | ||
89 | |||
90 | ret = sync_mapping_buffers(inode->i_mapping); | ||
91 | if (!(inode->i_state & I_DIRTY)) | ||
92 | return ret; | ||
93 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
94 | return ret; | ||
95 | |||
96 | err = sync_inode_metadata(inode, 1); | ||
97 | if (ret == 0) | ||
98 | ret = err; | ||
99 | return ret; | ||
100 | } | ||
101 | |||
102 | /* | 76 | /* |
103 | * akpm: A new design for ext4_sync_file(). | 77 | * akpm: A new design for ext4_sync_file(). |
104 | * | 78 | * |
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
116 | struct inode *inode = file->f_mapping->host; | 90 | struct inode *inode = file->f_mapping->host; |
117 | struct ext4_inode_info *ei = EXT4_I(inode); | 91 | struct ext4_inode_info *ei = EXT4_I(inode); |
118 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | 92 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
119 | int ret, err; | 93 | int ret = 0, err; |
120 | tid_t commit_tid; | 94 | tid_t commit_tid; |
121 | bool needs_barrier = false; | 95 | bool needs_barrier = false; |
122 | 96 | ||
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
124 | 98 | ||
125 | trace_ext4_sync_file_enter(file, datasync); | 99 | trace_ext4_sync_file_enter(file, datasync); |
126 | 100 | ||
127 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 101 | if (inode->i_sb->s_flags & MS_RDONLY) { |
128 | if (ret) | 102 | /* Make sure that we read updated s_mount_flags value */ |
129 | return ret; | 103 | smp_rmb(); |
130 | mutex_lock(&inode->i_mutex); | 104 | if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) |
131 | 105 | ret = -EROFS; | |
132 | if (inode->i_sb->s_flags & MS_RDONLY) | ||
133 | goto out; | ||
134 | |||
135 | ret = ext4_flush_unwritten_io(inode); | ||
136 | if (ret < 0) | ||
137 | goto out; | 106 | goto out; |
107 | } | ||
138 | 108 | ||
139 | if (!journal) { | 109 | if (!journal) { |
140 | ret = __sync_inode(inode, datasync); | 110 | ret = generic_file_fsync(file, start, end, datasync); |
141 | if (!ret && !hlist_empty(&inode->i_dentry)) | 111 | if (!ret && !hlist_empty(&inode->i_dentry)) |
142 | ret = ext4_sync_parent(inode); | 112 | ret = ext4_sync_parent(inode); |
143 | goto out; | 113 | goto out; |
144 | } | 114 | } |
145 | 115 | ||
116 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | ||
117 | if (ret) | ||
118 | return ret; | ||
146 | /* | 119 | /* |
147 | * data=writeback,ordered: | 120 | * data=writeback,ordered: |
148 | * The caller's filemap_fdatawrite()/wait will sync the data. | 121 | * The caller's filemap_fdatawrite()/wait will sync the data. |
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
172 | if (!ret) | 145 | if (!ret) |
173 | ret = err; | 146 | ret = err; |
174 | } | 147 | } |
175 | out: | 148 | out: |
176 | mutex_unlock(&inode->i_mutex); | ||
177 | trace_ext4_sync_file_exit(inode, ret); | 149 | trace_ext4_sync_file_exit(inode, ret); |
178 | return ret; | 150 | return ret; |
179 | } | 151 | } |
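The reworked ext4_sync_file() therefore decides among three paths up front; a condensed view of the control flow (sketch only, locals hypothetical, error paths elided):

        if (rdonly) {                   /* read-only: nothing to flush */
                ret = fs_aborted ? -EROFS : 0;
        } else if (!journal) {          /* nojournal mode */
                ret = generic_file_fsync(file, start, end, datasync);
                /* plus ext4_sync_parent() for recently created files */
        } else {                        /* journalled operation */
                ret = filemap_write_and_wait_range(mapping, start, end);
                /* then wait for, or force, the commit of commit_tid */
        }

Note that the i_mutex lock/unlock pair is gone entirely; none of the remaining work relies on it.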
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 00a818d67b54..f03598c6ffd3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -747,7 +747,8 @@ repeat_in_this_group: | |||
747 | if (!handle) { | 747 | if (!handle) { |
748 | BUG_ON(nblocks <= 0); | 748 | BUG_ON(nblocks <= 0); |
749 | handle = __ext4_journal_start_sb(dir->i_sb, line_no, | 749 | handle = __ext4_journal_start_sb(dir->i_sb, line_no, |
750 | handle_type, nblocks); | 750 | handle_type, nblocks, |
751 | 0); | ||
751 | if (IS_ERR(handle)) { | 752 | if (IS_ERR(handle)) { |
752 | err = PTR_ERR(handle); | 753 | err = PTR_ERR(handle); |
753 | ext4_std_error(sb, err); | 754 | ext4_std_error(sb, err); |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b8d5d351e24f..87b30cd357e7 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
@@ -624,7 +624,7 @@ cleanup: | |||
624 | partial--; | 624 | partial--; |
625 | } | 625 | } |
626 | out: | 626 | out: |
627 | trace_ext4_ind_map_blocks_exit(inode, map, err); | 627 | trace_ext4_ind_map_blocks_exit(inode, flags, map, err); |
628 | return err; | 628 | return err; |
629 | } | 629 | } |
630 | 630 | ||
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | |||
675 | 675 | ||
676 | retry: | 676 | retry: |
677 | if (rw == READ && ext4_should_dioread_nolock(inode)) { | 677 | if (rw == READ && ext4_should_dioread_nolock(inode)) { |
678 | if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { | ||
679 | mutex_lock(&inode->i_mutex); | ||
680 | ext4_flush_unwritten_io(inode); | ||
681 | mutex_unlock(&inode->i_mutex); | ||
682 | } | ||
683 | /* | 678 | /* |
684 | * Nolock dioread optimization may be dynamically disabled | 679 | * Nolock dioread optimization may be dynamically disabled |
685 | * via ext4_inode_block_unlocked_dio(). Check inode's state | 680 | * via ext4_inode_block_unlocked_dio(). Check inode's state |
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) | |||
779 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | 774 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; |
780 | } | 775 | } |
781 | 776 | ||
782 | int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 777 | /* |
778 | * Calculate the number of indirect blocks touched by mapping @nrblocks ||
779 | * logically contiguous blocks. ||
780 | */ | ||
781 | int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) | ||
783 | { | 782 | { |
784 | int indirects; | ||
785 | |||
786 | /* if nrblocks are contiguous */ | ||
787 | if (chunk) { | ||
788 | /* | ||
789 | * With N contiguous data blocks, we need at most | ||
790 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | ||
791 | * 2 dindirect blocks, and 1 tindirect block | ||
792 | */ | ||
793 | return DIV_ROUND_UP(nrblocks, | ||
794 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | ||
795 | } | ||
796 | /* | 783 | /* |
797 | * if nrblocks are not contiguous, worse case, each block touch | 784 | * With N contiguous data blocks, we need at most |
798 | * a indirect block, and each indirect block touch a double indirect | 785 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, |
799 | * block, plus a triple indirect block | 786 | * 2 dindirect blocks, and 1 tindirect block |
800 | */ | 787 | */ |
801 | indirects = nrblocks * 2 + 1; | 788 | return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; |
802 | return indirects; | ||
803 | } | 789 | } |
804 | 790 | ||
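A worked instance of the bound (plain C; assumes 4 KiB blocks, i.e. 1024 block numbers per indirect block):

        #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

        /* Mirrors ext4_ind_trans_blocks() above: the +4 covers one extra
         * indirect block, two dindirect blocks and one tindirect block. */
        static int ind_trans_blocks(int nrblocks, int addr_per_block)
        {
                return DIV_ROUND_UP(nrblocks, addr_per_block) + 4;
        }

        /* e.g. ind_trans_blocks(1024, 1024) == 5 metadata blocks */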
805 | /* | 791 | /* |
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | |||
940 | __le32 *last) | 926 | __le32 *last) |
941 | { | 927 | { |
942 | __le32 *p; | 928 | __le32 *p; |
943 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | 929 | int flags = EXT4_FREE_BLOCKS_VALIDATED; |
944 | int err; | 930 | int err; |
945 | 931 | ||
946 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | 932 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) |
947 | flags |= EXT4_FREE_BLOCKS_METADATA; | 933 | flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; |
934 | else if (ext4_should_journal_data(inode)) | ||
935 | flags |= EXT4_FREE_BLOCKS_FORGET; | ||
948 | 936 | ||
949 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | 937 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, |
950 | count)) { | 938 | count)) { |
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1a346a6bdc8f..d9ecbf1113a7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c | |||
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, | |||
72 | entry = (struct ext4_xattr_entry *) | 72 | entry = (struct ext4_xattr_entry *) |
73 | ((void *)raw_inode + EXT4_I(inode)->i_inline_off); | 73 | ((void *)raw_inode + EXT4_I(inode)->i_inline_off); |
74 | 74 | ||
75 | free += le32_to_cpu(entry->e_value_size); | 75 | free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); |
76 | goto out; | 76 | goto out; |
77 | } | 77 | } |
78 | 78 | ||
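The switch to EXT4_XATTR_SIZE() matters because xattr values are stored padded to 4-byte boundaries, so free-space accounting based on the raw e_value_size undercounts the space a value actually occupies. The macro (reproduced here in simplified form from fs/ext4/xattr.h) rounds up to that alignment:

        #define EXT4_XATTR_PAD          4
        #define EXT4_XATTR_ROUND        (EXT4_XATTR_PAD - 1)
        #define EXT4_XATTR_SIZE(size) \
                (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)

        /* e.g. a 5-byte value occupies EXT4_XATTR_SIZE(5) == 8 bytes */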
@@ -1810,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode, | |||
1810 | if (error) | 1810 | if (error) |
1811 | goto out; | 1811 | goto out; |
1812 | 1812 | ||
1813 | physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; | 1813 | physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; |
1814 | physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; | 1814 | physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; |
1815 | physical += offsetof(struct ext4_inode, i_block); | 1815 | physical += offsetof(struct ext4_inode, i_block); |
1816 | length = i_size_read(inode); | 1816 | length = i_size_read(inode); |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d6382b89ecbd..0188e65e1f58 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, | |||
132 | new_size); | 132 | new_size); |
133 | } | 133 | } |
134 | 134 | ||
135 | static void ext4_invalidatepage(struct page *page, unsigned long offset); | 135 | static void ext4_invalidatepage(struct page *page, unsigned int offset, |
136 | unsigned int length); | ||
136 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); | 137 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); |
137 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); | 138 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); |
138 | static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | 139 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, |
139 | struct inode *inode, struct page *page, loff_t from, | 140 | int pextents); |
140 | loff_t length, int flags); | ||
141 | 141 | ||
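Under the new convention, ->invalidatepage receives a byte range (offset, length) within the page instead of just an offset, and only buffers inside [offset, offset + length) may be dropped. A minimal shape for such a handler (illustrative sketch only; the real ext4 implementation also has to deal with journalled data):

        static void demo_invalidatepage(struct page *page, unsigned int offset,
                                        unsigned int length)
        {
                /* drop (only) buffers inside [offset, offset + length) */
                block_invalidatepage(page, offset, length);

                /* a full-page call still means "invalidate everything" */
                if (offset == 0 && length == PAGE_CACHE_SIZE)
                        ClearPageUptodate(page);
        }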
142 | /* | 142 | /* |
143 | * Test whether an inode is a fast symlink. | 143 | * Test whether an inode is a fast symlink. |
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode) | |||
215 | filemap_write_and_wait(&inode->i_data); | 215 | filemap_write_and_wait(&inode->i_data); |
216 | } | 216 | } |
217 | truncate_inode_pages(&inode->i_data, 0); | 217 | truncate_inode_pages(&inode->i_data, 0); |
218 | ext4_ioend_shutdown(inode); | 218 | |
219 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | ||
219 | goto no_delete; | 220 | goto no_delete; |
220 | } | 221 | } |
221 | 222 | ||
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode) | |||
225 | if (ext4_should_order_data(inode)) | 226 | if (ext4_should_order_data(inode)) |
226 | ext4_begin_ordered_truncate(inode, 0); | 227 | ext4_begin_ordered_truncate(inode, 0); |
227 | truncate_inode_pages(&inode->i_data, 0); | 228 | truncate_inode_pages(&inode->i_data, 0); |
228 | ext4_ioend_shutdown(inode); | ||
229 | 229 | ||
230 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | ||
230 | if (is_bad_inode(inode)) | 231 | if (is_bad_inode(inode)) |
231 | goto no_delete; | 232 | goto no_delete; |
232 | 233 | ||
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func, | |||
423 | #define check_block_validity(inode, map) \ | 424 | #define check_block_validity(inode, map) \ |
424 | __check_block_validity((inode), __func__, __LINE__, (map)) | 425 | __check_block_validity((inode), __func__, __LINE__, (map)) |
425 | 426 | ||
426 | /* | ||
427 | * Return the number of contiguous dirty pages in a given inode | ||
428 | * starting at page frame idx. | ||
429 | */ | ||
430 | static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, | ||
431 | unsigned int max_pages) | ||
432 | { | ||
433 | struct address_space *mapping = inode->i_mapping; | ||
434 | pgoff_t index; | ||
435 | struct pagevec pvec; | ||
436 | pgoff_t num = 0; | ||
437 | int i, nr_pages, done = 0; | ||
438 | |||
439 | if (max_pages == 0) | ||
440 | return 0; | ||
441 | pagevec_init(&pvec, 0); | ||
442 | while (!done) { | ||
443 | index = idx; | ||
444 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
445 | PAGECACHE_TAG_DIRTY, | ||
446 | (pgoff_t)PAGEVEC_SIZE); | ||
447 | if (nr_pages == 0) | ||
448 | break; | ||
449 | for (i = 0; i < nr_pages; i++) { | ||
450 | struct page *page = pvec.pages[i]; | ||
451 | struct buffer_head *bh, *head; | ||
452 | |||
453 | lock_page(page); | ||
454 | if (unlikely(page->mapping != mapping) || | ||
455 | !PageDirty(page) || | ||
456 | PageWriteback(page) || | ||
457 | page->index != idx) { | ||
458 | done = 1; | ||
459 | unlock_page(page); | ||
460 | break; | ||
461 | } | ||
462 | if (page_has_buffers(page)) { | ||
463 | bh = head = page_buffers(page); | ||
464 | do { | ||
465 | if (!buffer_delay(bh) && | ||
466 | !buffer_unwritten(bh)) | ||
467 | done = 1; | ||
468 | bh = bh->b_this_page; | ||
469 | } while (!done && (bh != head)); | ||
470 | } | ||
471 | unlock_page(page); | ||
472 | if (done) | ||
473 | break; | ||
474 | idx++; | ||
475 | num++; | ||
476 | if (num >= max_pages) { | ||
477 | done = 1; | ||
478 | break; | ||
479 | } | ||
480 | } | ||
481 | pagevec_release(&pvec); | ||
482 | } | ||
483 | return num; | ||
484 | } | ||
485 | |||
486 | #ifdef ES_AGGRESSIVE_TEST | 427 | #ifdef ES_AGGRESSIVE_TEST |
487 | static void ext4_map_blocks_es_recheck(handle_t *handle, | 428 | static void ext4_map_blocks_es_recheck(handle_t *handle, |
488 | struct inode *inode, | 429 | struct inode *inode, |
@@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, | |||
573 | "logical block %lu\n", inode->i_ino, flags, map->m_len, | 514 | "logical block %lu\n", inode->i_ino, flags, map->m_len, |
574 | (unsigned long) map->m_lblk); | 515 | (unsigned long) map->m_lblk); |
575 | 516 | ||
517 | ext4_es_lru_add(inode); | ||
518 | |||
576 | /* Lookup extent status tree firstly */ | 519 | /* Lookup extent status tree firstly */ |
577 | if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { | 520 | if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { |
578 | if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { | 521 | if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { |
@@ -1118,10 +1061,13 @@ static int ext4_write_end(struct file *file, | |||
1118 | } | 1061 | } |
1119 | } | 1062 | } |
1120 | 1063 | ||
1121 | if (ext4_has_inline_data(inode)) | 1064 | if (ext4_has_inline_data(inode)) { |
1122 | copied = ext4_write_inline_data_end(inode, pos, len, | 1065 | ret = ext4_write_inline_data_end(inode, pos, len, |
1123 | copied, page); | 1066 | copied, page); |
1124 | else | 1067 | if (ret < 0) |
1068 | goto errout; | ||
1069 | copied = ret; | ||
1070 | } else | ||
1125 | copied = block_write_end(file, mapping, pos, | 1071 | copied = block_write_end(file, mapping, pos, |
1126 | len, copied, page, fsdata); | 1072 | len, copied, page, fsdata); |
1127 | 1073 | ||
@@ -1157,8 +1103,6 @@ static int ext4_write_end(struct file *file, | |||
1157 | if (i_size_changed) | 1103 | if (i_size_changed) |
1158 | ext4_mark_inode_dirty(handle, inode); | 1104 | ext4_mark_inode_dirty(handle, inode); |
1159 | 1105 | ||
1160 | if (copied < 0) | ||
1161 | ret = copied; | ||
1162 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | 1106 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1163 | /* if we have allocated more blocks and copied | 1107 | /* if we have allocated more blocks and copied |
1164 | * less. We will have blocks allocated outside | 1108 | * less. We will have blocks allocated outside |
@@ -1415,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free) | |||
1415 | } | 1359 | } |
1416 | 1360 | ||
1417 | static void ext4_da_page_release_reservation(struct page *page, | 1361 | static void ext4_da_page_release_reservation(struct page *page, |
1418 | unsigned long offset) | 1362 | unsigned int offset, |
1363 | unsigned int length) | ||
1419 | { | 1364 | { |
1420 | int to_release = 0; | 1365 | int to_release = 0; |
1421 | struct buffer_head *head, *bh; | 1366 | struct buffer_head *head, *bh; |
1422 | unsigned int curr_off = 0; | 1367 | unsigned int curr_off = 0; |
1423 | struct inode *inode = page->mapping->host; | 1368 | struct inode *inode = page->mapping->host; |
1424 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1369 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1370 | unsigned int stop = offset + length; | ||
1425 | int num_clusters; | 1371 | int num_clusters; |
1426 | ext4_fsblk_t lblk; | 1372 | ext4_fsblk_t lblk; |
1427 | 1373 | ||
1374 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | ||
1375 | |||
1428 | head = page_buffers(page); | 1376 | head = page_buffers(page); |
1429 | bh = head; | 1377 | bh = head; |
1430 | do { | 1378 | do { |
1431 | unsigned int next_off = curr_off + bh->b_size; | 1379 | unsigned int next_off = curr_off + bh->b_size; |
1432 | 1380 | ||
1381 | if (next_off > stop) | ||
1382 | break; | ||
1383 | |||
1433 | if ((offset <= curr_off) && (buffer_delay(bh))) { | 1384 | if ((offset <= curr_off) && (buffer_delay(bh))) { |
1434 | to_release++; | 1385 | to_release++; |
1435 | clear_buffer_delay(bh); | 1386 | clear_buffer_delay(bh); |
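With stop = offset + length, the walk above releases only delayed buffers lying wholly inside the invalidated byte range and exits as soon as a buffer extends past stop. The range test in isolation (illustrative):

        unsigned int stop = offset + length;    /* end of the range */

        /* A buffer spanning [curr_off, curr_off + bh->b_size) is released
         * only when it lies wholly inside [offset, stop): */
        bool releasable = offset <= curr_off &&
                          curr_off + bh->b_size <= stop;

Buffers that merely straddle either boundary are left alone, since the page still owns data outside the range.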
@@ -1460,140 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1460 | * Delayed allocation stuff | 1411 | * Delayed allocation stuff |
1461 | */ | 1412 | */ |
1462 | 1413 | ||
1463 | /* | 1414 | struct mpage_da_data { |
1464 | * mpage_da_submit_io - walks through extent of pages and try to write | 1415 | struct inode *inode; |
1465 | * them with writepage() call back | 1416 | struct writeback_control *wbc; |
1466 | * | ||
1467 | * @mpd->inode: inode | ||
1468 | * @mpd->first_page: first page of the extent | ||
1469 | * @mpd->next_page: page after the last page of the extent | ||
1470 | * | ||
1471 | * By the time mpage_da_submit_io() is called we expect all blocks | ||
1472 | * to be allocated. this may be wrong if allocation failed. | ||
1473 | * | ||
1474 | * As pages are already locked by write_cache_pages(), we can't use it | ||
1475 | */ | ||
1476 | static int mpage_da_submit_io(struct mpage_da_data *mpd, | ||
1477 | struct ext4_map_blocks *map) | ||
1478 | { | ||
1479 | struct pagevec pvec; | ||
1480 | unsigned long index, end; | ||
1481 | int ret = 0, err, nr_pages, i; | ||
1482 | struct inode *inode = mpd->inode; | ||
1483 | struct address_space *mapping = inode->i_mapping; | ||
1484 | loff_t size = i_size_read(inode); | ||
1485 | unsigned int len, block_start; | ||
1486 | struct buffer_head *bh, *page_bufs = NULL; | ||
1487 | sector_t pblock = 0, cur_logical = 0; | ||
1488 | struct ext4_io_submit io_submit; | ||
1489 | 1417 | ||
1490 | BUG_ON(mpd->next_page <= mpd->first_page); | 1418 | pgoff_t first_page; /* The first page to write */ |
1491 | memset(&io_submit, 0, sizeof(io_submit)); | 1419 | pgoff_t next_page; /* Current page to examine */ |
1420 | pgoff_t last_page; /* Last page to examine */ | ||
1492 | /* | 1421 | /* |
1493 | * We need to start from the first_page to the next_page - 1 | 1422 | * Extent to map - it can start after first_page because that page may |
1494 | * to make sure we also write the mapped dirty buffer_heads. | 1423 | * already be fully mapped. We somewhat abuse m_flags to store whether |
1495 | * If we look at mpd->b_blocknr we would only be looking | 1424 | * the extent is delalloc or unwritten. |
1496 | * at the currently mapped buffer_heads. | ||
1497 | */ | 1425 | */ |
1498 | index = mpd->first_page; | 1426 | struct ext4_map_blocks map; |
1499 | end = mpd->next_page - 1; | 1427 | struct ext4_io_submit io_submit; /* IO submission data */ |
1500 | 1428 | }; | |
1501 | pagevec_init(&pvec, 0); | ||
1502 | while (index <= end) { | ||
1503 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | ||
1504 | if (nr_pages == 0) | ||
1505 | break; | ||
1506 | for (i = 0; i < nr_pages; i++) { | ||
1507 | int skip_page = 0; | ||
1508 | struct page *page = pvec.pages[i]; | ||
1509 | |||
1510 | index = page->index; | ||
1511 | if (index > end) | ||
1512 | break; | ||
1513 | |||
1514 | if (index == size >> PAGE_CACHE_SHIFT) | ||
1515 | len = size & ~PAGE_CACHE_MASK; | ||
1516 | else | ||
1517 | len = PAGE_CACHE_SIZE; | ||
1518 | if (map) { | ||
1519 | cur_logical = index << (PAGE_CACHE_SHIFT - | ||
1520 | inode->i_blkbits); | ||
1521 | pblock = map->m_pblk + (cur_logical - | ||
1522 | map->m_lblk); | ||
1523 | } | ||
1524 | index++; | ||
1525 | |||
1526 | BUG_ON(!PageLocked(page)); | ||
1527 | BUG_ON(PageWriteback(page)); | ||
1528 | |||
1529 | bh = page_bufs = page_buffers(page); | ||
1530 | block_start = 0; | ||
1531 | do { | ||
1532 | if (map && (cur_logical >= map->m_lblk) && | ||
1533 | (cur_logical <= (map->m_lblk + | ||
1534 | (map->m_len - 1)))) { | ||
1535 | if (buffer_delay(bh)) { | ||
1536 | clear_buffer_delay(bh); | ||
1537 | bh->b_blocknr = pblock; | ||
1538 | } | ||
1539 | if (buffer_unwritten(bh) || | ||
1540 | buffer_mapped(bh)) | ||
1541 | BUG_ON(bh->b_blocknr != pblock); | ||
1542 | if (map->m_flags & EXT4_MAP_UNINIT) | ||
1543 | set_buffer_uninit(bh); | ||
1544 | clear_buffer_unwritten(bh); | ||
1545 | } | ||
1546 | |||
1547 | /* | ||
1548 | * skip page if block allocation undone and | ||
1549 | * block is dirty | ||
1550 | */ | ||
1551 | if (ext4_bh_delay_or_unwritten(NULL, bh)) | ||
1552 | skip_page = 1; | ||
1553 | bh = bh->b_this_page; | ||
1554 | block_start += bh->b_size; | ||
1555 | cur_logical++; | ||
1556 | pblock++; | ||
1557 | } while (bh != page_bufs); | ||
1558 | |||
1559 | if (skip_page) { | ||
1560 | unlock_page(page); | ||
1561 | continue; | ||
1562 | } | ||
1563 | |||
1564 | clear_page_dirty_for_io(page); | ||
1565 | err = ext4_bio_write_page(&io_submit, page, len, | ||
1566 | mpd->wbc); | ||
1567 | if (!err) | ||
1568 | mpd->pages_written++; | ||
1569 | /* | ||
1570 | * In error case, we have to continue because | ||
1571 | * remaining pages are still locked | ||
1572 | */ | ||
1573 | if (ret == 0) | ||
1574 | ret = err; | ||
1575 | } | ||
1576 | pagevec_release(&pvec); | ||
1577 | } | ||
1578 | ext4_io_submit(&io_submit); | ||
1579 | return ret; | ||
1580 | } | ||
1581 | 1429 | ||
1582 | static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) | 1430 | static void mpage_release_unused_pages(struct mpage_da_data *mpd, |
1431 | bool invalidate) | ||
1583 | { | 1432 | { |
1584 | int nr_pages, i; | 1433 | int nr_pages, i; |
1585 | pgoff_t index, end; | 1434 | pgoff_t index, end; |
1586 | struct pagevec pvec; | 1435 | struct pagevec pvec; |
1587 | struct inode *inode = mpd->inode; | 1436 | struct inode *inode = mpd->inode; |
1588 | struct address_space *mapping = inode->i_mapping; | 1437 | struct address_space *mapping = inode->i_mapping; |
1589 | ext4_lblk_t start, last; | 1438 | |
1439 | /* This is necessary when next_page == 0. */ | ||
1440 | if (mpd->first_page >= mpd->next_page) | ||
1441 | return; | ||
1590 | 1442 | ||
1591 | index = mpd->first_page; | 1443 | index = mpd->first_page; |
1592 | end = mpd->next_page - 1; | 1444 | end = mpd->next_page - 1; |
1593 | 1445 | if (invalidate) { | |
1594 | start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1446 | ext4_lblk_t start, last; |
1595 | last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1447 | start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1596 | ext4_es_remove_extent(inode, start, last - start + 1); | 1448 | last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1449 | ext4_es_remove_extent(inode, start, last - start + 1); | ||
1450 | } | ||
1597 | 1451 | ||
1598 | pagevec_init(&pvec, 0); | 1452 | pagevec_init(&pvec, 0); |
1599 | while (index <= end) { | 1453 | while (index <= end) { |
@@ -1606,14 +1460,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) | |||
1606 | break; | 1460 | break; |
1607 | BUG_ON(!PageLocked(page)); | 1461 | BUG_ON(!PageLocked(page)); |
1608 | BUG_ON(PageWriteback(page)); | 1462 | BUG_ON(PageWriteback(page)); |
1609 | block_invalidatepage(page, 0); | 1463 | if (invalidate) { |
1610 | ClearPageUptodate(page); | 1464 | block_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
1465 | ClearPageUptodate(page); | ||
1466 | } | ||
1611 | unlock_page(page); | 1467 | unlock_page(page); |
1612 | } | 1468 | } |
1613 | index = pvec.pages[nr_pages - 1]->index + 1; | 1469 | index = pvec.pages[nr_pages - 1]->index + 1; |
1614 | pagevec_release(&pvec); | 1470 | pagevec_release(&pvec); |
1615 | } | 1471 | } |
1616 | return; | ||
1617 | } | 1472 | } |
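The page-to-block shift feeding ext4_es_remove_extent() above can be sanity-checked in isolation. A minimal standalone sketch, assuming 4K pages and 1K blocks (blkbits = 10); the variable names are illustrative stand-ins, not ext4 API:

#include <stdio.h>

int main(void)
{
	unsigned shift = 12 - 10;             /* PAGE_CACHE_SHIFT - i_blkbits */
	unsigned long index = 3, end = 5;     /* releasing pages 3..5 */
	unsigned long start = index << shift; /* first block of page 3: 12 */
	unsigned long last = end << shift;    /* first block of page 5: 20 */

	/* The extent-status shootdown covers last - start + 1 = 9 blocks. */
	printf("remove cached extents for blocks %lu..%lu\n", start, last);
	return 0;
}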
1618 | 1473 | ||
1619 | static void ext4_print_free_blocks(struct inode *inode) | 1474 | static void ext4_print_free_blocks(struct inode *inode) |
@@ -1642,215 +1497,6 @@ static void ext4_print_free_blocks(struct inode *inode) | |||
1642 | return; | 1497 | return; |
1643 | } | 1498 | } |
1644 | 1499 | ||
1645 | /* | ||
1646 | * mpage_da_map_and_submit - go through the given space, map it | ||
1647 | * if necessary, and then submit it for I/O | ||
1648 | * | ||
1649 | * @mpd - bh describing space | ||
1650 | * | ||
1651 | * The function skips space we know is already mapped to disk blocks. | ||
1652 | * | ||
1653 | */ | ||
1654 | static void mpage_da_map_and_submit(struct mpage_da_data *mpd) | ||
1655 | { | ||
1656 | int err, blks, get_blocks_flags; | ||
1657 | struct ext4_map_blocks map, *mapp = NULL; | ||
1658 | sector_t next = mpd->b_blocknr; | ||
1659 | unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; | ||
1660 | loff_t disksize = EXT4_I(mpd->inode)->i_disksize; | ||
1661 | handle_t *handle = NULL; | ||
1662 | |||
1663 | /* | ||
1664 | * If the blocks are mapped already, or we couldn't accumulate | ||
1665 | * any blocks, then proceed immediately to the submission stage. | ||
1666 | */ | ||
1667 | if ((mpd->b_size == 0) || | ||
1668 | ((mpd->b_state & (1 << BH_Mapped)) && | ||
1669 | !(mpd->b_state & (1 << BH_Delay)) && | ||
1670 | !(mpd->b_state & (1 << BH_Unwritten)))) | ||
1671 | goto submit_io; | ||
1672 | |||
1673 | handle = ext4_journal_current_handle(); | ||
1674 | BUG_ON(!handle); | ||
1675 | |||
1676 | /* | ||
1677 | * Call ext4_map_blocks() to allocate any delayed allocation | ||
1678 | * blocks, or to convert an uninitialized extent to be | ||
1679 | * initialized (in the case where we have written into | ||
1680 | * one or more preallocated blocks). | ||
1681 | * | ||
1682 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to | ||
1683 | * indicate that we are on the delayed allocation path. This | ||
1684 | * affects functions in many different parts of the allocation | ||
1685 | * call path. This flag exists primarily because we don't | ||
1686 | * want to change *many* call functions, so ext4_map_blocks() | ||
1687 | * will set the EXT4_STATE_DELALLOC_RESERVED flag once the | ||
1688 | * inode's allocation semaphore is taken. | ||
1689 | * | ||
1690 | * If the blocks in question were delalloc blocks, set | ||
1691 | * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting | ||
1692 | * variables are updated after the blocks have been allocated. | ||
1693 | */ | ||
1694 | map.m_lblk = next; | ||
1695 | map.m_len = max_blocks; | ||
1696 | /* | ||
1697 | * We're in delalloc path and it is possible that we're going to | ||
1698 | * need more metadata blocks than previously reserved. However | ||
1699 | * we must not fail because we're in writeback and there is | ||
1700 | * nothing we can do about it so it might result in data loss. | ||
1701 | * So use reserved blocks to allocate metadata if possible. | ||
1702 | */ | ||
1703 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | | ||
1704 | EXT4_GET_BLOCKS_METADATA_NOFAIL; | ||
1705 | if (ext4_should_dioread_nolock(mpd->inode)) | ||
1706 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | ||
1707 | if (mpd->b_state & (1 << BH_Delay)) | ||
1708 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | ||
1709 | |||
1710 | |||
1711 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); | ||
1712 | if (blks < 0) { | ||
1713 | struct super_block *sb = mpd->inode->i_sb; | ||
1714 | |||
1715 | err = blks; | ||
1716 | /* | ||
1717 | * If get block returns EAGAIN or ENOSPC and there | ||
1718 | * appear to be free blocks, we will just let | ||
1719 | * mpage_da_submit_io() unlock all of the pages. | ||
1720 | */ | ||
1721 | if (err == -EAGAIN) | ||
1722 | goto submit_io; | ||
1723 | |||
1724 | if (err == -ENOSPC && ext4_count_free_clusters(sb)) { | ||
1725 | mpd->retval = err; | ||
1726 | goto submit_io; | ||
1727 | } | ||
1728 | |||
1729 | /* | ||
1730 | * get block failure will cause us to loop in | ||
1731 | * writepages, because a_ops->writepage won't be able | ||
1732 | * to make progress. The page will be redirtied by | ||
1733 | * writepage and writepages will again try to write | ||
1734 | * the same. | ||
1735 | */ | ||
1736 | if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { | ||
1737 | ext4_msg(sb, KERN_CRIT, | ||
1738 | "delayed block allocation failed for inode %lu " | ||
1739 | "at logical offset %llu with max blocks %zd " | ||
1740 | "with error %d", mpd->inode->i_ino, | ||
1741 | (unsigned long long) next, | ||
1742 | mpd->b_size >> mpd->inode->i_blkbits, err); | ||
1743 | ext4_msg(sb, KERN_CRIT, | ||
1744 | "This should not happen!! Data will be lost"); | ||
1745 | if (err == -ENOSPC) | ||
1746 | ext4_print_free_blocks(mpd->inode); | ||
1747 | } | ||
1748 | /* invalidate all the pages */ | ||
1749 | ext4_da_block_invalidatepages(mpd); | ||
1750 | |||
1751 | /* Mark this page range as having been completed */ | ||
1752 | mpd->io_done = 1; | ||
1753 | return; | ||
1754 | } | ||
1755 | BUG_ON(blks == 0); | ||
1756 | |||
1757 | mapp = ↦ | ||
1758 | if (map.m_flags & EXT4_MAP_NEW) { | ||
1759 | struct block_device *bdev = mpd->inode->i_sb->s_bdev; | ||
1760 | int i; | ||
1761 | |||
1762 | for (i = 0; i < map.m_len; i++) | ||
1763 | unmap_underlying_metadata(bdev, map.m_pblk + i); | ||
1764 | } | ||
1765 | |||
1766 | /* | ||
1767 | * Update on-disk size along with block allocation. | ||
1768 | */ | ||
1769 | disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; | ||
1770 | if (disksize > i_size_read(mpd->inode)) | ||
1771 | disksize = i_size_read(mpd->inode); | ||
1772 | if (disksize > EXT4_I(mpd->inode)->i_disksize) { | ||
1773 | ext4_update_i_disksize(mpd->inode, disksize); | ||
1774 | err = ext4_mark_inode_dirty(handle, mpd->inode); | ||
1775 | if (err) | ||
1776 | ext4_error(mpd->inode->i_sb, | ||
1777 | "Failed to mark inode %lu dirty", | ||
1778 | mpd->inode->i_ino); | ||
1779 | } | ||
1780 | |||
1781 | submit_io: | ||
1782 | mpage_da_submit_io(mpd, mapp); | ||
1783 | mpd->io_done = 1; | ||
1784 | } | ||
1785 | |||
1786 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ | ||
1787 | (1 << BH_Delay) | (1 << BH_Unwritten)) | ||
1788 | |||
1789 | /* | ||
1790 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks | ||
1791 | * | ||
1792 | * @mpd->lbh - extent of blocks | ||
1793 | * @logical - logical number of the block in the file | ||
1794 | * @b_state - b_state of the buffer head added | ||
1795 | * | ||
1796 | * the function is used to collect contiguous blocks in the same state | ||
1797 | */ | ||
1798 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, | ||
1799 | unsigned long b_state) | ||
1800 | { | ||
1801 | sector_t next; | ||
1802 | int blkbits = mpd->inode->i_blkbits; | ||
1803 | int nrblocks = mpd->b_size >> blkbits; | ||
1804 | |||
1805 | /* | ||
1806 | * XXX Don't go larger than mballoc is willing to allocate | ||
1807 | * This is a stopgap solution. We eventually need to fold | ||
1808 | * mpage_da_submit_io() into this function and then call | ||
1809 | * ext4_map_blocks() multiple times in a loop | ||
1810 | */ | ||
1811 | if (nrblocks >= (8*1024*1024 >> blkbits)) | ||
1812 | goto flush_it; | ||
1813 | |||
1814 | /* check if the reserved journal credits might overflow */ | ||
1815 | if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { | ||
1816 | if (nrblocks >= EXT4_MAX_TRANS_DATA) { | ||
1817 | /* | ||
1818 | * With non-extent format we are limited by the journal | ||
1819 | * credit available. Total credit needed to insert | ||
1820 | * nrblocks contiguous blocks is dependent on the | ||
1821 | * nrblocks. So limit nrblocks. | ||
1822 | */ | ||
1823 | goto flush_it; | ||
1824 | } | ||
1825 | } | ||
1826 | /* | ||
1827 | * First block in the extent | ||
1828 | */ | ||
1829 | if (mpd->b_size == 0) { | ||
1830 | mpd->b_blocknr = logical; | ||
1831 | mpd->b_size = 1 << blkbits; | ||
1832 | mpd->b_state = b_state & BH_FLAGS; | ||
1833 | return; | ||
1834 | } | ||
1835 | |||
1836 | next = mpd->b_blocknr + nrblocks; | ||
1837 | /* | ||
1838 | * Can we merge the block to our big extent? | ||
1839 | */ | ||
1840 | if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { | ||
1841 | mpd->b_size += 1 << blkbits; | ||
1842 | return; | ||
1843 | } | ||
1844 | |||
1845 | flush_it: | ||
1846 | /* | ||
1847 | * We couldn't merge the block to our extent, so we | ||
1848 | * need to flush the current extent and start a new one | ||
1849 | */ | ||
1850 | mpage_da_map_and_submit(mpd); | ||
1851 | return; | ||
1852 | } | ||
1853 | |||
1854 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) | 1500 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) |
1855 | { | 1501 | { |
1856 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); | 1502 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); |
@@ -1883,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, | |||
1883 | "logical block %lu\n", inode->i_ino, map->m_len, | 1529 | "logical block %lu\n", inode->i_ino, map->m_len, |
1884 | (unsigned long) map->m_lblk); | 1530 | (unsigned long) map->m_lblk); |
1885 | 1531 | ||
1532 | ext4_es_lru_add(inode); | ||
1533 | |||
1886 | /* Lookup extent status tree firstly */ | 1534 | /* Lookup extent status tree firstly */ |
1887 | if (ext4_es_lookup_extent(inode, iblock, &es)) { | 1535 | if (ext4_es_lookup_extent(inode, iblock, &es)) { |
1888 | 1536 | ||
@@ -2156,7 +1804,7 @@ out: | |||
2156 | * lock so we have to do some magic. | 1804 | * lock so we have to do some magic. |
2157 | * | 1805 | * |
2158 | * This function can get called via... | 1806 | * This function can get called via... |
2159 | * - ext4_da_writepages after taking page lock (have journal handle) | 1807 | * - ext4_writepages after taking page lock (have journal handle) |
2160 | * - journal_submit_inode_data_buffers (no journal handle) | 1808 | * - journal_submit_inode_data_buffers (no journal handle) |
2161 | * - shrink_page_list via the kswapd/direct reclaim (no journal handle) | 1809 | * - shrink_page_list via the kswapd/direct reclaim (no journal handle) |
2162 | * - grab_page_cache when doing write_begin (have journal handle) | 1810 | * - grab_page_cache when doing write_begin (have journal handle) |
@@ -2234,76 +1882,405 @@ static int ext4_writepage(struct page *page, | |||
2234 | */ | 1882 | */ |
2235 | return __ext4_journalled_writepage(page, len); | 1883 | return __ext4_journalled_writepage(page, len); |
2236 | 1884 | ||
2237 | memset(&io_submit, 0, sizeof(io_submit)); | 1885 | ext4_io_submit_init(&io_submit, wbc); |
1886 | io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
1887 | if (!io_submit.io_end) { | ||
1888 | redirty_page_for_writepage(wbc, page); | ||
1889 | unlock_page(page); | ||
1890 | return -ENOMEM; | ||
1891 | } | ||
2238 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); | 1892 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); |
2239 | ext4_io_submit(&io_submit); | 1893 | ext4_io_submit(&io_submit); |
1894 | /* Drop io_end reference we got from init */ | ||
1895 | ext4_put_io_end_defer(io_submit.io_end); | ||
2240 | return ret; | 1896 | return ret; |
2241 | } | 1897 | } |
2242 | 1898 | ||
1899 | #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) | ||
1900 | |||
2243 | /* | 1901 | /* |
2244 | * This is called via ext4_da_writepages() to | 1902 | * mballoc gives us at most this number of blocks... |
2245 | * calculate the total number of credits to reserve to fit | 1903 | * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). |
2246 | * a single extent allocation into a single transaction, | 1904 | * The rest of mballoc seems to handle chunks up to full group size.
2247 | * ext4_da_writepages() will loop calling this before | ||
2248 | * the block allocation. | ||
2249 | */ | 1905 | */ |
1906 | #define MAX_WRITEPAGES_EXTENT_LEN 2048 | ||
2250 | 1907 | ||
2251 | static int ext4_da_writepages_trans_blocks(struct inode *inode) | 1908 | /* |
1909 | * mpage_add_bh_to_extent - try to add bh to extent of blocks to map | ||
1910 | * | ||
1911 | * @mpd - extent of blocks | ||
1912 | * @lblk - logical number of the block in the file | ||
1913 | * @b_state - b_state of the buffer head added | ||
1914 | * | ||
1915 | * the function is used to collect contiguous blocks in the same state | ||
1916 | */ | ||
1917 | static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, | ||
1918 | unsigned long b_state) | ||
1919 | { | ||
1920 | struct ext4_map_blocks *map = &mpd->map; | ||
1921 | |||
1922 | /* Don't go larger than mballoc is willing to allocate */ | ||
1923 | if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) | ||
1924 | return 0; | ||
1925 | |||
1926 | /* First block in the extent? */ | ||
1927 | if (map->m_len == 0) { | ||
1928 | map->m_lblk = lblk; | ||
1929 | map->m_len = 1; | ||
1930 | map->m_flags = b_state & BH_FLAGS; | ||
1931 | return 1; | ||
1932 | } | ||
1933 | |||
1934 | /* Can we merge the block to our big extent? */ | ||
1935 | if (lblk == map->m_lblk + map->m_len && | ||
1936 | (b_state & BH_FLAGS) == map->m_flags) { | ||
1937 | map->m_len++; | ||
1938 | return 1; | ||
1939 | } | ||
1940 | return 0; | ||
1941 | } | ||
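The merge rule above can be exercised outside the kernel. A minimal userspace model, assuming a 2048-block allocator cap as in MAX_WRITEPAGES_EXTENT_LEN; the struct and function names are hypothetical stand-ins, not ext4 API:

#include <stdio.h>

struct extent { unsigned long lblk; unsigned len; unsigned flags; };

/* Mirror of the three cases above: start, merge, or signal a flush.
 * A real caller flushes the pending extent and re-adds the block. */
static int try_add(struct extent *e, unsigned long lblk, unsigned flags,
                   unsigned max_len)
{
	if (e->len >= max_len)
		return 0;               /* don't outgrow the allocator */
	if (e->len == 0) {              /* first block starts the extent */
		e->lblk = lblk;
		e->len = 1;
		e->flags = flags;
		return 1;
	}
	if (lblk == e->lblk + e->len && flags == e->flags) {
		e->len++;               /* logically adjacent, same state */
		return 1;
	}
	return 0;                       /* discontiguous or mixed state */
}

int main(void)
{
	struct extent e = { 0, 0, 0 };
	unsigned long blocks[] = { 10, 11, 12, 14 }; /* 14 breaks adjacency */
	int i;

	for (i = 0; i < 4; i++)
		if (!try_add(&e, blocks[i], 0x1, 2048))
			printf("flush [%lu, +%u), then restart at %lu\n",
			       e.lblk, e.len, blocks[i]);
	printf("pending extent [%lu, +%u)\n", e.lblk, e.len);
	return 0;
}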
1942 | |||
1943 | static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, | ||
1944 | struct buffer_head *head, | ||
1945 | struct buffer_head *bh, | ||
1946 | ext4_lblk_t lblk) | ||
1947 | { | ||
1948 | struct inode *inode = mpd->inode; | ||
1949 | ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) | ||
1950 | >> inode->i_blkbits; | ||
1951 | |||
1952 | do { | ||
1953 | BUG_ON(buffer_locked(bh)); | ||
1954 | |||
1955 | if (!buffer_dirty(bh) || !buffer_mapped(bh) || | ||
1956 | (!buffer_delay(bh) && !buffer_unwritten(bh)) || | ||
1957 | lblk >= blocks) { | ||
1958 | /* Found extent to map? */ | ||
1959 | if (mpd->map.m_len) | ||
1960 | return false; | ||
1961 | if (lblk >= blocks) | ||
1962 | return true; | ||
1963 | continue; | ||
1964 | } | ||
1965 | if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) | ||
1966 | return false; | ||
1967 | } while (lblk++, (bh = bh->b_this_page) != head); | ||
1968 | return true; | ||
1969 | } | ||
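Ignoring the EOF cutoff (lblk >= blocks), the skip condition in the loop above reduces to one predicate: only a dirty, mapped buffer that is still delayed or unwritten joins the extent. A standalone restatement with illustrative names:

#include <stdio.h>
#include <stdbool.h>

struct buf { bool dirty, mapped, delay, unwritten; };

/* Inverse of the if () chain above: true iff the buffer needs mapping. */
static bool needs_map(const struct buf *b)
{
	return b->dirty && b->mapped && (b->delay || b->unwritten);
}

int main(void)
{
	struct buf delayed = { .dirty = true, .mapped = true, .delay = true };
	struct buf written = { .dirty = true, .mapped = true };

	printf("%d %d\n", needs_map(&delayed), needs_map(&written)); /* 1 0 */
	return 0;
}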
1970 | |||
1971 | static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) | ||
2252 | { | 1972 | { |
2253 | int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 1973 | int len; |
1974 | loff_t size = i_size_read(mpd->inode); | ||
1975 | int err; | ||
1976 | |||
1977 | BUG_ON(page->index != mpd->first_page); | ||
1978 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
1979 | len = size & ~PAGE_CACHE_MASK; | ||
1980 | else | ||
1981 | len = PAGE_CACHE_SIZE; | ||
1982 | clear_page_dirty_for_io(page); | ||
1983 | err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); | ||
1984 | if (!err) | ||
1985 | mpd->wbc->nr_to_write--; | ||
1986 | mpd->first_page++; | ||
2254 | 1987 | ||
1988 | return err; | ||
1989 | } | ||
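The len computation above trims IO on the page that straddles EOF. A worked instance, assuming 4096-byte pages; PAGE_SHIFT/PAGE_SIZE/PAGE_MASK below are local stand-ins for the kernel's PAGE_CACHE_* macros:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

static unsigned long write_len(unsigned long index, unsigned long i_size)
{
	if (index == i_size >> PAGE_SHIFT)
		return i_size & ~PAGE_MASK;  /* bytes in use on the EOF page */
	return PAGE_SIZE;                    /* interior pages are written whole */
}

int main(void)
{
	unsigned long idx;

	/* i_size = 10000: pages 0 and 1 are full, page 2 holds 1808 bytes */
	for (idx = 0; idx < 3; idx++)
		printf("page %lu -> len %lu\n", idx, write_len(idx, 10000));
	return 0;
}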
1990 | |||
1991 | /* | ||
1992 | * mpage_map_and_submit_buffers - update buffers corresponding to the | ||
1993 | * changed extent and submit fully mapped pages for IO | ||
1994 | * | ||
1995 | * @mpd - description of extent to map, on return next extent to map | ||
1996 | * | ||
1997 | * Scan buffers corresponding to the changed extent (we expect corresponding pages | ||
1998 | * to be already locked) and update buffer state according to new extent state. | ||
1999 | * We map delalloc buffers to their physical location, clear unwritten bits, | ||
2000 | * and mark buffers as uninit when we perform writes to uninitialized extents | ||
2001 | * and do extent conversion after IO is finished. If the last page is not fully | ||
2002 | * mapped, we update @map to the next extent in the last page that needs | ||
2003 | * mapping. Otherwise we submit the page for IO. | ||
2004 | */ | ||
2005 | static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) | ||
2006 | { | ||
2007 | struct pagevec pvec; | ||
2008 | int nr_pages, i; | ||
2009 | struct inode *inode = mpd->inode; | ||
2010 | struct buffer_head *head, *bh; | ||
2011 | int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; | ||
2012 | ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) | ||
2013 | >> inode->i_blkbits; | ||
2014 | pgoff_t start, end; | ||
2015 | ext4_lblk_t lblk; | ||
2016 | sector_t pblock; | ||
2017 | int err; | ||
2018 | |||
2019 | start = mpd->map.m_lblk >> bpp_bits; | ||
2020 | end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; | ||
2021 | lblk = start << bpp_bits; | ||
2022 | pblock = mpd->map.m_pblk; | ||
2023 | |||
2024 | pagevec_init(&pvec, 0); | ||
2025 | while (start <= end) { | ||
2026 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, | ||
2027 | PAGEVEC_SIZE); | ||
2028 | if (nr_pages == 0) | ||
2029 | break; | ||
2030 | for (i = 0; i < nr_pages; i++) { | ||
2031 | struct page *page = pvec.pages[i]; | ||
2032 | |||
2033 | if (page->index > end) | ||
2034 | break; | ||
2035 | /* Up to 'end' pages must be contiguous */ | ||
2036 | BUG_ON(page->index != start); | ||
2037 | bh = head = page_buffers(page); | ||
2038 | do { | ||
2039 | if (lblk < mpd->map.m_lblk) | ||
2040 | continue; | ||
2041 | if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { | ||
2042 | /* | ||
2043 | * Buffer after end of mapped extent. | ||
2044 | * Find next buffer in the page to map. | ||
2045 | */ | ||
2046 | mpd->map.m_len = 0; | ||
2047 | mpd->map.m_flags = 0; | ||
2048 | add_page_bufs_to_extent(mpd, head, bh, | ||
2049 | lblk); | ||
2050 | pagevec_release(&pvec); | ||
2051 | return 0; | ||
2052 | } | ||
2053 | if (buffer_delay(bh)) { | ||
2054 | clear_buffer_delay(bh); | ||
2055 | bh->b_blocknr = pblock++; | ||
2056 | } | ||
2057 | clear_buffer_unwritten(bh); | ||
2058 | } while (++lblk < blocks && | ||
2059 | (bh = bh->b_this_page) != head); | ||
2060 | |||
2061 | /* | ||
2062 | * FIXME: This is going to break if dioread_nolock | ||
2063 | * supports blocksize < pagesize as we will try to | ||
2064 | * convert potentially unmapped parts of inode. | ||
2065 | */ | ||
2066 | mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; | ||
2067 | /* Page fully mapped - let IO run! */ | ||
2068 | err = mpage_submit_page(mpd, page); | ||
2069 | if (err < 0) { | ||
2070 | pagevec_release(&pvec); | ||
2071 | return err; | ||
2072 | } | ||
2073 | start++; | ||
2074 | } | ||
2075 | pagevec_release(&pvec); | ||
2076 | } | ||
2077 | /* Extent fully mapped and matches the page boundary. We are done. */ | ||
2078 | mpd->map.m_len = 0; | ||
2079 | mpd->map.m_flags = 0; | ||
2080 | return 0; | ||
2081 | } | ||
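The start/end/lblk arithmetic at the top of this function is easy to get backwards; here is a worked instance, assuming 4K pages and 1K blocks so bpp_bits = 2 (plain C, illustrative names):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12, blkbits = 10;
	unsigned bpp_bits = page_shift - blkbits;   /* log2(blocks per page) */
	unsigned long m_lblk = 5, m_len = 6;        /* mapped blocks 5..10 */

	unsigned long start = m_lblk >> bpp_bits;              /* page 1 */
	unsigned long end = (m_lblk + m_len - 1) >> bpp_bits;  /* page 2 */
	unsigned long lblk = start << bpp_bits;     /* page 1 starts at block 4 */

	/* Block 4 is below m_lblk, so the walk skips it via the continue. */
	printf("pages %lu..%lu, buffer walk starts at block %lu\n",
	       start, end, lblk);
	return 0;
}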
2082 | |||
2083 | static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) | ||
2084 | { | ||
2085 | struct inode *inode = mpd->inode; | ||
2086 | struct ext4_map_blocks *map = &mpd->map; | ||
2087 | int get_blocks_flags; | ||
2088 | int err; | ||
2089 | |||
2090 | trace_ext4_da_write_pages_extent(inode, map); | ||
2255 | /* | 2091 | /* |
2256 | * With non-extent format the journal credit needed to | 2092 | * Call ext4_map_blocks() to allocate any delayed allocation blocks, or |
2257 | * insert nrblocks contiguous block is dependent on | 2093 | * to convert an uninitialized extent to be initialized (in the case |
2258 | * number of contiguous block. So we will limit | 2094 | * where we have written into one or more preallocated blocks). It is |
2259 | * number of contiguous block to a sane value | 2095 | * possible that we're going to need more metadata blocks than |
2096 | * previously reserved. However we must not fail because we're in | ||
2097 | * writeback and there is nothing we can do about it so it might result | ||
2098 | * in data loss. So use reserved blocks to allocate metadata if | ||
2099 | * possible. | ||
2100 | * | ||
2101 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks | ||
2102 | * in question are delalloc blocks. This affects functions in many | ||
2103 | * different parts of the allocation call path. This flag exists | ||
2104 | * primarily because we don't want to change *many* call functions, so | ||
2105 | * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag | ||
2106 | * once the inode's allocation semaphore is taken. | ||
2260 | */ | 2107 | */ |
2261 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && | 2108 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | |
2262 | (max_blocks > EXT4_MAX_TRANS_DATA)) | 2109 | EXT4_GET_BLOCKS_METADATA_NOFAIL; |
2263 | max_blocks = EXT4_MAX_TRANS_DATA; | 2110 | if (ext4_should_dioread_nolock(inode)) |
2111 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | ||
2112 | if (map->m_flags & (1 << BH_Delay)) | ||
2113 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | ||
2264 | 2114 | ||
2265 | return ext4_chunk_trans_blocks(inode, max_blocks); | 2115 | err = ext4_map_blocks(handle, inode, map, get_blocks_flags); |
2116 | if (err < 0) | ||
2117 | return err; | ||
2118 | if (map->m_flags & EXT4_MAP_UNINIT) { | ||
2119 | if (!mpd->io_submit.io_end->handle && | ||
2120 | ext4_handle_valid(handle)) { | ||
2121 | mpd->io_submit.io_end->handle = handle->h_rsv_handle; | ||
2122 | handle->h_rsv_handle = NULL; | ||
2123 | } | ||
2124 | ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); | ||
2125 | } | ||
2126 | |||
2127 | BUG_ON(map->m_len == 0); | ||
2128 | if (map->m_flags & EXT4_MAP_NEW) { | ||
2129 | struct block_device *bdev = inode->i_sb->s_bdev; | ||
2130 | int i; | ||
2131 | |||
2132 | for (i = 0; i < map->m_len; i++) | ||
2133 | unmap_underlying_metadata(bdev, map->m_pblk + i); | ||
2134 | } | ||
2135 | return 0; | ||
2266 | } | 2136 | } |
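The allocation flags above compose by OR-ing; a small model of the decision, using stand-in flag values rather than the real EXT4_GET_BLOCKS_* constants:

#include <stdio.h>

#define GB_CREATE           0x01  /* allocate blocks if missing */
#define GB_METADATA_NOFAIL  0x02  /* dip into reserve for metadata */
#define GB_IO_CREATE_EXT    0x04  /* allocate unwritten for dioread_nolock */
#define GB_DELALLOC_RESERVE 0x08  /* update delalloc accounting */

static unsigned map_flags(int dioread_nolock, int has_delayed)
{
	unsigned flags = GB_CREATE | GB_METADATA_NOFAIL; /* writeback must not fail */
	if (dioread_nolock)
		flags |= GB_IO_CREATE_EXT;
	if (has_delayed)
		flags |= GB_DELALLOC_RESERVE;
	return flags;
}

int main(void)
{
	printf("delalloc + dioread_nolock -> flags 0x%x\n", map_flags(1, 1));
	return 0;
}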
2267 | 2137 | ||
2268 | /* | 2138 | /* |
2269 | * write_cache_pages_da - walk the list of dirty pages of the given | 2139 | * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length |
2270 | * address space and accumulate pages that need writing, and call | 2140 | * mpd->len and submit pages underlying it for IO |
2271 | * mpage_da_map_and_submit to map a single contiguous memory region | 2141 | * |
2272 | * and then write them. | 2142 | * @handle - handle for journal operations |
2143 | * @mpd - extent to map | ||
2144 | * | ||
2145 | * The function maps the extent starting at mpd->lblk of length mpd->len. If it | ||
2146 | * is delayed, blocks are allocated; if it is unwritten, we may need to convert | ||
2147 | * them to initialized or split the described range from a larger unwritten | ||
2148 | * extent. Note that we need not map all of the described range since allocation | ||
2149 | * can return fewer blocks or the range is covered by more unwritten extents. We | ||
2150 | * cannot map more because we are limited by reserved transaction credits. On | ||
2151 | * the other hand we always make sure that the last touched page is fully | ||
2152 | * mapped so that it can be written out (and thus forward progress is | ||
2153 | * guaranteed). After mapping we submit all mapped pages for IO. | ||
2273 | */ | 2154 | */ |
2274 | static int write_cache_pages_da(handle_t *handle, | 2155 | static int mpage_map_and_submit_extent(handle_t *handle, |
2275 | struct address_space *mapping, | 2156 | struct mpage_da_data *mpd, |
2276 | struct writeback_control *wbc, | 2157 | bool *give_up_on_write) |
2277 | struct mpage_da_data *mpd, | ||
2278 | pgoff_t *done_index) | ||
2279 | { | 2158 | { |
2280 | struct buffer_head *bh, *head; | 2159 | struct inode *inode = mpd->inode; |
2281 | struct inode *inode = mapping->host; | 2160 | struct ext4_map_blocks *map = &mpd->map; |
2282 | struct pagevec pvec; | 2161 | int err; |
2283 | unsigned int nr_pages; | 2162 | loff_t disksize; |
2284 | sector_t logical; | ||
2285 | pgoff_t index, end; | ||
2286 | long nr_to_write = wbc->nr_to_write; | ||
2287 | int i, tag, ret = 0; | ||
2288 | |||
2289 | memset(mpd, 0, sizeof(struct mpage_da_data)); | ||
2290 | mpd->wbc = wbc; | ||
2291 | mpd->inode = inode; | ||
2292 | pagevec_init(&pvec, 0); | ||
2293 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | ||
2294 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | ||
2295 | 2163 | ||
2296 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2164 | mpd->io_submit.io_end->offset = |
2165 | ((loff_t)map->m_lblk) << inode->i_blkbits; | ||
2166 | while (map->m_len) { | ||
2167 | err = mpage_map_one_extent(handle, mpd); | ||
2168 | if (err < 0) { | ||
2169 | struct super_block *sb = inode->i_sb; | ||
2170 | |||
2171 | if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) | ||
2172 | goto invalidate_dirty_pages; | ||
2173 | /* | ||
2174 | * Let the upper layers retry transient errors. | ||
2175 | * In the case of ENOSPC, if ext4_count_free_clusters() | ||
2176 | * is non-zero, a commit should free up blocks. | ||
2177 | */ | ||
2178 | if ((err == -ENOMEM) || | ||
2179 | (err == -ENOSPC && ext4_count_free_clusters(sb))) | ||
2180 | return err; | ||
2181 | ext4_msg(sb, KERN_CRIT, | ||
2182 | "Delayed block allocation failed for " | ||
2183 | "inode %lu at logical offset %llu with" | ||
2184 | " max blocks %u with error %d", | ||
2185 | inode->i_ino, | ||
2186 | (unsigned long long)map->m_lblk, | ||
2187 | (unsigned)map->m_len, -err); | ||
2188 | ext4_msg(sb, KERN_CRIT, | ||
2189 | "This should not happen!! Data will " | ||
2190 | "be lost\n"); | ||
2191 | if (err == -ENOSPC) | ||
2192 | ext4_print_free_blocks(inode); | ||
2193 | invalidate_dirty_pages: | ||
2194 | *give_up_on_write = true; | ||
2195 | return err; | ||
2196 | } | ||
2197 | /* | ||
2198 | * Update buffer state, submit mapped pages, and get us new | ||
2199 | * extent to map | ||
2200 | */ | ||
2201 | err = mpage_map_and_submit_buffers(mpd); | ||
2202 | if (err < 0) | ||
2203 | return err; | ||
2204 | } | ||
2205 | |||
2206 | /* Update on-disk size after IO is submitted */ | ||
2207 | disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; | ||
2208 | if (disksize > i_size_read(inode)) | ||
2209 | disksize = i_size_read(inode); | ||
2210 | if (disksize > EXT4_I(inode)->i_disksize) { | ||
2211 | int err2; | ||
2212 | |||
2213 | ext4_update_i_disksize(inode, disksize); | ||
2214 | err2 = ext4_mark_inode_dirty(handle, inode); | ||
2215 | if (err2) | ||
2216 | ext4_error(inode->i_sb, | ||
2217 | "Failed to mark inode %lu dirty", | ||
2218 | inode->i_ino); | ||
2219 | if (!err) | ||
2220 | err = err2; | ||
2221 | } | ||
2222 | return err; | ||
2223 | } | ||
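The closing i_disksize update follows a clamp-then-grow pattern: the candidate size is everything up to the first unsubmitted page, capped at the in-core i_size, and i_disksize only ever moves forward. A standalone check, assuming 4K pages:

#include <stdio.h>

int main(void)
{
	unsigned long long i_size = 10000, i_disksize = 4096;
	unsigned long first_page = 3;          /* pages 0..2 were submitted */

	unsigned long long disksize = (unsigned long long)first_page << 12;
	if (disksize > i_size)
		disksize = i_size;             /* never exceed in-core size */
	if (disksize > i_disksize)
		i_disksize = disksize;         /* only ever grow on writeback */

	printf("i_disksize now %llu\n", i_disksize); /* 10000, not 12288 */
	return 0;
}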
2224 | |||
2225 | /* | ||
2226 | * Calculate the total number of credits to reserve for one writepages | ||
2227 | * iteration. This is called from ext4_writepages(). We map an extent of | ||
2228 | * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping | ||
2229 | * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + | ||
2230 | * bpp - 1 blocks in bpp different extents. | ||
2231 | */ | ||
2232 | static int ext4_da_writepages_trans_blocks(struct inode *inode) | ||
2233 | { | ||
2234 | int bpp = ext4_journal_blocks_per_page(inode); | ||
2235 | |||
2236 | return ext4_meta_trans_blocks(inode, | ||
2237 | MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); | ||
2238 | } | ||
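Plugging numbers into the bound described above, assuming 1K blocks on 4K pages (bpp = 4):

#include <stdio.h>

int main(void)
{
	int bpp = 4, max_extent = 2048;  /* MAX_WRITEPAGES_EXTENT_LEN */

	/* One iteration: the extent plus the tail of its last partial page,
	 * split across at most bpp discontiguous extents. */
	printf("credits sized for %d blocks in %d extents\n",
	       max_extent + bpp - 1, bpp);
	return 0;
}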
2239 | |||
2240 | /* | ||
2241 | * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages | ||
2242 | * and underlying extent to map | ||
2243 | * | ||
2244 | * @mpd - where to look for pages | ||
2245 | * | ||
2246 | * Walk dirty pages in the mapping. If they are fully mapped, submit them for | ||
2247 | * IO immediately. When we find a page which isn't mapped we start accumulating | ||
2248 | * extent of buffers underlying these pages that needs mapping (formed by | ||
2249 | * either delayed or unwritten buffers). We also lock the pages containing | ||
2250 | * these buffers. The extent found is returned in the @mpd structure (starting at | ||
2251 | * mpd->lblk with length mpd->len blocks). | ||
2252 | * | ||
2253 | * Note that this function can attach bios to one io_end structure which are | ||
2254 | * neither logically nor physically contiguous. Although it may seem like an | ||
2255 | * unnecessary complication, it is actually inevitable in the blocksize < pagesize | ||
2256 | * case, as we need to track IO to all buffers underlying a page in one io_end. | ||
2257 | */ | ||
2258 | static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) | ||
2259 | { | ||
2260 | struct address_space *mapping = mpd->inode->i_mapping; | ||
2261 | struct pagevec pvec; | ||
2262 | unsigned int nr_pages; | ||
2263 | pgoff_t index = mpd->first_page; | ||
2264 | pgoff_t end = mpd->last_page; | ||
2265 | int tag; | ||
2266 | int i, err = 0; | ||
2267 | int blkbits = mpd->inode->i_blkbits; | ||
2268 | ext4_lblk_t lblk; | ||
2269 | struct buffer_head *head; | ||
2270 | |||
2271 | if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) | ||
2297 | tag = PAGECACHE_TAG_TOWRITE; | 2272 | tag = PAGECACHE_TAG_TOWRITE; |
2298 | else | 2273 | else |
2299 | tag = PAGECACHE_TAG_DIRTY; | 2274 | tag = PAGECACHE_TAG_DIRTY; |
2300 | 2275 | ||
2301 | *done_index = index; | 2276 | pagevec_init(&pvec, 0); |
2277 | mpd->map.m_len = 0; | ||
2278 | mpd->next_page = index; | ||
2302 | while (index <= end) { | 2279 | while (index <= end) { |
2303 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | 2280 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, |
2304 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | 2281 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); |
2305 | if (nr_pages == 0) | 2282 | if (nr_pages == 0) |
2306 | return 0; | 2283 | goto out; |
2307 | 2284 | ||
2308 | for (i = 0; i < nr_pages; i++) { | 2285 | for (i = 0; i < nr_pages; i++) { |
2309 | struct page *page = pvec.pages[i]; | 2286 | struct page *page = pvec.pages[i]; |
@@ -2318,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle, | |||
2318 | if (page->index > end) | 2295 | if (page->index > end) |
2319 | goto out; | 2296 | goto out; |
2320 | 2297 | ||
2321 | *done_index = page->index + 1; | 2298 | /* If we can't merge this page, we are done. */ |
2322 | 2299 | if (mpd->map.m_len > 0 && mpd->next_page != page->index) | |
2323 | /* | 2300 | goto out; |
2324 | * If we can't merge this page, and we have | ||
2325 | * accumulated a contiguous region, write it | ||
2326 | */ | ||
2327 | if ((mpd->next_page != page->index) && | ||
2328 | (mpd->next_page != mpd->first_page)) { | ||
2329 | mpage_da_map_and_submit(mpd); | ||
2330 | goto ret_extent_tail; | ||
2331 | } | ||
2332 | 2301 | ||
2333 | lock_page(page); | 2302 | lock_page(page); |
2334 | |||
2335 | /* | 2303 | /* |
2336 | * If the page is no longer dirty, or its | 2304 | * If the page is no longer dirty, or its mapping no |
2337 | * mapping no longer corresponds to inode we | 2305 | * longer corresponds to inode we are writing (which |
2338 | * are writing (which means it has been | 2306 | * means it has been truncated or invalidated), or the |
2339 | * truncated or invalidated), or the page is | 2307 | * page is already under writeback and we are not doing |
2340 | * already under writeback and we are not | 2308 | * a data integrity writeback, skip the page |
2341 | * doing a data integrity writeback, skip the page | ||
2342 | */ | 2309 | */ |
2343 | if (!PageDirty(page) || | 2310 | if (!PageDirty(page) || |
2344 | (PageWriteback(page) && | 2311 | (PageWriteback(page) && |
2345 | (wbc->sync_mode == WB_SYNC_NONE)) || | 2312 | (mpd->wbc->sync_mode == WB_SYNC_NONE)) || |
2346 | unlikely(page->mapping != mapping)) { | 2313 | unlikely(page->mapping != mapping)) { |
2347 | unlock_page(page); | 2314 | unlock_page(page); |
2348 | continue; | 2315 | continue; |
@@ -2351,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle, | |||
2351 | wait_on_page_writeback(page); | 2318 | wait_on_page_writeback(page); |
2352 | BUG_ON(PageWriteback(page)); | 2319 | BUG_ON(PageWriteback(page)); |
2353 | 2320 | ||
2354 | /* | 2321 | if (mpd->map.m_len == 0) |
2355 | * If we have inline data and arrive here, it means that | ||
2356 | * we will soon create the block for the 1st page, so | ||
2357 | * we'd better clear the inline data here. | ||
2358 | */ | ||
2359 | if (ext4_has_inline_data(inode)) { | ||
2360 | BUG_ON(ext4_test_inode_state(inode, | ||
2361 | EXT4_STATE_MAY_INLINE_DATA)); | ||
2362 | ext4_destroy_inline_data(handle, inode); | ||
2363 | } | ||
2364 | |||
2365 | if (mpd->next_page != page->index) | ||
2366 | mpd->first_page = page->index; | 2322 | mpd->first_page = page->index; |
2367 | mpd->next_page = page->index + 1; | 2323 | mpd->next_page = page->index + 1; |
2368 | logical = (sector_t) page->index << | ||
2369 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
2370 | |||
2371 | /* Add all dirty buffers to mpd */ | 2324 | /* Add all dirty buffers to mpd */ |
2325 | lblk = ((ext4_lblk_t)page->index) << | ||
2326 | (PAGE_CACHE_SHIFT - blkbits); | ||
2372 | head = page_buffers(page); | 2327 | head = page_buffers(page); |
2373 | bh = head; | 2328 | if (!add_page_bufs_to_extent(mpd, head, head, lblk)) |
2374 | do { | 2329 | goto out; |
2375 | BUG_ON(buffer_locked(bh)); | 2330 | /* So far everything mapped? Submit the page for IO. */ |
2376 | /* | 2331 | if (mpd->map.m_len == 0) { |
2377 | * We need to try to allocate unmapped blocks | 2332 | err = mpage_submit_page(mpd, page); |
2378 | * in the same page. Otherwise we won't make | 2333 | if (err < 0) |
2379 | * progress with the page in ext4_writepage | ||
2380 | */ | ||
2381 | if (ext4_bh_delay_or_unwritten(NULL, bh)) { | ||
2382 | mpage_add_bh_to_extent(mpd, logical, | ||
2383 | bh->b_state); | ||
2384 | if (mpd->io_done) | ||
2385 | goto ret_extent_tail; | ||
2386 | } else if (buffer_dirty(bh) && | ||
2387 | buffer_mapped(bh)) { | ||
2388 | /* | ||
2389 | * mapped dirty buffer. We need to | ||
2390 | * update the b_state because we look | ||
2391 | * at b_state in mpage_da_map_blocks. | ||
2392 | * We don't update b_size because if we | ||
2393 | * find an unmapped buffer_head later | ||
2394 | * we need to use the b_state flag of | ||
2395 | * that buffer_head. | ||
2396 | */ | ||
2397 | if (mpd->b_size == 0) | ||
2398 | mpd->b_state = | ||
2399 | bh->b_state & BH_FLAGS; | ||
2400 | } | ||
2401 | logical++; | ||
2402 | } while ((bh = bh->b_this_page) != head); | ||
2403 | |||
2404 | if (nr_to_write > 0) { | ||
2405 | nr_to_write--; | ||
2406 | if (nr_to_write == 0 && | ||
2407 | wbc->sync_mode == WB_SYNC_NONE) | ||
2408 | /* | ||
2409 | * We stop writing back only if we are | ||
2410 | * not doing integrity sync. In case of | ||
2411 | * integrity sync we have to keep going | ||
2412 | * because someone may be concurrently | ||
2413 | * dirtying pages, and we might have | ||
2414 | * synced a lot of newly appeared dirty | ||
2415 | * pages, but have not synced all of the | ||
2416 | * old dirty pages. | ||
2417 | */ | ||
2418 | goto out; | 2334 | goto out; |
2419 | } | 2335 | } |
2336 | |||
2337 | /* | ||
2338 | * Accumulated enough dirty pages? This doesn't apply | ||
2339 | * to WB_SYNC_ALL mode. For integrity sync we have to | ||
2340 | * keep going because someone may be concurrently | ||
2341 | * dirtying pages, and we might have synced a lot of | ||
2342 | * newly appeared dirty pages, but have not synced all | ||
2343 | * of the old dirty pages. | ||
2344 | */ | ||
2345 | if (mpd->wbc->sync_mode == WB_SYNC_NONE && | ||
2346 | mpd->next_page - mpd->first_page >= | ||
2347 | mpd->wbc->nr_to_write) | ||
2348 | goto out; | ||
2420 | } | 2349 | } |
2421 | pagevec_release(&pvec); | 2350 | pagevec_release(&pvec); |
2422 | cond_resched(); | 2351 | cond_resched(); |
2423 | } | 2352 | } |
2424 | return 0; | 2353 | return 0; |
2425 | ret_extent_tail: | ||
2426 | ret = MPAGE_DA_EXTENT_TAIL; | ||
2427 | out: | 2354 | out: |
2428 | pagevec_release(&pvec); | 2355 | pagevec_release(&pvec); |
2429 | cond_resched(); | 2356 | return err; |
2430 | return ret; | ||
2431 | } | 2357 | } |
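The scan's stop rule differs by sync mode, as the comment above notes. A compact restatement, with WB_* used as illustrative enums rather than the kernel's writeback definitions:

#include <stdio.h>
#include <stdbool.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

static bool should_stop(enum sync_mode mode, long queued, long nr_to_write)
{
	if (mode == WB_SYNC_ALL)
		return false;          /* keep going: others may be dirtying */
	return queued >= nr_to_write;  /* budget applies to background sync */
}

int main(void)
{
	printf("%d %d\n", should_stop(WB_SYNC_NONE, 16, 16),
	       should_stop(WB_SYNC_ALL, 16, 16)); /* 1 0 */
	return 0;
}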
2432 | 2358 | ||
2359 | static int __writepage(struct page *page, struct writeback_control *wbc, | ||
2360 | void *data) | ||
2361 | { | ||
2362 | struct address_space *mapping = data; | ||
2363 | int ret = ext4_writepage(page, wbc); | ||
2364 | mapping_set_error(mapping, ret); | ||
2365 | return ret; | ||
2366 | } | ||
2433 | 2367 | ||
2434 | static int ext4_da_writepages(struct address_space *mapping, | 2368 | static int ext4_writepages(struct address_space *mapping, |
2435 | struct writeback_control *wbc) | 2369 | struct writeback_control *wbc) |
2436 | { | 2370 | { |
2437 | pgoff_t index; | 2371 | pgoff_t writeback_index = 0; |
2372 | long nr_to_write = wbc->nr_to_write; | ||
2438 | int range_whole = 0; | 2373 | int range_whole = 0; |
2374 | int cycled = 1; | ||
2439 | handle_t *handle = NULL; | 2375 | handle_t *handle = NULL; |
2440 | struct mpage_da_data mpd; | 2376 | struct mpage_da_data mpd; |
2441 | struct inode *inode = mapping->host; | 2377 | struct inode *inode = mapping->host; |
2442 | int pages_written = 0; | 2378 | int needed_blocks, rsv_blocks = 0, ret = 0; |
2443 | unsigned int max_pages; | ||
2444 | int range_cyclic, cycled = 1, io_done = 0; | ||
2445 | int needed_blocks, ret = 0; | ||
2446 | long desired_nr_to_write, nr_to_writebump = 0; | ||
2447 | loff_t range_start = wbc->range_start; | ||
2448 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2379 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
2449 | pgoff_t done_index = 0; | 2380 | bool done; |
2450 | pgoff_t end; | ||
2451 | struct blk_plug plug; | 2381 | struct blk_plug plug; |
2382 | bool give_up_on_write = false; | ||
2452 | 2383 | ||
2453 | trace_ext4_da_writepages(inode, wbc); | 2384 | trace_ext4_writepages(inode, wbc); |
2454 | 2385 | ||
2455 | /* | 2386 | /* |
2456 | * No pages to write? This is mainly a kludge to avoid starting | 2387 | * No pages to write? This is mainly a kludge to avoid starting |
@@ -2460,164 +2391,165 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2460 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | 2391 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
2461 | return 0; | 2392 | return 0; |
2462 | 2393 | ||
2394 | if (ext4_should_journal_data(inode)) { | ||
2395 | struct blk_plug plug; | ||
2396 | int ret; | ||
2397 | |||
2398 | blk_start_plug(&plug); | ||
2399 | ret = write_cache_pages(mapping, wbc, __writepage, mapping); | ||
2400 | blk_finish_plug(&plug); | ||
2401 | return ret; | ||
2402 | } | ||
2403 | |||
2463 | /* | 2404 | /* |
2464 | * If the filesystem has aborted, it is read-only, so return | 2405 | * If the filesystem has aborted, it is read-only, so return |
2465 | * right away instead of dumping stack traces later on that | 2406 | * right away instead of dumping stack traces later on that |
2466 | * will obscure the real source of the problem. We test | 2407 | * will obscure the real source of the problem. We test |
2467 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because | 2408 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because |
2468 | * the latter could be true if the filesystem is mounted | 2409 | * the latter could be true if the filesystem is mounted |
2469 | * read-only, and in that case, ext4_da_writepages should | 2410 | * read-only, and in that case, ext4_writepages should |
2470 | * *never* be called, so if that ever happens, we would want | 2411 | * *never* be called, so if that ever happens, we would want |
2471 | * the stack trace. | 2412 | * the stack trace. |
2472 | */ | 2413 | */ |
2473 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) | 2414 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) |
2474 | return -EROFS; | 2415 | return -EROFS; |
2475 | 2416 | ||
2417 | if (ext4_should_dioread_nolock(inode)) { | ||
2418 | /* | ||
2419 | * We may need to convert up to one extent per block in | ||
2420 | * the page and we may dirty the inode. | ||
2421 | */ | ||
2422 | rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); | ||
2423 | } | ||
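With 4K pages and 1K blocks the reservation above comes to five: one credit for the inode plus one possible conversion per block in a page. A one-line check:

#include <stdio.h>

int main(void)
{
	unsigned page_size = 4096, blkbits = 10;
	unsigned rsv_blocks = 1 + (page_size >> blkbits);

	printf("rsv_blocks = %u\n", rsv_blocks); /* 5 */
	return 0;
}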
2424 | |||
2425 | /* | ||
2426 | * If we have inline data and arrive here, it means that | ||
2427 | * we will soon create the block for the 1st page, so | ||
2428 | * we'd better clear the inline data here. | ||
2429 | */ | ||
2430 | if (ext4_has_inline_data(inode)) { | ||
2431 | /* Just inode will be modified... */ | ||
2432 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); | ||
2433 | if (IS_ERR(handle)) { | ||
2434 | ret = PTR_ERR(handle); | ||
2435 | goto out_writepages; | ||
2436 | } | ||
2437 | BUG_ON(ext4_test_inode_state(inode, | ||
2438 | EXT4_STATE_MAY_INLINE_DATA)); | ||
2439 | ext4_destroy_inline_data(handle, inode); | ||
2440 | ext4_journal_stop(handle); | ||
2441 | } | ||
2442 | |||
2476 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 2443 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
2477 | range_whole = 1; | 2444 | range_whole = 1; |
2478 | 2445 | ||
2479 | range_cyclic = wbc->range_cyclic; | ||
2480 | if (wbc->range_cyclic) { | 2446 | if (wbc->range_cyclic) { |
2481 | index = mapping->writeback_index; | 2447 | writeback_index = mapping->writeback_index; |
2482 | if (index) | 2448 | if (writeback_index) |
2483 | cycled = 0; | 2449 | cycled = 0; |
2484 | wbc->range_start = index << PAGE_CACHE_SHIFT; | 2450 | mpd.first_page = writeback_index; |
2485 | wbc->range_end = LLONG_MAX; | 2451 | mpd.last_page = -1; |
2486 | wbc->range_cyclic = 0; | ||
2487 | end = -1; | ||
2488 | } else { | 2452 | } else { |
2489 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 2453 | mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; |
2490 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 2454 | mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; |
2491 | } | ||
2492 | |||
2493 | /* | ||
2494 | * This works around two forms of stupidity. The first is in | ||
2495 | * the writeback code, which caps the maximum number of pages | ||
2496 | * written to be 1024 pages. This is wrong on multiple | ||
2497 | * levels; different architectures have a different page size, | ||
2498 | * which changes the maximum amount of data which gets | ||
2499 | * written. Secondly, 4 megabytes is way too small. XFS | ||
2500 | * forces this value to be 16 megabytes by multiplying | ||
2501 | * nr_to_write parameter by four, and then relies on its | ||
2502 | * allocator to allocate larger extents to make them | ||
2503 | * contiguous. Unfortunately this brings us to the second | ||
2504 | * stupidity, which is that ext4's mballoc code only allocates | ||
2505 | * at most 2048 blocks. So we force contiguous writes up to | ||
2506 | * the number of dirty blocks in the inode, or | ||
2507 | * sbi->max_writeback_mb_bump, whichever is smaller. | ||
2508 | */ | ||
2509 | max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); | ||
2510 | if (!range_cyclic && range_whole) { | ||
2511 | if (wbc->nr_to_write == LONG_MAX) | ||
2512 | desired_nr_to_write = wbc->nr_to_write; | ||
2513 | else | ||
2514 | desired_nr_to_write = wbc->nr_to_write * 8; | ||
2515 | } else | ||
2516 | desired_nr_to_write = ext4_num_dirty_pages(inode, index, | ||
2517 | max_pages); | ||
2518 | if (desired_nr_to_write > max_pages) | ||
2519 | desired_nr_to_write = max_pages; | ||
2520 | |||
2521 | if (wbc->nr_to_write < desired_nr_to_write) { | ||
2522 | nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; | ||
2523 | wbc->nr_to_write = desired_nr_to_write; | ||
2524 | } | 2455 | } |
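For range_cyclic the walk is at most two passes: from the remembered writeback_index to EOF, then once more over the skipped head of the file. A sketch of the schedule, with illustrative values:

#include <stdio.h>

int main(void)
{
	unsigned long writeback_index = 7;     /* remembered from the last run */
	int cycled = (writeback_index == 0);

	printf("pass 1: pages %lu..EOF\n", writeback_index);
	if (!cycled)                           /* wrap around exactly once */
		printf("pass 2: pages 0..%lu\n", writeback_index - 1);
	return 0;
}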
2525 | 2456 | ||
2457 | mpd.inode = inode; | ||
2458 | mpd.wbc = wbc; | ||
2459 | ext4_io_submit_init(&mpd.io_submit, wbc); | ||
2526 | retry: | 2460 | retry: |
2527 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2461 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2528 | tag_pages_for_writeback(mapping, index, end); | 2462 | tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); |
2529 | 2463 | done = false; | |
2530 | blk_start_plug(&plug); | 2464 | blk_start_plug(&plug); |
2531 | while (!ret && wbc->nr_to_write > 0) { | 2465 | while (!done && mpd.first_page <= mpd.last_page) { |
2466 | /* For each extent of pages we use new io_end */ | ||
2467 | mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); | ||
2468 | if (!mpd.io_submit.io_end) { | ||
2469 | ret = -ENOMEM; | ||
2470 | break; | ||
2471 | } | ||
2532 | 2472 | ||
2533 | /* | 2473 | /* |
2534 | * we insert one extent at a time. So we need | 2474 | * We have two constraints: We find one extent to map and we |
2535 | * credit needed for single extent allocation. | 2475 | * must always write out the whole page (makes a difference when
2536 | * journalled mode is currently not supported | 2476 | * blocksize < pagesize) so that we don't block on IO when we |
2537 | * by delalloc | 2477 | * try to write out the rest of the page. Journalled mode is |
2478 | * not supported by delalloc. | ||
2538 | */ | 2479 | */ |
2539 | BUG_ON(ext4_should_journal_data(inode)); | 2480 | BUG_ON(ext4_should_journal_data(inode)); |
2540 | needed_blocks = ext4_da_writepages_trans_blocks(inode); | 2481 | needed_blocks = ext4_da_writepages_trans_blocks(inode); |
2541 | 2482 | ||
2542 | /* start a new transaction*/ | 2483 | /* start a new transaction */ |
2543 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, | 2484 | handle = ext4_journal_start_with_reserve(inode, |
2544 | needed_blocks); | 2485 | EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); |
2545 | if (IS_ERR(handle)) { | 2486 | if (IS_ERR(handle)) { |
2546 | ret = PTR_ERR(handle); | 2487 | ret = PTR_ERR(handle); |
2547 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " | 2488 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " |
2548 | "%ld pages, ino %lu; err %d", __func__, | 2489 | "%ld pages, ino %lu; err %d", __func__, |
2549 | wbc->nr_to_write, inode->i_ino, ret); | 2490 | wbc->nr_to_write, inode->i_ino, ret); |
2550 | blk_finish_plug(&plug); | 2491 | /* Release allocated io_end */ |
2551 | goto out_writepages; | 2492 | ext4_put_io_end(mpd.io_submit.io_end); |
2493 | break; | ||
2552 | } | 2494 | } |
2553 | 2495 | ||
2554 | /* | 2496 | trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); |
2555 | * Now call write_cache_pages_da() to find the next | 2497 | ret = mpage_prepare_extent_to_map(&mpd); |
2556 | * contiguous region of logical blocks that need | 2498 | if (!ret) { |
2557 | * blocks to be allocated by ext4 and submit them. | 2499 | if (mpd.map.m_len) |
2558 | */ | 2500 | ret = mpage_map_and_submit_extent(handle, &mpd, |
2559 | ret = write_cache_pages_da(handle, mapping, | 2501 | &give_up_on_write); |
2560 | wbc, &mpd, &done_index); | 2502 | else { |
2561 | /* | 2503 | /* |
2562 | * If we have a contiguous extent of pages and we | 2504 | * We scanned the whole range (or exhausted |
2563 | * haven't done the I/O yet, map the blocks and submit | 2505 | * nr_to_write), submitted what was mapped and |
2564 | * them for I/O. | 2506 | * didn't find anything needing mapping. We are |
2565 | */ | 2507 | * done. |
2566 | if (!mpd.io_done && mpd.next_page != mpd.first_page) { | 2508 | */ |
2567 | mpage_da_map_and_submit(&mpd); | 2509 | done = true; |
2568 | ret = MPAGE_DA_EXTENT_TAIL; | 2510 | } |
2569 | } | 2511 | } |
2570 | trace_ext4_da_write_pages(inode, &mpd); | ||
2571 | wbc->nr_to_write -= mpd.pages_written; | ||
2572 | |||
2573 | ext4_journal_stop(handle); | 2512 | ext4_journal_stop(handle); |
2574 | 2513 | /* Submit prepared bio */ | |
2575 | if ((mpd.retval == -ENOSPC) && sbi->s_journal) { | 2514 | ext4_io_submit(&mpd.io_submit); |
2576 | /* commit the transaction which would | 2515 | /* Unlock pages we didn't use */ |
2516 | mpage_release_unused_pages(&mpd, give_up_on_write); | ||
2517 | /* Drop our io_end reference we got from init */ | ||
2518 | ext4_put_io_end(mpd.io_submit.io_end); | ||
2519 | |||
2520 | if (ret == -ENOSPC && sbi->s_journal) { | ||
2521 | /* | ||
2522 | * Commit the transaction which would | ||
2577 | * free blocks released in the transaction | 2523 | * free blocks released in the transaction |
2578 | * and try again | 2524 | * and try again |
2579 | */ | 2525 | */ |
2580 | jbd2_journal_force_commit_nested(sbi->s_journal); | 2526 | jbd2_journal_force_commit_nested(sbi->s_journal); |
2581 | ret = 0; | 2527 | ret = 0; |
2582 | } else if (ret == MPAGE_DA_EXTENT_TAIL) { | 2528 | continue; |
2583 | /* | 2529 | } |
2584 | * Got one extent now try with rest of the pages. | 2530 | /* Fatal error - ENOMEM, EIO... */ |
2585 | * If mpd.retval is set -EIO, journal is aborted. | 2531 | if (ret) |
2586 | * So we don't need to write any more. | ||
2587 | */ | ||
2588 | pages_written += mpd.pages_written; | ||
2589 | ret = mpd.retval; | ||
2590 | io_done = 1; | ||
2591 | } else if (wbc->nr_to_write) | ||
2592 | /* | ||
2593 | * There is no more writeout needed | ||
2593 | * or we requested a nonblocking writeout | ||
2595 | * and we found the device congested | ||
2596 | */ | ||
2597 | break; | 2532 | break; |
2598 | } | 2533 | } |
2599 | blk_finish_plug(&plug); | 2534 | blk_finish_plug(&plug); |
2600 | if (!io_done && !cycled) { | 2535 | if (!ret && !cycled) { |
2601 | cycled = 1; | 2536 | cycled = 1; |
2602 | index = 0; | 2537 | mpd.last_page = writeback_index - 1; |
2603 | wbc->range_start = index << PAGE_CACHE_SHIFT; | 2538 | mpd.first_page = 0; |
2604 | wbc->range_end = mapping->writeback_index - 1; | ||
2605 | goto retry; | 2539 | goto retry; |
2606 | } | 2540 | } |
2607 | 2541 | ||
2608 | /* Update index */ | 2542 | /* Update index */ |
2609 | wbc->range_cyclic = range_cyclic; | ||
2610 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 2543 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
2611 | /* | 2544 | /* |
2612 | * set the writeback_index so that range_cyclic | 2545 | * Set the writeback_index so that range_cyclic |
2613 | * mode will write it back later | 2546 | * mode will write it back later |
2614 | */ | 2547 | */ |
2615 | mapping->writeback_index = done_index; | 2548 | mapping->writeback_index = mpd.first_page; |
2616 | 2549 | ||
2617 | out_writepages: | 2550 | out_writepages: |
2618 | wbc->nr_to_write -= nr_to_writebump; | 2551 | trace_ext4_writepages_result(inode, wbc, ret, |
2619 | wbc->range_start = range_start; | 2552 | nr_to_write - wbc->nr_to_write); |
2620 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); | ||
2621 | return ret; | 2553 | return ret; |
2622 | } | 2554 | } |
2623 | 2555 | ||
@@ -2829,7 +2761,8 @@ static int ext4_da_write_end(struct file *file, | |||
2829 | return ret ? ret : copied; | 2761 | return ret ? ret : copied; |
2830 | } | 2762 | } |
2831 | 2763 | ||
2832 | static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | 2764 | static void ext4_da_invalidatepage(struct page *page, unsigned int offset, |
2765 | unsigned int length) | ||
2833 | { | 2766 | { |
2834 | /* | 2767 | /* |
2835 | * Drop reserved blocks | 2768 | * Drop reserved blocks |
@@ -2838,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | |||
2838 | if (!page_has_buffers(page)) | 2771 | if (!page_has_buffers(page)) |
2839 | goto out; | 2772 | goto out; |
2840 | 2773 | ||
2841 | ext4_da_page_release_reservation(page, offset); | 2774 | ext4_da_page_release_reservation(page, offset, length); |
2842 | 2775 | ||
2843 | out: | 2776 | out: |
2844 | ext4_invalidatepage(page, offset); | 2777 | ext4_invalidatepage(page, offset, length); |
2845 | 2778 | ||
2846 | return; | 2779 | return; |
2847 | } | 2780 | } |
@@ -2864,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode) | |||
2864 | * laptop_mode, not even desirable). However, to do otherwise | 2797 | * laptop_mode, not even desirable). However, to do otherwise |
2865 | * would require replicating code paths in: | 2798 | * would require replicating code paths in: |
2866 | * | 2799 | * |
2867 | * ext4_da_writepages() -> | 2800 | * ext4_writepages() -> |
2868 | * write_cache_pages() ---> (via passed in callback function) | 2801 | * write_cache_pages() ---> (via passed in callback function) |
2869 | * __mpage_da_writepage() --> | 2802 | * __mpage_da_writepage() --> |
2870 | * mpage_add_bh_to_extent() | 2803 | * mpage_add_bh_to_extent() |
@@ -2989,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping, | |||
2989 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); | 2922 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); |
2990 | } | 2923 | } |
2991 | 2924 | ||
2992 | static void ext4_invalidatepage(struct page *page, unsigned long offset) | 2925 | static void ext4_invalidatepage(struct page *page, unsigned int offset, |
2926 | unsigned int length) | ||
2993 | { | 2927 | { |
2994 | trace_ext4_invalidatepage(page, offset); | 2928 | trace_ext4_invalidatepage(page, offset, length); |
2995 | 2929 | ||
2996 | /* No journalling happens on data buffers when this function is used */ | 2930 | /* No journalling happens on data buffers when this function is used */ |
2997 | WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); | 2931 | WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); |
2998 | 2932 | ||
2999 | block_invalidatepage(page, offset); | 2933 | block_invalidatepage(page, offset, length); |
3000 | } | 2934 | } |
3001 | 2935 | ||
3002 | static int __ext4_journalled_invalidatepage(struct page *page, | 2936 | static int __ext4_journalled_invalidatepage(struct page *page, |
3003 | unsigned long offset) | 2937 | unsigned int offset, |
2938 | unsigned int length) | ||
3004 | { | 2939 | { |
3005 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 2940 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
3006 | 2941 | ||
3007 | trace_ext4_journalled_invalidatepage(page, offset); | 2942 | trace_ext4_journalled_invalidatepage(page, offset, length); |
3008 | 2943 | ||
3009 | /* | 2944 | /* |
3010 | * If it's a full truncate we just forget about the pending dirtying | 2945 | * If it's a full truncate we just forget about the pending dirtying |
3011 | */ | 2946 | */ |
3012 | if (offset == 0) | 2947 | if (offset == 0 && length == PAGE_CACHE_SIZE) |
3013 | ClearPageChecked(page); | 2948 | ClearPageChecked(page); |
3014 | 2949 | ||
3015 | return jbd2_journal_invalidatepage(journal, page, offset); | 2950 | return jbd2_journal_invalidatepage(journal, page, offset, length); |
3016 | } | 2951 | } |
3017 | 2952 | ||
3018 | /* Wrapper for aops... */ | 2953 | /* Wrapper for aops... */ |
3019 | static void ext4_journalled_invalidatepage(struct page *page, | 2954 | static void ext4_journalled_invalidatepage(struct page *page, |
3020 | unsigned long offset) | 2955 | unsigned int offset, |
2956 | unsigned int length) | ||
3021 | { | 2957 | { |
3022 | WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); | 2958 | WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); |
3023 | } | 2959 | } |
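With the (offset, length) pair describing the byte range being invalidated inside the page, a full truncate of the page is exactly offset == 0 && length == PAGE_CACHE_SIZE, and only that case may clear the page's checked state. A standalone model, assuming 4K pages:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096u

static bool full_page_invalidate(unsigned offset, unsigned length)
{
	/* Only a whole-page invalidation may forget pending dirtying. */
	return offset == 0 && length == PAGE_SIZE;
}

int main(void)
{
	printf("%d\n", full_page_invalidate(0, PAGE_SIZE)); /* 1: truncate to 0 */
	printf("%d\n", full_page_invalidate(0, 2048));      /* 0: punch hole */
	return 0;
}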
3024 | 2960 | ||
3025 | static int ext4_releasepage(struct page *page, gfp_t wait) | 2961 | static int ext4_releasepage(struct page *page, gfp_t wait) |
@@ -3067,9 +3003,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3067 | struct inode *inode = file_inode(iocb->ki_filp); | 3003 | struct inode *inode = file_inode(iocb->ki_filp); |
3068 | ext4_io_end_t *io_end = iocb->private; | 3004 | ext4_io_end_t *io_end = iocb->private; |
3069 | 3005 | ||
3070 | /* if not async direct IO or dio with 0 bytes write, just return */ | 3006 | /* if not async direct IO, just return */
3071 | if (!io_end || !size) | 3007 | if (!io_end) { |
3072 | goto out; | 3008 | inode_dio_done(inode); |
3009 | if (is_async) | ||
3010 | aio_complete(iocb, ret, 0); | ||
3011 | return; | ||
3012 | } | ||
3073 | 3013 | ||
3074 | ext_debug("ext4_end_io_dio(): io_end 0x%p " | 3014 | ext_debug("ext4_end_io_dio(): io_end 0x%p " |
3075 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", | 3015 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", |
@@ -3077,25 +3017,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3077 | size); | 3017 | size); |
3078 | 3018 | ||
3079 | iocb->private = NULL; | 3019 | iocb->private = NULL; |
3080 | |||
3081 | /* if not aio dio with unwritten extents, just free io and return */ | ||
3082 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
3083 | ext4_free_io_end(io_end); | ||
3084 | out: | ||
3085 | inode_dio_done(inode); | ||
3086 | if (is_async) | ||
3087 | aio_complete(iocb, ret, 0); | ||
3088 | return; | ||
3089 | } | ||
3090 | |||
3091 | io_end->offset = offset; | 3020 | io_end->offset = offset; |
3092 | io_end->size = size; | 3021 | io_end->size = size; |
3093 | if (is_async) { | 3022 | if (is_async) { |
3094 | io_end->iocb = iocb; | 3023 | io_end->iocb = iocb; |
3095 | io_end->result = ret; | 3024 | io_end->result = ret; |
3096 | } | 3025 | } |
3097 | 3026 | ext4_put_io_end_defer(io_end); | |
3098 | ext4_add_complete_io(io_end); | ||
3099 | } | 3027 | } |
3100 | 3028 | ||
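In short, the completion callback now only records the IO result and drops its reference; inode_dio_done() and aio_complete() happen when the last io_end reference is put (see ext4_release_io_end() in fs/ext4/page-io.c further down). A condensed sketch of the new flow:

	io_end->offset = offset;
	io_end->size = size;
	/* the final put either frees the io_end or defers extent conversion */
	ext4_put_io_end_defer(io_end);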
3101 | /* | 3029 | /* |
@@ -3129,6 +3057,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3129 | get_block_t *get_block_func = NULL; | 3057 | get_block_t *get_block_func = NULL; |
3130 | int dio_flags = 0; | 3058 | int dio_flags = 0; |
3131 | loff_t final_size = offset + count; | 3059 | loff_t final_size = offset + count; |
3060 | ext4_io_end_t *io_end = NULL; | ||
3132 | 3061 | ||
3133 | /* Use the old path for reads and writes beyond i_size. */ | 3062 | /* Use the old path for reads and writes beyond i_size. */ |
3134 | if (rw != WRITE || final_size > inode->i_size) | 3063 | if (rw != WRITE || final_size > inode->i_size) |
@@ -3136,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3136 | 3065 | ||
3137 | BUG_ON(iocb->private == NULL); | 3066 | BUG_ON(iocb->private == NULL); |
3138 | 3067 | ||
3068 | /* | ||
3069 | * Make all waiters for direct IO properly wait also for extent | ||
3070 | * conversion. This also disallows race between truncate() and | ||
3071 | * overwrite DIO as i_dio_count needs to be incremented under i_mutex. | ||
3072 | */ | ||
3073 | if (rw == WRITE) | ||
3074 | atomic_inc(&inode->i_dio_count); | ||
3075 | |||
3139 | /* If we do an overwrite dio, i_mutex locking can be released */ | 3076 ||
3140 | overwrite = *((int *)iocb->private); | 3077 | overwrite = *((int *)iocb->private); |
3141 | 3078 | ||
3142 | if (overwrite) { | 3079 | if (overwrite) { |
3143 | atomic_inc(&inode->i_dio_count); | ||
3144 | down_read(&EXT4_I(inode)->i_data_sem); | 3080 | down_read(&EXT4_I(inode)->i_data_sem); |
3145 | mutex_unlock(&inode->i_mutex); | 3081 | mutex_unlock(&inode->i_mutex); |
3146 | } | 3082 | } |
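Since i_dio_count is now taken for every DIO write (not only the overwrite case) and held across extent conversion, waiters get the full guarantee from the existing pairing; a minimal sketch using helpers already shown in this patch:

	/* sketch: block new unlocked DIO, then wait until i_dio_count == 0 */
	ext4_inode_block_unlocked_dio(inode);
	inode_dio_wait(inode);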
@@ -3167,13 +3103,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3167 | iocb->private = NULL; | 3103 | iocb->private = NULL; |
3168 | ext4_inode_aio_set(inode, NULL); | 3104 | ext4_inode_aio_set(inode, NULL); |
3169 | if (!is_sync_kiocb(iocb)) { | 3105 | if (!is_sync_kiocb(iocb)) { |
3170 | ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); | 3106 | io_end = ext4_init_io_end(inode, GFP_NOFS); |
3171 | if (!io_end) { | 3107 | if (!io_end) { |
3172 | ret = -ENOMEM; | 3108 | ret = -ENOMEM; |
3173 | goto retake_lock; | 3109 | goto retake_lock; |
3174 | } | 3110 | } |
3175 | io_end->flag |= EXT4_IO_END_DIRECT; | 3111 | io_end->flag |= EXT4_IO_END_DIRECT; |
3176 | iocb->private = io_end; | 3112 | /* |
3113 | * Grab reference for DIO. Will be dropped in ext4_end_io_dio() | ||
3114 | */ | ||
3115 | iocb->private = ext4_get_io_end(io_end); | ||
3177 | /* | 3116 | /* |
3178 | * we save the io structure for current async direct | 3117 | * we save the io structure for current async direct |
3179 | * IO, so that later ext4_map_blocks() could flag the | 3118 | * IO, so that later ext4_map_blocks() could flag the |
@@ -3197,33 +3136,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3197 | NULL, | 3136 | NULL, |
3198 | dio_flags); | 3137 | dio_flags); |
3199 | 3138 | ||
3200 | if (iocb->private) | ||
3201 | ext4_inode_aio_set(inode, NULL); | ||
3202 | /* | 3139 | /* |
3203 | * The io_end structure takes a reference to the inode, that | 3140 | * Put our reference to io_end. This can free the io_end structure e.g. |
3204 | * structure needs to be destroyed and the reference to the | 3141 | * in the sync IO case or in case of error. It can even perform extent |
3205 | * inode need to be dropped, when IO is complete, even with 0 | 3142 | * conversion if all bios we submitted finished before we got here. |
3206 | * byte write, or failed. | 3143 | * Note that in that case iocb->private may already be set to NULL |
3207 | * | 3144 | * here. |
3208 | * In the successful AIO DIO case, the io_end structure will | ||
3209 | * be destroyed and the reference to the inode will be dropped | ||
3210 | * after the end_io call back function is called. | ||
3211 | * | ||
3212 | * In the case there is 0 byte write, or error case, since VFS | ||
3213 | * direct IO won't invoke the end_io call back function, we | ||
3214 | * need to free the end_io structure here. | ||
3215 | */ | 3145 | */ |
3216 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | 3146 | if (io_end) { |
3217 | ext4_free_io_end(iocb->private); | 3147 | ext4_inode_aio_set(inode, NULL); |
3218 | iocb->private = NULL; | 3148 | ext4_put_io_end(io_end); |
3219 | } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | 3149 | /* |
3150 | * When no IO was submitted ext4_end_io_dio() was not | ||
3151 | * called, so we have to put the iocb's reference. ||
3152 | */ | ||
3153 | if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { | ||
3154 | WARN_ON(iocb->private != io_end); | ||
3155 | WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); | ||
3156 | WARN_ON(io_end->iocb); | ||
3157 | /* | ||
3158 | * Generic code already did inode_dio_done() so we | ||
3159 | * have to clear EXT4_IO_END_DIRECT to not do it for | ||
3160 | * the second time. | ||
3161 | */ | ||
3162 | io_end->flag = 0; | ||
3163 | ext4_put_io_end(io_end); | ||
3164 | iocb->private = NULL; | ||
3165 | } | ||
3166 | } | ||
3167 | if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | ||
3220 | EXT4_STATE_DIO_UNWRITTEN)) { | 3168 | EXT4_STATE_DIO_UNWRITTEN)) { |
3221 | int err; | 3169 | int err; |
3222 | /* | 3170 | /* |
3223 | * for non AIO case, since the IO is already | 3171 | * for non AIO case, since the IO is already |
3224 | * completed, we could do the conversion right here | 3172 | * completed, we could do the conversion right here |
3225 | */ | 3173 | */ |
3226 | err = ext4_convert_unwritten_extents(inode, | 3174 | err = ext4_convert_unwritten_extents(NULL, inode, |
3227 | offset, ret); | 3175 | offset, ret); |
3228 | if (err < 0) | 3176 | if (err < 0) |
3229 | ret = err; | 3177 | ret = err; |
@@ -3231,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3231 | } | 3179 | } |
3232 | 3180 | ||
3233 | retake_lock: | 3181 | retake_lock: |
3182 | if (rw == WRITE) | ||
3183 | inode_dio_done(inode); | ||
3234 | /* take i_mutex locking again if we do an overwrite dio */ | 3184 | /* take i_mutex locking again if we do an overwrite dio */ |
3235 | if (overwrite) { | 3185 | if (overwrite) { |
3236 | inode_dio_done(inode); | ||
3237 | up_read(&EXT4_I(inode)->i_data_sem); | 3186 | up_read(&EXT4_I(inode)->i_data_sem); |
3238 | mutex_lock(&inode->i_mutex); | 3187 | mutex_lock(&inode->i_mutex); |
3239 | } | 3188 | } |
@@ -3292,6 +3241,7 @@ static const struct address_space_operations ext4_aops = { | |||
3292 | .readpage = ext4_readpage, | 3241 | .readpage = ext4_readpage, |
3293 | .readpages = ext4_readpages, | 3242 | .readpages = ext4_readpages, |
3294 | .writepage = ext4_writepage, | 3243 | .writepage = ext4_writepage, |
3244 | .writepages = ext4_writepages, | ||
3295 | .write_begin = ext4_write_begin, | 3245 | .write_begin = ext4_write_begin, |
3296 | .write_end = ext4_write_end, | 3246 | .write_end = ext4_write_end, |
3297 | .bmap = ext4_bmap, | 3247 | .bmap = ext4_bmap, |
@@ -3307,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
3307 | .readpage = ext4_readpage, | 3257 | .readpage = ext4_readpage, |
3308 | .readpages = ext4_readpages, | 3258 | .readpages = ext4_readpages, |
3309 | .writepage = ext4_writepage, | 3259 | .writepage = ext4_writepage, |
3260 | .writepages = ext4_writepages, | ||
3310 | .write_begin = ext4_write_begin, | 3261 | .write_begin = ext4_write_begin, |
3311 | .write_end = ext4_journalled_write_end, | 3262 | .write_end = ext4_journalled_write_end, |
3312 | .set_page_dirty = ext4_journalled_set_page_dirty, | 3263 | .set_page_dirty = ext4_journalled_set_page_dirty, |
@@ -3322,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = { | |||
3322 | .readpage = ext4_readpage, | 3273 | .readpage = ext4_readpage, |
3323 | .readpages = ext4_readpages, | 3274 | .readpages = ext4_readpages, |
3324 | .writepage = ext4_writepage, | 3275 | .writepage = ext4_writepage, |
3325 | .writepages = ext4_da_writepages, | 3276 | .writepages = ext4_writepages, |
3326 | .write_begin = ext4_da_write_begin, | 3277 | .write_begin = ext4_da_write_begin, |
3327 | .write_end = ext4_da_write_end, | 3278 | .write_end = ext4_da_write_end, |
3328 | .bmap = ext4_bmap, | 3279 | .bmap = ext4_bmap, |
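All three aops tables now funnel writeback through a single ext4_writepages() entry point. Purely as an illustration of the ->writepages contract (this is not ext4's implementation), a trivial method can defer to the generic walker:

	static int example_writepages(struct address_space *mapping,
				      struct writeback_control *wbc)
	{
		/* walks dirty pages in wbc's range, calling ->writepage on each */
		return generic_writepages(mapping, wbc);
	}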
@@ -3355,89 +3306,56 @@ void ext4_set_aops(struct inode *inode) | |||
3355 | inode->i_mapping->a_ops = &ext4_aops; | 3306 | inode->i_mapping->a_ops = &ext4_aops; |
3356 | } | 3307 | } |
3357 | 3308 | ||
3358 | |||
3359 | /* | 3309 | /* |
3360 | * ext4_discard_partial_page_buffers() | 3310 | * ext4_block_truncate_page() zeroes out a mapping from file offset `from' |
3361 | * Wrapper function for ext4_discard_partial_page_buffers_no_lock. | 3311 | * up to the end of the block which corresponds to `from'. |
3362 | * This function finds and locks the page containing the offset | 3312 | * This is required during truncate. We need to physically zero the tail end |
3363 | * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. | 3313 | * of that block so it doesn't yield old data if the file is later grown. |
3364 | * Calling functions that already have the page locked should call | ||
3365 | * ext4_discard_partial_page_buffers_no_lock directly. | ||
3366 | */ | 3314 | */ |
3367 | int ext4_discard_partial_page_buffers(handle_t *handle, | 3315 | int ext4_block_truncate_page(handle_t *handle, |
3368 | struct address_space *mapping, loff_t from, | 3316 | struct address_space *mapping, loff_t from) |
3369 | loff_t length, int flags) | ||
3370 | { | 3317 | { |
3318 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | ||
3319 | unsigned length; | ||
3320 | unsigned blocksize; | ||
3371 | struct inode *inode = mapping->host; | 3321 | struct inode *inode = mapping->host; |
3372 | struct page *page; | ||
3373 | int err = 0; | ||
3374 | 3322 | ||
3375 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, | 3323 | blocksize = inode->i_sb->s_blocksize; |
3376 | mapping_gfp_mask(mapping) & ~__GFP_FS); | 3324 | length = blocksize - (offset & (blocksize - 1)); |
3377 | if (!page) | ||
3378 | return -ENOMEM; | ||
3379 | |||
3380 | err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, | ||
3381 | from, length, flags); | ||
3382 | 3325 | ||
3383 | unlock_page(page); | 3326 | return ext4_block_zero_page_range(handle, mapping, from, length); |
3384 | page_cache_release(page); | ||
3385 | return err; | ||
3386 | } | 3327 | } |
3387 | 3328 | ||
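A worked example of the arithmetic above, with assumed sizes:

	/*
	 * Assuming 1k blocks and 4k pages, from = 10000 gives
	 *   offset = 10000 & 4095         = 1808
	 *   length = 1024 - (1808 & 1023) = 240
	 * so bytes 10000..10239 are zeroed -- exactly to the end of the
	 * 1k block containing 'from', not to the end of the page.
	 */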
3388 | /* | 3329 | /* |
3389 | * ext4_discard_partial_page_buffers_no_lock() | 3330 | * ext4_block_zero_page_range() zeros out a mapping of length 'length' |
3390 | * Zeros a page range of length 'length' starting from offset 'from'. | 3331 | * starting from file offset 'from'. The range to be zeroed must |
3391 | * Buffer heads that correspond to the block aligned regions of the | 3332 | * be contained within one block. If the specified range exceeds |
3392 | * zeroed range will be unmapped. Unblock aligned regions | 3333 | * the end of the block it will be shortened to the end of the block |
3393 | * will have the corresponding buffer head mapped if needed so that | 3334 | * that corresponds to 'from' |
3394 | * that region of the page can be updated with the partial zero out. | ||
3395 | * | ||
3396 | * This function assumes that the page has already been locked. The | ||
3397 | * The range to be discarded must be contained with in the given page. | ||
3398 | * If the specified range exceeds the end of the page it will be shortened | ||
3399 | * to the end of the page that corresponds to 'from'. This function is | ||
3400 | * appropriate for updating a page and it buffer heads to be unmapped and | ||
3401 | * zeroed for blocks that have been either released, or are going to be | ||
3402 | * released. | ||
3403 | * | ||
3404 | * handle: The journal handle | ||
3405 | * inode: The files inode | ||
3406 | * page: A locked page that contains the offset "from" | ||
3407 | * from: The starting byte offset (from the beginning of the file) | ||
3408 | * to begin discarding | ||
3409 | * len: The length of bytes to discard | ||
3410 | * flags: Optional flags that may be used: | ||
3411 | * | ||
3412 | * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED | ||
3413 | * Only zero the regions of the page whose buffer heads | ||
3414 | * have already been unmapped. This flag is appropriate | ||
3415 | * for updating the contents of a page whose blocks may | ||
3416 | * have already been released, and we only want to zero | ||
3417 | * out the regions that correspond to those released blocks. | ||
3418 | * | ||
3419 | * Returns zero on success or negative on failure. | ||
3420 | */ | 3335 | */ |
3421 | static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | 3336 | int ext4_block_zero_page_range(handle_t *handle, |
3422 | struct inode *inode, struct page *page, loff_t from, | 3337 | struct address_space *mapping, loff_t from, loff_t length) |
3423 | loff_t length, int flags) | ||
3424 | { | 3338 | { |
3425 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; | 3339 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; |
3426 | unsigned int offset = from & (PAGE_CACHE_SIZE-1); | 3340 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
3427 | unsigned int blocksize, max, pos; | 3341 | unsigned blocksize, max, pos; |
3428 | ext4_lblk_t iblock; | 3342 | ext4_lblk_t iblock; |
3343 | struct inode *inode = mapping->host; | ||
3429 | struct buffer_head *bh; | 3344 | struct buffer_head *bh; |
3345 | struct page *page; | ||
3430 | int err = 0; | 3346 | int err = 0; |
3431 | 3347 | ||
3432 | blocksize = inode->i_sb->s_blocksize; | 3348 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, |
3433 | max = PAGE_CACHE_SIZE - offset; | 3349 | mapping_gfp_mask(mapping) & ~__GFP_FS); |
3350 | if (!page) | ||
3351 | return -ENOMEM; | ||
3434 | 3352 | ||
3435 | if (index != page->index) | 3353 | blocksize = inode->i_sb->s_blocksize; |
3436 | return -EINVAL; | 3354 | max = blocksize - (offset & (blocksize - 1)); |
3437 | 3355 | ||
3438 | /* | 3356 | /* |
3439 | * correct length if it does not fall between | 3357 | * correct length if it does not fall between |
3440 | * 'from' and the end of the page | 3358 | * 'from' and the end of the block |
3441 | */ | 3359 | */ |
3442 | if (length > max || length < 0) | 3360 | if (length > max || length < 0) |
3443 | length = max; | 3361 | length = max; |
@@ -3455,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | |||
3455 | iblock++; | 3373 | iblock++; |
3456 | pos += blocksize; | 3374 | pos += blocksize; |
3457 | } | 3375 | } |
3458 | 3376 | if (buffer_freed(bh)) { | |
3459 | pos = offset; | 3377 | BUFFER_TRACE(bh, "freed: skip"); |
3460 | while (pos < offset + length) { | 3378 | goto unlock; |
3461 | unsigned int end_of_block, range_to_discard; | 3379 | } |
3462 | 3380 | if (!buffer_mapped(bh)) { | |
3463 | err = 0; | 3381 | BUFFER_TRACE(bh, "unmapped"); |
3464 | 3382 | ext4_get_block(inode, iblock, bh, 0); | |
3465 | /* The length of space left to zero and unmap */ | 3383 | /* unmapped? It's a hole - nothing to do */ |
3466 | range_to_discard = offset + length - pos; | ||
3467 | |||
3468 | /* The length of space until the end of the block */ | ||
3469 | end_of_block = blocksize - (pos & (blocksize-1)); | ||
3470 | |||
3471 | /* | ||
3472 | * Do not unmap or zero past end of block | ||
3473 | * for this buffer head | ||
3474 | */ | ||
3475 | if (range_to_discard > end_of_block) | ||
3476 | range_to_discard = end_of_block; | ||
3477 | |||
3478 | |||
3479 | /* | ||
3480 | * Skip this buffer head if we are only zeroing unampped | ||
3481 | * regions of the page | ||
3482 | */ | ||
3483 | if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && | ||
3484 | buffer_mapped(bh)) | ||
3485 | goto next; | ||
3486 | |||
3487 | /* If the range is block aligned, unmap */ | ||
3488 | if (range_to_discard == blocksize) { | ||
3489 | clear_buffer_dirty(bh); | ||
3490 | bh->b_bdev = NULL; | ||
3491 | clear_buffer_mapped(bh); | ||
3492 | clear_buffer_req(bh); | ||
3493 | clear_buffer_new(bh); | ||
3494 | clear_buffer_delay(bh); | ||
3495 | clear_buffer_unwritten(bh); | ||
3496 | clear_buffer_uptodate(bh); | ||
3497 | zero_user(page, pos, range_to_discard); | ||
3498 | BUFFER_TRACE(bh, "Buffer discarded"); | ||
3499 | goto next; | ||
3500 | } | ||
3501 | |||
3502 | /* | ||
3503 | * If this block is not completely contained in the range | ||
3504 | * to be discarded, then it is not going to be released. Because | ||
3505 | * we need to keep this block, we need to make sure this part | ||
3506 | * of the page is uptodate before we modify it by writeing | ||
3507 | * partial zeros on it. | ||
3508 | */ | ||
3509 | if (!buffer_mapped(bh)) { | 3384 | if (!buffer_mapped(bh)) { |
3510 | /* | 3385 | BUFFER_TRACE(bh, "still unmapped"); |
3511 | * Buffer head must be mapped before we can read | 3386 | goto unlock; |
3512 | * from the block | ||
3513 | */ | ||
3514 | BUFFER_TRACE(bh, "unmapped"); | ||
3515 | ext4_get_block(inode, iblock, bh, 0); | ||
3516 | /* unmapped? It's a hole - nothing to do */ | ||
3517 | if (!buffer_mapped(bh)) { | ||
3518 | BUFFER_TRACE(bh, "still unmapped"); | ||
3519 | goto next; | ||
3520 | } | ||
3521 | } | 3387 | } |
3388 | } | ||
3522 | 3389 | ||
3523 | /* Ok, it's mapped. Make sure it's up-to-date */ | 3390 | /* Ok, it's mapped. Make sure it's up-to-date */ |
3524 | if (PageUptodate(page)) | 3391 | if (PageUptodate(page)) |
3525 | set_buffer_uptodate(bh); | 3392 | set_buffer_uptodate(bh); |
3526 | 3393 | ||
3527 | if (!buffer_uptodate(bh)) { | 3394 | if (!buffer_uptodate(bh)) { |
3528 | err = -EIO; | 3395 | err = -EIO; |
3529 | ll_rw_block(READ, 1, &bh); | 3396 | ll_rw_block(READ, 1, &bh); |
3530 | wait_on_buffer(bh); | 3397 | wait_on_buffer(bh); |
3531 | /* Uhhuh. Read error. Complain and punt.*/ | 3398 | /* Uhhuh. Read error. Complain and punt. */ |
3532 | if (!buffer_uptodate(bh)) | 3399 | if (!buffer_uptodate(bh)) |
3533 | goto next; | 3400 | goto unlock; |
3534 | } | 3401 | } |
3402 | if (ext4_should_journal_data(inode)) { | ||
3403 | BUFFER_TRACE(bh, "get write access"); | ||
3404 | err = ext4_journal_get_write_access(handle, bh); | ||
3405 | if (err) | ||
3406 | goto unlock; | ||
3407 | } | ||
3408 | zero_user(page, offset, length); | ||
3409 | BUFFER_TRACE(bh, "zeroed end of block"); | ||
3535 | 3410 | ||
3536 | if (ext4_should_journal_data(inode)) { | 3411 | if (ext4_should_journal_data(inode)) { |
3537 | BUFFER_TRACE(bh, "get write access"); | 3412 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
3538 | err = ext4_journal_get_write_access(handle, bh); | 3413 | } else { |
3539 | if (err) | 3414 | err = 0; |
3540 | goto next; | 3415 | mark_buffer_dirty(bh); |
3541 | } | 3416 | if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) |
3417 | err = ext4_jbd2_file_inode(handle, inode); | ||
3418 | } | ||
3419 | |||
3420 | unlock: | ||
3421 | unlock_page(page); | ||
3422 | page_cache_release(page); | ||
3423 | return err; | ||
3424 | } | ||
3542 | 3425 | ||
3543 | zero_user(page, pos, range_to_discard); | 3426 | int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, |
3427 | loff_t lstart, loff_t length) | ||
3428 | { | ||
3429 | struct super_block *sb = inode->i_sb; | ||
3430 | struct address_space *mapping = inode->i_mapping; | ||
3431 | unsigned partial_start, partial_end; | ||
3432 | ext4_fsblk_t start, end; | ||
3433 | loff_t byte_end = (lstart + length - 1); | ||
3434 | int err = 0; | ||
3544 | 3435 | ||
3545 | err = 0; | 3436 | partial_start = lstart & (sb->s_blocksize - 1); |
3546 | if (ext4_should_journal_data(inode)) { | 3437 | partial_end = byte_end & (sb->s_blocksize - 1); |
3547 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
3548 | } else | ||
3549 | mark_buffer_dirty(bh); | ||
3550 | 3438 | ||
3551 | BUFFER_TRACE(bh, "Partial buffer zeroed"); | 3439 | start = lstart >> sb->s_blocksize_bits; |
3552 | next: | 3440 | end = byte_end >> sb->s_blocksize_bits; |
3553 | bh = bh->b_this_page; | ||
3554 | iblock++; | ||
3555 | pos += range_to_discard; | ||
3556 | } | ||
3557 | 3441 | ||
3442 | /* Handle partial zero within the single block */ | ||
3443 | if (start == end && | ||
3444 | (partial_start || (partial_end != sb->s_blocksize - 1))) { | ||
3445 | err = ext4_block_zero_page_range(handle, mapping, | ||
3446 | lstart, length); | ||
3447 | return err; | ||
3448 | } | ||
3449 | /* Handle partial zero out on the start of the range */ | ||
3450 | if (partial_start) { | ||
3451 | err = ext4_block_zero_page_range(handle, mapping, | ||
3452 | lstart, sb->s_blocksize); | ||
3453 | if (err) | ||
3454 | return err; | ||
3455 | } | ||
3456 | /* Handle partial zero out on the end of the range */ | ||
3457 | if (partial_end != sb->s_blocksize - 1) | ||
3458 | err = ext4_block_zero_page_range(handle, mapping, | ||
3459 | byte_end - partial_end, | ||
3460 | partial_end + 1); | ||
3558 | return err; | 3461 | return err; |
3559 | } | 3462 | } |
3560 | 3463 | ||
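A worked example of the splitting logic, with assumed sizes:

	/*
	 * Assuming 4k blocks, lstart = 1000 and length = 9000 give
	 *   byte_end      = 9999
	 *   partial_start = 1000 & 4095 = 1000
	 *   partial_end   = 9999 & 4095 = 1807
	 *   start = 0, end = 2 (different blocks)
	 * so the head (bytes 1000..4095) and tail (bytes 8192..9999) are
	 * zeroed via ext4_block_zero_page_range(); the fully covered middle
	 * block (4096..8191) is left to the block-freeing code.
	 */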
@@ -3580,14 +3483,12 @@ int ext4_can_truncate(struct inode *inode) | |||
3580 | * Returns: 0 on success or negative on failure | 3483 | * Returns: 0 on success or negative on failure |
3581 | */ | 3484 | */ |
3582 | 3485 | ||
3583 | int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | 3486 | int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) |
3584 | { | 3487 | { |
3585 | struct inode *inode = file_inode(file); | ||
3586 | struct super_block *sb = inode->i_sb; | 3488 | struct super_block *sb = inode->i_sb; |
3587 | ext4_lblk_t first_block, stop_block; | 3489 | ext4_lblk_t first_block, stop_block; |
3588 | struct address_space *mapping = inode->i_mapping; | 3490 | struct address_space *mapping = inode->i_mapping; |
3589 | loff_t first_page, last_page, page_len; | 3491 | loff_t first_block_offset, last_block_offset; |
3590 | loff_t first_page_offset, last_page_offset; | ||
3591 | handle_t *handle; | 3492 | handle_t *handle; |
3592 | unsigned int credits; | 3493 | unsigned int credits; |
3593 | int ret = 0; | 3494 | int ret = 0; |
@@ -3638,23 +3539,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
3638 | offset; | 3539 | offset; |
3639 | } | 3540 | } |
3640 | 3541 | ||
3641 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 3542 | first_block_offset = round_up(offset, sb->s_blocksize); |
3642 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | 3543 | last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; |
3643 | 3544 | ||
3644 | first_page_offset = first_page << PAGE_CACHE_SHIFT; | 3545 | /* Now release the pages and zero block aligned part of pages*/ |
3645 | last_page_offset = last_page << PAGE_CACHE_SHIFT; | 3546 | if (last_block_offset > first_block_offset) |
3646 | 3547 | truncate_pagecache_range(inode, first_block_offset, | |
3647 | /* Now release the pages */ | 3548 | last_block_offset); |
3648 | if (last_page_offset > first_page_offset) { | ||
3649 | truncate_pagecache_range(inode, first_page_offset, | ||
3650 | last_page_offset - 1); | ||
3651 | } | ||
3652 | 3549 | ||
3653 | /* Wait for all existing dio workers, newcomers will block on i_mutex */ | 3550 |
3654 | ext4_inode_block_unlocked_dio(inode); | 3551 | ext4_inode_block_unlocked_dio(inode); |
3655 | ret = ext4_flush_unwritten_io(inode); | ||
3656 | if (ret) | ||
3657 | goto out_dio; | ||
3658 | inode_dio_wait(inode); | 3552 | inode_dio_wait(inode); |
3659 | 3553 | ||
3660 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3554 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
@@ -3668,66 +3562,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
3668 | goto out_dio; | 3562 | goto out_dio; |
3669 | } | 3563 | } |
3670 | 3564 | ||
3671 | /* | 3565 | ret = ext4_zero_partial_blocks(handle, inode, offset, |
3672 | * Now we need to zero out the non-page-aligned data in the | 3566 | length); |
3673 | * pages at the start and tail of the hole, and unmap the | 3567 | if (ret) |
3674 | * buffer heads for the block aligned regions of the page that | 3568 | goto out_stop; |
3675 | * were completely zeroed. | ||
3676 | */ | ||
3677 | if (first_page > last_page) { | ||
3678 | /* | ||
3679 | * If the file space being truncated is contained | ||
3680 | * within a page just zero out and unmap the middle of | ||
3681 | * that page | ||
3682 | */ | ||
3683 | ret = ext4_discard_partial_page_buffers(handle, | ||
3684 | mapping, offset, length, 0); | ||
3685 | |||
3686 | if (ret) | ||
3687 | goto out_stop; | ||
3688 | } else { | ||
3689 | /* | ||
3690 | * zero out and unmap the partial page that contains | ||
3691 | * the start of the hole | ||
3692 | */ | ||
3693 | page_len = first_page_offset - offset; | ||
3694 | if (page_len > 0) { | ||
3695 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
3696 | offset, page_len, 0); | ||
3697 | if (ret) | ||
3698 | goto out_stop; | ||
3699 | } | ||
3700 | |||
3701 | /* | ||
3702 | * zero out and unmap the partial page that contains | ||
3703 | * the end of the hole | ||
3704 | */ | ||
3705 | page_len = offset + length - last_page_offset; | ||
3706 | if (page_len > 0) { | ||
3707 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
3708 | last_page_offset, page_len, 0); | ||
3709 | if (ret) | ||
3710 | goto out_stop; | ||
3711 | } | ||
3712 | } | ||
3713 | |||
3714 | /* | ||
3715 | * If i_size is contained in the last page, we need to | ||
3716 | * unmap and zero the partial page after i_size | ||
3717 | */ | ||
3718 | if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && | ||
3719 | inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
3720 | page_len = PAGE_CACHE_SIZE - | ||
3721 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
3722 | |||
3723 | if (page_len > 0) { | ||
3724 | ret = ext4_discard_partial_page_buffers(handle, | ||
3725 | mapping, inode->i_size, page_len, 0); | ||
3726 | |||
3727 | if (ret) | ||
3728 | goto out_stop; | ||
3729 | } | ||
3730 | } | ||
3731 | 3569 | ||
3732 | first_block = (offset + sb->s_blocksize - 1) >> | 3570 | first_block = (offset + sb->s_blocksize - 1) >> |
3733 | EXT4_BLOCK_SIZE_BITS(sb); | 3571 | EXT4_BLOCK_SIZE_BITS(sb); |
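Continuing the same assumed example, the rounding above pairs with ext4_zero_partial_blocks():

	/*
	 * With 4k blocks, offset = 1000 and length = 9000:
	 *   first_block_offset = round_up(1000, 4096)        = 4096
	 *   last_block_offset  = round_down(10000, 4096) - 1 = 8191
	 * so only the fully covered range 4096..8191 is dropped from the
	 * page cache; the partial head and tail blocks are zeroed by
	 * ext4_zero_partial_blocks() instead of the old page-granular
	 * ext4_discard_partial_page_buffers() logic.
	 */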
@@ -3803,7 +3641,6 @@ void ext4_truncate(struct inode *inode) | |||
3803 | unsigned int credits; | 3641 | unsigned int credits; |
3804 | handle_t *handle; | 3642 | handle_t *handle; |
3805 | struct address_space *mapping = inode->i_mapping; | 3643 | struct address_space *mapping = inode->i_mapping; |
3806 | loff_t page_len; | ||
3807 | 3644 | ||
3808 | /* | 3645 | /* |
3809 | * There is a possibility that we're either freeing the inode | 3646 | * There is a possibility that we're either freeing the inode |
@@ -3830,12 +3667,6 @@ void ext4_truncate(struct inode *inode) | |||
3830 | return; | 3667 | return; |
3831 | } | 3668 | } |
3832 | 3669 | ||
3833 | /* | ||
3834 | * finish any pending end_io work so we won't run the risk of | ||
3835 | * converting any truncated blocks to initialized later | ||
3836 | */ | ||
3837 | ext4_flush_unwritten_io(inode); | ||
3838 | |||
3839 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3670 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3840 | credits = ext4_writepage_trans_blocks(inode); | 3671 | credits = ext4_writepage_trans_blocks(inode); |
3841 | else | 3672 | else |
@@ -3847,14 +3678,8 @@ void ext4_truncate(struct inode *inode) | |||
3847 | return; | 3678 | return; |
3848 | } | 3679 | } |
3849 | 3680 | ||
3850 | if (inode->i_size % PAGE_CACHE_SIZE != 0) { | 3681 | if (inode->i_size & (inode->i_sb->s_blocksize - 1)) |
3851 | page_len = PAGE_CACHE_SIZE - | 3682 | ext4_block_truncate_page(handle, mapping, inode->i_size); |
3852 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
3853 | |||
3854 | if (ext4_discard_partial_page_buffers(handle, | ||
3855 | mapping, inode->i_size, page_len, 0)) | ||
3856 | goto out_stop; | ||
3857 | } | ||
3858 | 3683 | ||
3859 | /* | 3684 | /* |
3860 | * We add the inode to the orphan list, so that if this | 3685 | * We add the inode to the orphan list, so that if this |
@@ -4623,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) | |||
4623 | inode->i_size >> PAGE_CACHE_SHIFT); | 4448 | inode->i_size >> PAGE_CACHE_SHIFT); |
4624 | if (!page) | 4449 | if (!page) |
4625 | return; | 4450 | return; |
4626 | ret = __ext4_journalled_invalidatepage(page, offset); | 4451 | ret = __ext4_journalled_invalidatepage(page, offset, |
4452 | PAGE_CACHE_SIZE - offset); | ||
4627 | unlock_page(page); | 4453 | unlock_page(page); |
4628 | page_cache_release(page); | 4454 | page_cache_release(page); |
4629 | if (ret != -EBUSY) | 4455 | if (ret != -EBUSY) |
@@ -4805,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
4805 | struct kstat *stat) | 4631 | struct kstat *stat) |
4806 | { | 4632 | { |
4807 | struct inode *inode; | 4633 | struct inode *inode; |
4808 | unsigned long delalloc_blocks; | 4634 | unsigned long long delalloc_blocks; |
4809 | 4635 | ||
4810 | inode = dentry->d_inode; | 4636 | inode = dentry->d_inode; |
4811 | generic_fillattr(inode, stat); | 4637 | generic_fillattr(inode, stat); |
@@ -4823,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
4823 | delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), | 4649 | delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), |
4824 | EXT4_I(inode)->i_reserved_data_blocks); | 4650 | EXT4_I(inode)->i_reserved_data_blocks); |
4825 | 4651 | ||
4826 | stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; | 4652 | stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9); |
4827 | return 0; | 4653 | return 0; |
4828 | } | 4654 | } |
4829 | 4655 | ||
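The widened type matters on 32-bit hosts; a sketch of the overflow the combined shift avoids (numbers assumed):

	/*
	 * With a 32-bit unsigned long and 4k blocks, 8 GiB of delalloc is
	 *   delalloc_blocks       = 0x200000
	 *   delalloc_blocks << 12 = 0x200000000  -- truncated to 32 bits
	 * Using unsigned long long and shifting once by
	 * (s_blocksize_bits - 9) converts filesystem blocks directly to
	 * 512-byte sectors with no intermediate overflow.
	 */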
4830 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 4656 | static int ext4_index_trans_blocks(struct inode *inode, int lblocks, |
4657 | int pextents) | ||
4831 | { | 4658 | { |
4832 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 4659 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
4833 | return ext4_ind_trans_blocks(inode, nrblocks, chunk); | 4660 | return ext4_ind_trans_blocks(inode, lblocks); |
4834 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); | 4661 | return ext4_ext_index_trans_blocks(inode, pextents); |
4835 | } | 4662 | } |
4836 | 4663 | ||
4837 | /* | 4664 | /* |
@@ -4845,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
4845 | * | 4672 | * |
4846 | * Also account for superblock, inode, quota and xattr blocks | 4673 | * Also account for superblock, inode, quota and xattr blocks |
4847 | */ | 4674 | */ |
4848 | static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 4675 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, |
4676 | int pextents) | ||
4849 | { | 4677 | { |
4850 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); | 4678 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); |
4851 | int gdpblocks; | 4679 | int gdpblocks; |
@@ -4853,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
4853 | int ret = 0; | 4681 | int ret = 0; |
4854 | 4682 | ||
4855 | /* | 4683 | /* |
4856 | * How many index blocks need to touch to modify nrblocks? | 4684 | * How many index blocks do we need to touch to map @lblocks logical blocks |
4857 | * The "Chunk" flag indicating whether the nrblocks is | 4685 | * to @pextents physical extents? |
4858 | * physically contiguous on disk | ||
4859 | * | ||
4860 | * For Direct IO and fallocate, they calls get_block to allocate | ||
4861 | * one single extent at a time, so they could set the "Chunk" flag | ||
4862 | */ | 4686 | */ |
4863 | idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); | 4687 | idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); |
4864 | 4688 | ||
4865 | ret = idxblocks; | 4689 | ret = idxblocks; |
4866 | 4690 | ||
@@ -4868,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
4868 | * Now let's see how many group bitmaps and group descriptors need | 4692 | * Now let's see how many group bitmaps and group descriptors need |
4869 | * to account | 4693 | * to account |
4870 | */ | 4694 | */ |
4871 | groups = idxblocks; | 4695 | groups = idxblocks + pextents; |
4872 | if (chunk) | ||
4873 | groups += 1; | ||
4874 | else | ||
4875 | groups += nrblocks; | ||
4876 | |||
4877 | gdpblocks = groups; | 4696 | gdpblocks = groups; |
4878 | if (groups > ngroups) | 4697 | if (groups > ngroups) |
4879 | groups = ngroups; | 4698 | groups = ngroups; |
@@ -4904,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) | |||
4904 | int bpp = ext4_journal_blocks_per_page(inode); | 4723 | int bpp = ext4_journal_blocks_per_page(inode); |
4905 | int ret; | 4724 | int ret; |
4906 | 4725 | ||
4907 | ret = ext4_meta_trans_blocks(inode, bpp, 0); | 4726 | ret = ext4_meta_trans_blocks(inode, bpp, bpp); |
4908 | 4727 | ||
4909 | /* Account for data blocks for journalled mode */ | 4728 | /* Account for data blocks for journalled mode */ |
4910 | if (ext4_should_journal_data(inode)) | 4729 | if (ext4_should_journal_data(inode)) |
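A worked example of the new accounting, under assumed parameters:

	/*
	 * For an extent-mapped inode with 4k blocks and 4k pages,
	 * bpp == 1, so ext4_writepage_trans_blocks() now computes
	 *   ext4_meta_trans_blocks(inode, 1, 1)
	 * where idxblocks = ext4_ext_index_trans_blocks(inode, 1) and
	 * groups = idxblocks + 1 (one physical extent touches one bitmap
	 * and one group descriptor). The old "chunk" flag guessed between
	 * +1 and +nrblocks; callers now state @pextents explicitly.
	 */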
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index def84082a9a9..a9ff5e5137ca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -2105,6 +2105,7 @@ repeat: | |||
2105 | group = ac->ac_g_ex.fe_group; | 2105 | group = ac->ac_g_ex.fe_group; |
2106 | 2106 | ||
2107 | for (i = 0; i < ngroups; group++, i++) { | 2107 | for (i = 0; i < ngroups; group++, i++) { |
2108 | cond_resched(); | ||
2108 | /* | 2109 | /* |
2109 | * Artificially restricted ngroups for non-extent | 2110 | * Artificially restricted ngroups for non-extent |
2110 | * files makes group > ngroups possible on first loop. | 2111 | * files makes group > ngroups possible on first loop. |
@@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
4405 | repeat: | 4406 | repeat: |
4406 | /* allocate space in core */ | 4407 | /* allocate space in core */ |
4407 | *errp = ext4_mb_regular_allocator(ac); | 4408 | *errp = ext4_mb_regular_allocator(ac); |
4408 | if (*errp) { | 4409 | if (*errp) |
4409 | ext4_discard_allocated_blocks(ac); | 4410 | goto discard_and_exit; |
4410 | goto errout; | ||
4411 | } | ||
4412 | 4411 | ||
4413 | /* as we've just preallocated more space than | 4412 | /* as we've just preallocated more space than |
4414 | * user requested orinally, we store allocated | 4413 | * user requested originally, we store allocated |
4415 | * space in a special descriptor */ | 4414 | * space in a special descriptor */ |
4416 | if (ac->ac_status == AC_STATUS_FOUND && | 4415 | if (ac->ac_status == AC_STATUS_FOUND && |
4417 | ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) | 4416 | ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) |
4418 | ext4_mb_new_preallocation(ac); | 4417 | *errp = ext4_mb_new_preallocation(ac); |
4418 | if (*errp) { | ||
4419 | discard_and_exit: | ||
4420 | ext4_discard_allocated_blocks(ac); | ||
4421 | goto errout; | ||
4422 | } | ||
4419 | } | 4423 | } |
4420 | if (likely(ac->ac_status == AC_STATUS_FOUND)) { | 4424 | if (likely(ac->ac_status == AC_STATUS_FOUND)) { |
4421 | *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); | 4425 | *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); |
@@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, | |||
4612 | BUG_ON(bh && (count > 1)); | 4616 | BUG_ON(bh && (count > 1)); |
4613 | 4617 | ||
4614 | for (i = 0; i < count; i++) { | 4618 | for (i = 0; i < count; i++) { |
4619 | cond_resched(); | ||
4615 | if (!bh) | 4620 | if (!bh) |
4616 | tbh = sb_find_get_block(inode->i_sb, | 4621 | tbh = sb_find_get_block(inode->i_sb, |
4617 | block + i); | 4622 | block + i); |
4618 | if (unlikely(!tbh)) | 4623 | if (!tbh) |
4619 | continue; | 4624 | continue; |
4620 | ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, | 4625 | ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, |
4621 | inode, tbh, block + i); | 4626 | inode, tbh, block + i); |
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 3dcbf364022f..e86dddbd8296 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
912 | struct page *pagep[2] = {NULL, NULL}; | 912 | struct page *pagep[2] = {NULL, NULL}; |
913 | handle_t *handle; | 913 | handle_t *handle; |
914 | ext4_lblk_t orig_blk_offset; | 914 | ext4_lblk_t orig_blk_offset; |
915 | long long offs = orig_page_offset << PAGE_CACHE_SHIFT; | ||
916 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; | 915 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; |
917 | unsigned int w_flags = 0; | 916 | unsigned int w_flags = 0; |
918 | unsigned int tmp_data_size, data_size, replaced_size; | 917 | unsigned int tmp_data_size, data_size, replaced_size; |
@@ -940,8 +939,6 @@ again: | |||
940 | orig_blk_offset = orig_page_offset * blocks_per_page + | 939 | orig_blk_offset = orig_page_offset * blocks_per_page + |
941 | data_offset_in_page; | 940 | data_offset_in_page; |
942 | 941 | ||
943 | offs = (long long)orig_blk_offset << orig_inode->i_blkbits; | ||
944 | |||
945 | /* Calculate data_size */ | 942 | /* Calculate data_size */ |
946 | if ((orig_blk_offset + block_len_in_page - 1) == | 943 | if ((orig_blk_offset + block_len_in_page - 1) == |
947 | ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { | 944 | ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6653fc35ecb7..ab2f6dc44b3a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, | |||
918 | bh->b_data, bh->b_size, | 918 | bh->b_data, bh->b_size, |
919 | (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) | 919 | (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) |
920 | + ((char *)de - bh->b_data))) { | 920 | + ((char *)de - bh->b_data))) { |
921 | /* On error, skip the f_pos to the next block. */ | 921 | /* silently ignore the rest of the block */ |
922 | dir_file->f_pos = (dir_file->f_pos | | 922 | break; |
923 | (dir->i_sb->s_blocksize - 1)) + 1; | ||
924 | brelse(bh); | ||
925 | return count; | ||
926 | } | 923 | } |
927 | ext4fs_dirhash(de->name, de->name_len, hinfo); | 924 | ext4fs_dirhash(de->name, de->name_len, hinfo); |
928 | if ((hinfo->hash < start_hash) || | 925 | if ((hinfo->hash < start_hash) || |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4acf1f78881b..48786cdb5e6c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -46,46 +46,121 @@ void ext4_exit_pageio(void) | |||
46 | } | 46 | } |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * This function is called by ext4_evict_inode() to make sure there is | 49 | * Print a buffer I/O error compatible with fs/buffer.c. This |
50 | * no more pending I/O completion work left to do. | 50 | * provides compatibility with dmesg scrapers that look for a specific |
51 | * buffer I/O error message. We really need a unified error reporting | ||
52 | * structure to userspace ala Digital Unix's uerf system, but it's | ||
53 | * probably not going to happen in my lifetime, due to LKML politics... | ||
51 | */ | 54 | */ |
52 | void ext4_ioend_shutdown(struct inode *inode) | 55 | static void buffer_io_error(struct buffer_head *bh) |
53 | { | 56 | { |
54 | wait_queue_head_t *wq = ext4_ioend_wq(inode); | 57 | char b[BDEVNAME_SIZE]; |
58 | printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", | ||
59 | bdevname(bh->b_bdev, b), | ||
60 | (unsigned long long)bh->b_blocknr); | ||
61 | } | ||
55 | 62 | ||
56 | wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); | 63 | static void ext4_finish_bio(struct bio *bio) |
57 | /* | 64 | { |
58 | * We need to make sure the work structure is finished being | 65 | int i; |
59 | * used before we let the inode get destroyed. | 66 | int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); |
60 | */ | 67 | |
61 | if (work_pending(&EXT4_I(inode)->i_unwritten_work)) | 68 | for (i = 0; i < bio->bi_vcnt; i++) { |
62 | cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); | 69 | struct bio_vec *bvec = &bio->bi_io_vec[i]; |
70 | struct page *page = bvec->bv_page; | ||
71 | struct buffer_head *bh, *head; | ||
72 | unsigned bio_start = bvec->bv_offset; | ||
73 | unsigned bio_end = bio_start + bvec->bv_len; | ||
74 | unsigned under_io = 0; | ||
75 | unsigned long flags; | ||
76 | |||
77 | if (!page) | ||
78 | continue; | ||
79 | |||
80 | if (error) { | ||
81 | SetPageError(page); | ||
82 | set_bit(AS_EIO, &page->mapping->flags); | ||
83 | } | ||
84 | bh = head = page_buffers(page); | ||
85 | /* | ||
86 | * We check all buffers in the page under BH_Uptodate_Lock | ||
87 | * to avoid races with other end io clearing async_write flags | ||
88 | */ | ||
89 | local_irq_save(flags); | ||
90 | bit_spin_lock(BH_Uptodate_Lock, &head->b_state); | ||
91 | do { | ||
92 | if (bh_offset(bh) < bio_start || | ||
93 | bh_offset(bh) + bh->b_size > bio_end) { | ||
94 | if (buffer_async_write(bh)) | ||
95 | under_io++; | ||
96 | continue; | ||
97 | } | ||
98 | clear_buffer_async_write(bh); | ||
99 | if (error) | ||
100 | buffer_io_error(bh); | ||
101 | } while ((bh = bh->b_this_page) != head); | ||
102 | bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); | ||
103 | local_irq_restore(flags); | ||
104 | if (!under_io) | ||
105 | end_page_writeback(page); | ||
106 | } | ||
63 | } | 107 | } |
64 | 108 | ||
65 | void ext4_free_io_end(ext4_io_end_t *io) | 109 | static void ext4_release_io_end(ext4_io_end_t *io_end) |
66 | { | 110 | { |
67 | BUG_ON(!io); | 111 | struct bio *bio, *next_bio; |
68 | BUG_ON(!list_empty(&io->list)); | 112 | |
69 | BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); | 113 | BUG_ON(!list_empty(&io_end->list)); |
114 | BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); | ||
115 | WARN_ON(io_end->handle); | ||
70 | 116 | ||
71 | if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) | 117 | if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) |
72 | wake_up_all(ext4_ioend_wq(io->inode)); | 118 | wake_up_all(ext4_ioend_wq(io_end->inode)); |
73 | kmem_cache_free(io_end_cachep, io); | 119 | |
120 | for (bio = io_end->bio; bio; bio = next_bio) { | ||
121 | next_bio = bio->bi_private; | ||
122 | ext4_finish_bio(bio); | ||
123 | bio_put(bio); | ||
124 | } | ||
125 | if (io_end->flag & EXT4_IO_END_DIRECT) | ||
126 | inode_dio_done(io_end->inode); | ||
127 | if (io_end->iocb) | ||
128 | aio_complete(io_end->iocb, io_end->result, 0); | ||
129 | kmem_cache_free(io_end_cachep, io_end); | ||
74 | } | 130 | } |
75 | 131 | ||
76 | /* check a range of space and convert unwritten extents to written. */ | 132 | static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) |
133 | { | ||
134 | struct inode *inode = io_end->inode; | ||
135 | |||
136 | io_end->flag &= ~EXT4_IO_END_UNWRITTEN; | ||
137 | /* Wake up anyone waiting on unwritten extent conversion */ | ||
138 | if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) | ||
139 | wake_up_all(ext4_ioend_wq(inode)); | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * Check a range of space and convert unwritten extents to written. Note that | ||
144 | * we are protected from truncate touching the same part of the extent tree by the |
145 | * fact that truncate code waits for all DIO to finish (thus exclusion from | ||
146 | * direct IO is achieved) and also waits for PageWriteback bits. Thus we | ||
147 | * cannot get to ext4_ext_truncate() before all IOs overlapping that range are | ||
148 | * completed (happens from ext4_free_ioend()). | ||
149 | */ | ||
77 | static int ext4_end_io(ext4_io_end_t *io) | 150 | static int ext4_end_io(ext4_io_end_t *io) |
78 | { | 151 | { |
79 | struct inode *inode = io->inode; | 152 | struct inode *inode = io->inode; |
80 | loff_t offset = io->offset; | 153 | loff_t offset = io->offset; |
81 | ssize_t size = io->size; | 154 | ssize_t size = io->size; |
155 | handle_t *handle = io->handle; | ||
82 | int ret = 0; | 156 | int ret = 0; |
83 | 157 | ||
84 | ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," | 158 | ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," |
85 | "list->prev 0x%p\n", | 159 | "list->prev 0x%p\n", |
86 | io, inode->i_ino, io->list.next, io->list.prev); | 160 | io, inode->i_ino, io->list.next, io->list.prev); |
87 | 161 | ||
88 | ret = ext4_convert_unwritten_extents(inode, offset, size); | 162 | io->handle = NULL; /* Following call will use up the handle */ |
163 | ret = ext4_convert_unwritten_extents(handle, inode, offset, size); | ||
89 | if (ret < 0) { | 164 | if (ret < 0) { |
90 | ext4_msg(inode->i_sb, KERN_EMERG, | 165 | ext4_msg(inode->i_sb, KERN_EMERG, |
91 | "failed to convert unwritten extents to written " | 166 | "failed to convert unwritten extents to written " |
@@ -93,30 +168,22 @@ static int ext4_end_io(ext4_io_end_t *io) | |||
93 | "(inode %lu, offset %llu, size %zd, error %d)", | 168 | "(inode %lu, offset %llu, size %zd, error %d)", |
94 | inode->i_ino, offset, size, ret); | 169 | inode->i_ino, offset, size, ret); |
95 | } | 170 | } |
96 | /* Wake up anyone waiting on unwritten extent conversion */ | 171 | ext4_clear_io_unwritten_flag(io); |
97 | if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) | 172 | ext4_release_io_end(io); |
98 | wake_up_all(ext4_ioend_wq(inode)); | ||
99 | if (io->flag & EXT4_IO_END_DIRECT) | ||
100 | inode_dio_done(inode); | ||
101 | if (io->iocb) | ||
102 | aio_complete(io->iocb, io->result, 0); | ||
103 | return ret; | 173 | return ret; |
104 | } | 174 | } |
105 | 175 | ||
106 | static void dump_completed_IO(struct inode *inode) | 176 | static void dump_completed_IO(struct inode *inode, struct list_head *head) |
107 | { | 177 | { |
108 | #ifdef EXT4FS_DEBUG | 178 | #ifdef EXT4FS_DEBUG |
109 | struct list_head *cur, *before, *after; | 179 | struct list_head *cur, *before, *after; |
110 | ext4_io_end_t *io, *io0, *io1; | 180 | ext4_io_end_t *io, *io0, *io1; |
111 | 181 | ||
112 | if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { | 182 | if (list_empty(head)) |
113 | ext4_debug("inode %lu completed_io list is empty\n", | ||
114 | inode->i_ino); | ||
115 | return; | 183 | return; |
116 | } | ||
117 | 184 | ||
118 | ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); | 185 | ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); |
119 | list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { | 186 | list_for_each_entry(io, head, list) { |
120 | cur = &io->list; | 187 | cur = &io->list; |
121 | before = cur->prev; | 188 | before = cur->prev; |
122 | io0 = container_of(before, ext4_io_end_t, list); | 189 | io0 = container_of(before, ext4_io_end_t, list); |
@@ -130,23 +197,30 @@ static void dump_completed_IO(struct inode *inode) | |||
130 | } | 197 | } |
131 | 198 | ||
132 | /* Add the io_end to per-inode completed end_io list. */ | 199 | /* Add the io_end to per-inode completed end_io list. */ |
133 | void ext4_add_complete_io(ext4_io_end_t *io_end) | 200 | static void ext4_add_complete_io(ext4_io_end_t *io_end) |
134 | { | 201 | { |
135 | struct ext4_inode_info *ei = EXT4_I(io_end->inode); | 202 | struct ext4_inode_info *ei = EXT4_I(io_end->inode); |
136 | struct workqueue_struct *wq; | 203 | struct workqueue_struct *wq; |
137 | unsigned long flags; | 204 | unsigned long flags; |
138 | 205 | ||
139 | BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); | 206 | BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); |
140 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | ||
141 | |||
142 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | 207 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
143 | if (list_empty(&ei->i_completed_io_list)) | 208 | if (io_end->handle) { |
144 | queue_work(wq, &ei->i_unwritten_work); | 209 | wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; |
145 | list_add_tail(&io_end->list, &ei->i_completed_io_list); | 210 | if (list_empty(&ei->i_rsv_conversion_list)) |
211 | queue_work(wq, &ei->i_rsv_conversion_work); | ||
212 | list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); | ||
213 | } else { | ||
214 | wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq; | ||
215 | if (list_empty(&ei->i_unrsv_conversion_list)) | ||
216 | queue_work(wq, &ei->i_unrsv_conversion_work); | ||
217 | list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list); | ||
218 | } | ||
146 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 219 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
147 | } | 220 | } |
148 | 221 | ||
149 | static int ext4_do_flush_completed_IO(struct inode *inode) | 222 | static int ext4_do_flush_completed_IO(struct inode *inode, |
223 | struct list_head *head) | ||
150 | { | 224 | { |
151 | ext4_io_end_t *io; | 225 | ext4_io_end_t *io; |
152 | struct list_head unwritten; | 226 | struct list_head unwritten; |
@@ -155,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode) | |||
155 | int err, ret = 0; | 229 | int err, ret = 0; |
156 | 230 | ||
157 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | 231 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
158 | dump_completed_IO(inode); | 232 | dump_completed_IO(inode, head); |
159 | list_replace_init(&ei->i_completed_io_list, &unwritten); | 233 | list_replace_init(head, &unwritten); |
160 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 234 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
161 | 235 | ||
162 | while (!list_empty(&unwritten)) { | 236 | while (!list_empty(&unwritten)) { |
@@ -167,30 +241,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode) | |||
167 | err = ext4_end_io(io); | 241 | err = ext4_end_io(io); |
168 | if (unlikely(!ret && err)) | 242 | if (unlikely(!ret && err)) |
169 | ret = err; | 243 | ret = err; |
170 | io->flag &= ~EXT4_IO_END_UNWRITTEN; | ||
171 | ext4_free_io_end(io); | ||
172 | } | 244 | } |
173 | return ret; | 245 | return ret; |
174 | } | 246 | } |
175 | 247 | ||
176 | /* | 248 | /* |
177 | * work on completed aio dio IO, to convert unwritten extents to extents | 249 | * work on completed IO, to convert unwritten extents to written extents |
178 | */ | 250 | */ |
179 | void ext4_end_io_work(struct work_struct *work) | 251 | void ext4_end_io_rsv_work(struct work_struct *work) |
180 | { | 252 | { |
181 | struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, | 253 | struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, |
182 | i_unwritten_work); | 254 | i_rsv_conversion_work); |
183 | ext4_do_flush_completed_IO(&ei->vfs_inode); | 255 | ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); |
184 | } | 256 | } |
185 | 257 | ||
186 | int ext4_flush_unwritten_io(struct inode *inode) | 258 | void ext4_end_io_unrsv_work(struct work_struct *work) |
187 | { | 259 | { |
188 | int ret; | 260 | struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, |
189 | WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && | 261 | i_unrsv_conversion_work); |
190 | !(inode->i_state & I_FREEING)); | 262 | ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list); |
191 | ret = ext4_do_flush_completed_IO(inode); | ||
192 | ext4_unwritten_wait(inode); | ||
193 | return ret; | ||
194 | } | 263 | } |
195 | 264 | ||
196 | ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | 265 | ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) |
@@ -200,83 +269,70 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | |||
200 | atomic_inc(&EXT4_I(inode)->i_ioend_count); | 269 | atomic_inc(&EXT4_I(inode)->i_ioend_count); |
201 | io->inode = inode; | 270 | io->inode = inode; |
202 | INIT_LIST_HEAD(&io->list); | 271 | INIT_LIST_HEAD(&io->list); |
272 | atomic_set(&io->count, 1); | ||
203 | } | 273 | } |
204 | return io; | 274 | return io; |
205 | } | 275 | } |
206 | 276 | ||
207 | /* | 277 | void ext4_put_io_end_defer(ext4_io_end_t *io_end) |
208 | * Print an buffer I/O error compatible with the fs/buffer.c. This | ||
209 | * provides compatibility with dmesg scrapers that look for a specific | ||
210 | * buffer I/O error message. We really need a unified error reporting | ||
211 | * structure to userspace ala Digital Unix's uerf system, but it's | ||
212 | * probably not going to happen in my lifetime, due to LKML politics... | ||
213 | */ | ||
214 | static void buffer_io_error(struct buffer_head *bh) | ||
215 | { | 278 | { |
216 | char b[BDEVNAME_SIZE]; | 279 | if (atomic_dec_and_test(&io_end->count)) { |
217 | printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", | 280 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { |
218 | bdevname(bh->b_bdev, b), | 281 | ext4_release_io_end(io_end); |
219 | (unsigned long long)bh->b_blocknr); | 282 | return; |
283 | } | ||
284 | ext4_add_complete_io(io_end); | ||
285 | } | ||
286 | } | ||
287 | |||
288 | int ext4_put_io_end(ext4_io_end_t *io_end) | ||
289 | { | ||
290 | int err = 0; | ||
291 | |||
292 | if (atomic_dec_and_test(&io_end->count)) { | ||
293 | if (io_end->flag & EXT4_IO_END_UNWRITTEN) { | ||
294 | err = ext4_convert_unwritten_extents(io_end->handle, | ||
295 | io_end->inode, io_end->offset, | ||
296 | io_end->size); | ||
297 | io_end->handle = NULL; | ||
298 | ext4_clear_io_unwritten_flag(io_end); | ||
299 | } | ||
300 | ext4_release_io_end(io_end); | ||
301 | } | ||
302 | return err; | ||
303 | } | ||
304 | |||
305 | ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) | ||
306 | { | ||
307 | atomic_inc(&io_end->count); | ||
308 | return io_end; | ||
220 | } | 309 | } |
221 | 310 | ||
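Taken together, a condensed sketch of the reference discipline these helpers establish (drawn from the DIO path earlier in this patch):

	io_end = ext4_init_io_end(inode, GFP_NOFS);	/* count == 1 */
	iocb->private = ext4_get_io_end(io_end);	/* +1 for the DIO side */
	/* ... submit bios ... */
	ext4_put_io_end(io_end);			/* drop submitter's ref */

	/*
	 * Whichever put drops the last reference either releases the io_end
	 * or, via ext4_put_io_end_defer(), queues the unwritten-extent
	 * conversion to a workqueue.
	 */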
222 | static void ext4_end_bio(struct bio *bio, int error) | 311 | static void ext4_end_bio(struct bio *bio, int error) |
223 | { | 312 | { |
224 | ext4_io_end_t *io_end = bio->bi_private; | 313 | ext4_io_end_t *io_end = bio->bi_private; |
225 | struct inode *inode; | ||
226 | int i; | ||
227 | int blocksize; | ||
228 | sector_t bi_sector = bio->bi_sector; | 314 | sector_t bi_sector = bio->bi_sector; |
229 | 315 | ||
230 | BUG_ON(!io_end); | 316 | BUG_ON(!io_end); |
231 | inode = io_end->inode; | ||
232 | blocksize = 1 << inode->i_blkbits; | ||
233 | bio->bi_private = NULL; | ||
234 | bio->bi_end_io = NULL; | 317 | bio->bi_end_io = NULL; |
235 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 318 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
236 | error = 0; | 319 | error = 0; |
237 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
238 | struct bio_vec *bvec = &bio->bi_io_vec[i]; | ||
239 | struct page *page = bvec->bv_page; | ||
240 | struct buffer_head *bh, *head; | ||
241 | unsigned bio_start = bvec->bv_offset; | ||
242 | unsigned bio_end = bio_start + bvec->bv_len; | ||
243 | unsigned under_io = 0; | ||
244 | unsigned long flags; | ||
245 | 320 | ||
246 | if (!page) | 321 | if (io_end->flag & EXT4_IO_END_UNWRITTEN) { |
247 | continue; | ||
248 | |||
249 | if (error) { | ||
250 | SetPageError(page); | ||
251 | set_bit(AS_EIO, &page->mapping->flags); | ||
252 | } | ||
253 | bh = head = page_buffers(page); | ||
254 | /* | 322 | /* |
255 | * We check all buffers in the page under BH_Uptodate_Lock | 323 | * Link bio into list hanging from io_end. We have to do it |
256 | * to avoid races with other end io clearing async_write flags | 324 | * atomically as bio completions can be racing against each |
325 | * other. | ||
257 | */ | 326 | */ |
258 | local_irq_save(flags); | 327 | bio->bi_private = xchg(&io_end->bio, bio); |
259 | bit_spin_lock(BH_Uptodate_Lock, &head->b_state); | 328 | } else { |
260 | do { | 329 | ext4_finish_bio(bio); |
261 | if (bh_offset(bh) < bio_start || | 330 | bio_put(bio); |
262 | bh_offset(bh) + blocksize > bio_end) { | ||
263 | if (buffer_async_write(bh)) | ||
264 | under_io++; | ||
265 | continue; | ||
266 | } | ||
267 | clear_buffer_async_write(bh); | ||
268 | if (error) | ||
269 | buffer_io_error(bh); | ||
270 | } while ((bh = bh->b_this_page) != head); | ||
271 | bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); | ||
272 | local_irq_restore(flags); | ||
273 | if (!under_io) | ||
274 | end_page_writeback(page); | ||
275 | } | 331 | } |
276 | bio_put(bio); | ||
277 | 332 | ||
278 | if (error) { | 333 | if (error) { |
279 | io_end->flag |= EXT4_IO_END_ERROR; | 334 | struct inode *inode = io_end->inode; |
335 | |||
280 | ext4_warning(inode->i_sb, "I/O error writing to inode %lu " | 336 | ext4_warning(inode->i_sb, "I/O error writing to inode %lu " |
281 | "(offset %llu size %ld starting block %llu)", | 337 | "(offset %llu size %ld starting block %llu)", |
282 | inode->i_ino, | 338 | inode->i_ino, |
@@ -285,13 +341,7 @@ static void ext4_end_bio(struct bio *bio, int error) | |||
285 | (unsigned long long) | 341 | (unsigned long long) |
286 | bi_sector >> (inode->i_blkbits - 9)); | 342 | bi_sector >> (inode->i_blkbits - 9)); |
287 | } | 343 | } |
288 | 344 | ext4_put_io_end_defer(io_end); | |
289 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
290 | ext4_free_io_end(io_end); | ||
291 | return; | ||
292 | } | ||
293 | |||
294 | ext4_add_complete_io(io_end); | ||
295 | } | 345 | } |
296 | 346 | ||
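
The unwritten case above chains completed bios onto io_end->bio with a single xchg(), so racing completions push onto the list without taking a lock. A small userspace sketch of that push, with C11 atomic_exchange() standing in for the kernel's xchg() (the struct and field names here are made up):

#include <stdatomic.h>
#include <stdio.h>

struct bio {
	struct bio *next;       /* plays the role of bi_private */
	int id;
};

static _Atomic(struct bio *) list_head;   /* plays the role of io_end->bio */

static void push_bio(struct bio *b)
{
	/* Swap ourselves in as the new head; whatever was there before
	 * becomes our next pointer. No lock needed. */
	b->next = atomic_exchange(&list_head, b);
}

int main(void)
{
	struct bio a = { .id = 1 }, b = { .id = 2 };

	push_bio(&a);
	push_bio(&b);
	for (struct bio *p = atomic_load(&list_head); p; p = p->next)
		printf("bio %d\n", p->id);
	return 0;
}
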
297 | void ext4_io_submit(struct ext4_io_submit *io) | 347 | void ext4_io_submit(struct ext4_io_submit *io) |
@@ -305,43 +355,38 @@ void ext4_io_submit(struct ext4_io_submit *io) | |||
305 | bio_put(io->io_bio); | 355 | bio_put(io->io_bio); |
306 | } | 356 | } |
307 | io->io_bio = NULL; | 357 | io->io_bio = NULL; |
308 | io->io_op = 0; | 358 | } |
359 | |||
360 | void ext4_io_submit_init(struct ext4_io_submit *io, | ||
361 | struct writeback_control *wbc) | ||
362 | { | ||
363 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); | ||
364 | io->io_bio = NULL; | ||
309 | io->io_end = NULL; | 365 | io->io_end = NULL; |
310 | } | 366 | } |
311 | 367 | ||
312 | static int io_submit_init(struct ext4_io_submit *io, | 368 | static int io_submit_init_bio(struct ext4_io_submit *io, |
313 | struct inode *inode, | 369 | struct buffer_head *bh) |
314 | struct writeback_control *wbc, | ||
315 | struct buffer_head *bh) | ||
316 | { | 370 | { |
317 | ext4_io_end_t *io_end; | ||
318 | struct page *page = bh->b_page; | ||
319 | int nvecs = bio_get_nr_vecs(bh->b_bdev); | 371 | int nvecs = bio_get_nr_vecs(bh->b_bdev); |
320 | struct bio *bio; | 372 | struct bio *bio; |
321 | 373 | ||
322 | io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
323 | if (!io_end) | ||
324 | return -ENOMEM; | ||
325 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); | 374 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); |
375 | if (!bio) | ||
376 | return -ENOMEM; | ||
326 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 377 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
327 | bio->bi_bdev = bh->b_bdev; | 378 | bio->bi_bdev = bh->b_bdev; |
328 | bio->bi_private = io->io_end = io_end; | ||
329 | bio->bi_end_io = ext4_end_bio; | 379 | bio->bi_end_io = ext4_end_bio; |
330 | 380 | bio->bi_private = ext4_get_io_end(io->io_end); | |
331 | io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); | ||
332 | |||
333 | io->io_bio = bio; | 381 | io->io_bio = bio; |
334 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); | ||
335 | io->io_next_block = bh->b_blocknr; | 382 | io->io_next_block = bh->b_blocknr; |
336 | return 0; | 383 | return 0; |
337 | } | 384 | } |
338 | 385 | ||
339 | static int io_submit_add_bh(struct ext4_io_submit *io, | 386 | static int io_submit_add_bh(struct ext4_io_submit *io, |
340 | struct inode *inode, | 387 | struct inode *inode, |
341 | struct writeback_control *wbc, | ||
342 | struct buffer_head *bh) | 388 | struct buffer_head *bh) |
343 | { | 389 | { |
344 | ext4_io_end_t *io_end; | ||
345 | int ret; | 390 | int ret; |
346 | 391 | ||
347 | if (io->io_bio && bh->b_blocknr != io->io_next_block) { | 392 | if (io->io_bio && bh->b_blocknr != io->io_next_block) { |
@@ -349,18 +394,14 @@ submit_and_retry: | |||
349 | ext4_io_submit(io); | 394 | ext4_io_submit(io); |
350 | } | 395 | } |
351 | if (io->io_bio == NULL) { | 396 | if (io->io_bio == NULL) { |
352 | ret = io_submit_init(io, inode, wbc, bh); | 397 | ret = io_submit_init_bio(io, bh); |
353 | if (ret) | 398 | if (ret) |
354 | return ret; | 399 | return ret; |
355 | } | 400 | } |
356 | io_end = io->io_end; | ||
357 | if (test_clear_buffer_uninit(bh)) | ||
358 | ext4_set_io_unwritten_flag(inode, io_end); | ||
359 | io->io_end->size += bh->b_size; | ||
360 | io->io_next_block++; | ||
361 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); | 401 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); |
362 | if (ret != bh->b_size) | 402 | if (ret != bh->b_size) |
363 | goto submit_and_retry; | 403 | goto submit_and_retry; |
404 | io->io_next_block++; | ||
364 | return 0; | 405 | return 0; |
365 | } | 406 | } |
366 | 407 | ||
@@ -432,7 +473,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
432 | do { | 473 | do { |
433 | if (!buffer_async_write(bh)) | 474 | if (!buffer_async_write(bh)) |
434 | continue; | 475 | continue; |
435 | ret = io_submit_add_bh(io, inode, wbc, bh); | 476 | ret = io_submit_add_bh(io, inode, bh); |
436 | if (ret) { | 477 | if (ret) { |
437 | /* | 478 | /* |
438 | * We only get here on ENOMEM. Not much else | 479 | * We only get here on ENOMEM. Not much else |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b27c96d01965..c5adbb318a90 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb, | |||
79 | ext4_fsblk_t end = start + input->blocks_count; | 79 | ext4_fsblk_t end = start + input->blocks_count; |
80 | ext4_group_t group = input->group; | 80 | ext4_group_t group = input->group; |
81 | ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; | 81 | ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; |
82 | unsigned overhead = ext4_group_overhead_blocks(sb, group); | 82 | unsigned overhead; |
83 | ext4_fsblk_t metaend = start + overhead; | 83 | ext4_fsblk_t metaend; |
84 | struct buffer_head *bh = NULL; | 84 | struct buffer_head *bh = NULL; |
85 | ext4_grpblk_t free_blocks_count, offset; | 85 | ext4_grpblk_t free_blocks_count, offset; |
86 | int err = -EINVAL; | 86 | int err = -EINVAL; |
87 | 87 | ||
88 | if (group != sbi->s_groups_count) { | ||
89 | ext4_warning(sb, "Cannot add at group %u (only %u groups)", | ||
90 | input->group, sbi->s_groups_count); | ||
91 | return -EINVAL; | ||
92 | } | ||
93 | |||
94 | overhead = ext4_group_overhead_blocks(sb, group); | ||
95 | metaend = start + overhead; | ||
88 | input->free_blocks_count = free_blocks_count = | 96 | input->free_blocks_count = free_blocks_count = |
89 | input->blocks_count - 2 - overhead - sbi->s_itb_per_group; | 97 | input->blocks_count - 2 - overhead - sbi->s_itb_per_group; |
90 | 98 | ||
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb, | |||
96 | free_blocks_count, input->reserved_blocks); | 104 | free_blocks_count, input->reserved_blocks); |
97 | 105 | ||
98 | ext4_get_group_no_and_offset(sb, start, NULL, &offset); | 106 | ext4_get_group_no_and_offset(sb, start, NULL, &offset); |
99 | if (group != sbi->s_groups_count) | 107 | if (offset != 0) |
100 | ext4_warning(sb, "Cannot add at group %u (only %u groups)", | ||
101 | input->group, sbi->s_groups_count); | ||
102 | else if (offset != 0) | ||
103 | ext4_warning(sb, "Last group not full"); | 108 | ext4_warning(sb, "Last group not full"); |
104 | else if (input->reserved_blocks > input->blocks_count / 5) | 109 | else if (input->reserved_blocks > input->blocks_count / 5) |
105 | ext4_warning(sb, "Reserved blocks too high (%u)", | 110 | ext4_warning(sb, "Reserved blocks too high (%u)", |
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
1551 | int reserved_gdb = ext4_bg_has_super(sb, input->group) ? | 1556 | int reserved_gdb = ext4_bg_has_super(sb, input->group) ? |
1552 | le16_to_cpu(es->s_reserved_gdt_blocks) : 0; | 1557 | le16_to_cpu(es->s_reserved_gdt_blocks) : 0; |
1553 | struct inode *inode = NULL; | 1558 | struct inode *inode = NULL; |
1554 | int gdb_off, gdb_num; | 1559 | int gdb_off; |
1555 | int err; | 1560 | int err; |
1556 | __u16 bg_flags = 0; | 1561 | __u16 bg_flags = 0; |
1557 | 1562 | ||
1558 | gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); | ||
1559 | gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); | 1563 | gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); |
1560 | 1564 | ||
1561 | if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, | 1565 | if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, |
@@ -1656,12 +1660,10 @@ errout: | |||
1656 | err = err2; | 1660 | err = err2; |
1657 | 1661 | ||
1658 | if (!err) { | 1662 | if (!err) { |
1659 | ext4_fsblk_t first_block; | ||
1660 | first_block = ext4_group_first_block_no(sb, 0); | ||
1661 | if (test_opt(sb, DEBUG)) | 1663 | if (test_opt(sb, DEBUG)) |
1662 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu " | 1664 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu " |
1663 | "blocks\n", ext4_blocks_count(es)); | 1665 | "blocks\n", ext4_blocks_count(es)); |
1664 | update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, | 1666 | update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, |
1665 | (char *)es, sizeof(struct ext4_super_block), 0); | 1667 | (char *)es, sizeof(struct ext4_super_block), 0); |
1666 | } | 1668 | } |
1667 | return err; | 1669 | return err; |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94cc84db7c9a..85b3dd60169b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, | |||
69 | static void ext4_clear_journal_err(struct super_block *sb, | 69 | static void ext4_clear_journal_err(struct super_block *sb, |
70 | struct ext4_super_block *es); | 70 | struct ext4_super_block *es); |
71 | static int ext4_sync_fs(struct super_block *sb, int wait); | 71 | static int ext4_sync_fs(struct super_block *sb, int wait); |
72 | static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); | ||
72 | static int ext4_remount(struct super_block *sb, int *flags, char *data); | 73 | static int ext4_remount(struct super_block *sb, int *flags, char *data); |
73 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); | 74 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); |
74 | static int ext4_unfreeze(struct super_block *sb); | 75 | static int ext4_unfreeze(struct super_block *sb); |
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb) | |||
398 | } | 399 | } |
399 | if (test_opt(sb, ERRORS_RO)) { | 400 | if (test_opt(sb, ERRORS_RO)) { |
400 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); | 401 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); |
402 | /* | ||
403 | * Make sure updated value of ->s_mount_flags will be visible | ||
404 | * before ->s_flags update | ||
405 | */ | ||
406 | smp_wmb(); | ||
401 | sb->s_flags |= MS_RDONLY; | 407 | sb->s_flags |= MS_RDONLY; |
402 | } | 408 | } |
403 | if (test_opt(sb, ERRORS_PANIC)) | 409 | if (test_opt(sb, ERRORS_PANIC)) |
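
The smp_wmb() added above orders the ->s_mount_flags store before the ->s_flags store, so a reader that observes MS_RDONLY (behind a matching read barrier) is guaranteed to also see the updated mount flags. A userspace model of the pairing, with C11 fences standing in for smp_wmb()/smp_rmb(); the flag values and the reader side are assumptions for illustration, since the patch itself only shows the writer:

#include <stdatomic.h>

static atomic_int mount_flags;   /* models sbi->s_mount_flags */
static atomic_int s_flags;       /* models sb->s_flags */

#define MF_FS_ABORTED	0x1
#define MS_RDONLY	0x2

static void writer(void)
{
	atomic_store_explicit(&mount_flags, MF_FS_ABORTED, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);		/* smp_wmb() */
	atomic_store_explicit(&s_flags, MS_RDONLY, memory_order_relaxed);
}

static int reader(void)
{
	if (atomic_load_explicit(&s_flags, memory_order_relaxed) & MS_RDONLY) {
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
		return atomic_load_explicit(&mount_flags,
					    memory_order_relaxed) & MF_FS_ABORTED;
	}
	return 0;
}

int main(void)
{
	writer();
	return !reader();   /* abort flag is visible once RDONLY is */
}
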
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function, | |||
422 | ext4_handle_error(sb); | 428 | ext4_handle_error(sb); |
423 | } | 429 | } |
424 | 430 | ||
425 | void ext4_error_inode(struct inode *inode, const char *function, | 431 | void __ext4_error_inode(struct inode *inode, const char *function, |
426 | unsigned int line, ext4_fsblk_t block, | 432 | unsigned int line, ext4_fsblk_t block, |
427 | const char *fmt, ...) | 433 | const char *fmt, ...) |
428 | { | 434 | { |
429 | va_list args; | 435 | va_list args; |
430 | struct va_format vaf; | 436 | struct va_format vaf; |
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function, | |||
451 | ext4_handle_error(inode->i_sb); | 457 | ext4_handle_error(inode->i_sb); |
452 | } | 458 | } |
453 | 459 | ||
454 | void ext4_error_file(struct file *file, const char *function, | 460 | void __ext4_error_file(struct file *file, const char *function, |
455 | unsigned int line, ext4_fsblk_t block, | 461 | unsigned int line, ext4_fsblk_t block, |
456 | const char *fmt, ...) | 462 | const char *fmt, ...) |
457 | { | 463 | { |
458 | va_list args; | 464 | va_list args; |
459 | struct va_format vaf; | 465 | struct va_format vaf; |
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function, | |||
570 | 576 | ||
571 | if ((sb->s_flags & MS_RDONLY) == 0) { | 577 | if ((sb->s_flags & MS_RDONLY) == 0) { |
572 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); | 578 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); |
573 | sb->s_flags |= MS_RDONLY; | ||
574 | EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; | 579 | EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; |
580 | /* | ||
581 | * Make sure updated value of ->s_mount_flags will be visible | ||
582 | * before ->s_flags update | ||
583 | */ | ||
584 | smp_wmb(); | ||
585 | sb->s_flags |= MS_RDONLY; | ||
575 | if (EXT4_SB(sb)->s_journal) | 586 | if (EXT4_SB(sb)->s_journal) |
576 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); | 587 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); |
577 | save_error_info(sb, function, line); | 588 | save_error_info(sb, function, line); |
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function, | |||
580 | panic("EXT4-fs panic from previous error\n"); | 591 | panic("EXT4-fs panic from previous error\n"); |
581 | } | 592 | } |
582 | 593 | ||
583 | void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) | 594 | void __ext4_msg(struct super_block *sb, |
595 | const char *prefix, const char *fmt, ...) | ||
584 | { | 596 | { |
585 | struct va_format vaf; | 597 | struct va_format vaf; |
586 | va_list args; | 598 | va_list args; |
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb) | |||
750 | ext4_unregister_li_request(sb); | 762 | ext4_unregister_li_request(sb); |
751 | dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); | 763 | dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); |
752 | 764 | ||
753 | flush_workqueue(sbi->dio_unwritten_wq); | 765 | flush_workqueue(sbi->unrsv_conversion_wq); |
754 | destroy_workqueue(sbi->dio_unwritten_wq); | 766 | flush_workqueue(sbi->rsv_conversion_wq); |
767 | destroy_workqueue(sbi->unrsv_conversion_wq); | ||
768 | destroy_workqueue(sbi->rsv_conversion_wq); | ||
755 | 769 | ||
756 | if (sbi->s_journal) { | 770 | if (sbi->s_journal) { |
757 | err = jbd2_journal_destroy(sbi->s_journal); | 771 | err = jbd2_journal_destroy(sbi->s_journal); |
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb) | |||
760 | ext4_abort(sb, "Couldn't clean up the journal"); | 774 | ext4_abort(sb, "Couldn't clean up the journal"); |
761 | } | 775 | } |
762 | 776 | ||
763 | ext4_es_unregister_shrinker(sb); | 777 | ext4_es_unregister_shrinker(sbi); |
764 | del_timer(&sbi->s_err_report); | 778 | del_timer(&sbi->s_err_report); |
765 | ext4_release_system_zone(sb); | 779 | ext4_release_system_zone(sb); |
766 | ext4_mb_release(sb); | 780 | ext4_mb_release(sb); |
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
849 | rwlock_init(&ei->i_es_lock); | 863 | rwlock_init(&ei->i_es_lock); |
850 | INIT_LIST_HEAD(&ei->i_es_lru); | 864 | INIT_LIST_HEAD(&ei->i_es_lru); |
851 | ei->i_es_lru_nr = 0; | 865 | ei->i_es_lru_nr = 0; |
866 | ei->i_touch_when = 0; | ||
852 | ei->i_reserved_data_blocks = 0; | 867 | ei->i_reserved_data_blocks = 0; |
853 | ei->i_reserved_meta_blocks = 0; | 868 | ei->i_reserved_meta_blocks = 0; |
854 | ei->i_allocated_meta_blocks = 0; | 869 | ei->i_allocated_meta_blocks = 0; |
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
859 | ei->i_reserved_quota = 0; | 874 | ei->i_reserved_quota = 0; |
860 | #endif | 875 | #endif |
861 | ei->jinode = NULL; | 876 | ei->jinode = NULL; |
862 | INIT_LIST_HEAD(&ei->i_completed_io_list); | 877 | INIT_LIST_HEAD(&ei->i_rsv_conversion_list); |
878 | INIT_LIST_HEAD(&ei->i_unrsv_conversion_list); | ||
863 | spin_lock_init(&ei->i_completed_io_lock); | 879 | spin_lock_init(&ei->i_completed_io_lock); |
864 | ei->i_sync_tid = 0; | 880 | ei->i_sync_tid = 0; |
865 | ei->i_datasync_tid = 0; | 881 | ei->i_datasync_tid = 0; |
866 | atomic_set(&ei->i_ioend_count, 0); | 882 | atomic_set(&ei->i_ioend_count, 0); |
867 | atomic_set(&ei->i_unwritten, 0); | 883 | atomic_set(&ei->i_unwritten, 0); |
868 | INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); | 884 | INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); |
885 | INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work); | ||
869 | 886 | ||
870 | return &ei->vfs_inode; | 887 | return &ei->vfs_inode; |
871 | } | 888 | } |
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = { | |||
1093 | .dirty_inode = ext4_dirty_inode, | 1110 | .dirty_inode = ext4_dirty_inode, |
1094 | .drop_inode = ext4_drop_inode, | 1111 | .drop_inode = ext4_drop_inode, |
1095 | .evict_inode = ext4_evict_inode, | 1112 | .evict_inode = ext4_evict_inode, |
1113 | .sync_fs = ext4_sync_fs_nojournal, | ||
1096 | .put_super = ext4_put_super, | 1114 | .put_super = ext4_put_super, |
1097 | .statfs = ext4_statfs, | 1115 | .statfs = ext4_statfs, |
1098 | .remount_fs = ext4_remount, | 1116 | .remount_fs = ext4_remount, |
@@ -1908,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
1908 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1926 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1909 | struct ext4_group_desc *gdp = NULL; | 1927 | struct ext4_group_desc *gdp = NULL; |
1910 | ext4_group_t flex_group; | 1928 | ext4_group_t flex_group; |
1911 | unsigned int groups_per_flex = 0; | ||
1912 | int i, err; | 1929 | int i, err; |
1913 | 1930 | ||
1914 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; | 1931 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; |
@@ -1916,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
1916 | sbi->s_log_groups_per_flex = 0; | 1933 | sbi->s_log_groups_per_flex = 0; |
1917 | return 1; | 1934 | return 1; |
1918 | } | 1935 | } |
1919 | groups_per_flex = 1U << sbi->s_log_groups_per_flex; | ||
1920 | 1936 | ||
1921 | err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); | 1937 | err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); |
1922 | if (err) | 1938 | if (err) |
@@ -2164,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
2164 | list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); | 2180 | list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); |
2165 | dquot_initialize(inode); | 2181 | dquot_initialize(inode); |
2166 | if (inode->i_nlink) { | 2182 | if (inode->i_nlink) { |
2167 | ext4_msg(sb, KERN_DEBUG, | 2183 | if (test_opt(sb, DEBUG)) |
2168 | "%s: truncating inode %lu to %lld bytes", | 2184 | ext4_msg(sb, KERN_DEBUG, |
2169 | __func__, inode->i_ino, inode->i_size); | 2185 | "%s: truncating inode %lu to %lld bytes", |
2186 | __func__, inode->i_ino, inode->i_size); | ||
2170 | jbd_debug(2, "truncating inode %lu to %lld bytes\n", | 2187 | jbd_debug(2, "truncating inode %lu to %lld bytes\n", |
2171 | inode->i_ino, inode->i_size); | 2188 | inode->i_ino, inode->i_size); |
2172 | mutex_lock(&inode->i_mutex); | 2189 | mutex_lock(&inode->i_mutex); |
2190 | truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
2173 | ext4_truncate(inode); | 2191 | ext4_truncate(inode); |
2174 | mutex_unlock(&inode->i_mutex); | 2192 | mutex_unlock(&inode->i_mutex); |
2175 | nr_truncates++; | 2193 | nr_truncates++; |
2176 | } else { | 2194 | } else { |
2177 | ext4_msg(sb, KERN_DEBUG, | 2195 | if (test_opt(sb, DEBUG)) |
2178 | "%s: deleting unreferenced inode %lu", | 2196 | ext4_msg(sb, KERN_DEBUG, |
2179 | __func__, inode->i_ino); | 2197 | "%s: deleting unreferenced inode %lu", |
2198 | __func__, inode->i_ino); | ||
2180 | jbd_debug(2, "deleting unreferenced inode %lu\n", | 2199 | jbd_debug(2, "deleting unreferenced inode %lu\n", |
2181 | inode->i_ino); | 2200 | inode->i_ino); |
2182 | nr_orphans++; | 2201 | nr_orphans++; |
@@ -2377,7 +2396,10 @@ struct ext4_attr { | |||
2377 | ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); | 2396 | ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); |
2378 | ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, | 2397 | ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, |
2379 | const char *, size_t); | 2398 | const char *, size_t); |
2380 | int offset; | 2399 | union { |
2400 | int offset; | ||
2401 | int deprecated_val; | ||
2402 | } u; | ||
2381 | }; | 2403 | }; |
2382 | 2404 | ||
2383 | static int parse_strtoull(const char *buf, | 2405 | static int parse_strtoull(const char *buf, |
@@ -2446,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, | |||
2446 | static ssize_t sbi_ui_show(struct ext4_attr *a, | 2468 | static ssize_t sbi_ui_show(struct ext4_attr *a, |
2447 | struct ext4_sb_info *sbi, char *buf) | 2469 | struct ext4_sb_info *sbi, char *buf) |
2448 | { | 2470 | { |
2449 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); | 2471 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); |
2450 | 2472 | ||
2451 | return snprintf(buf, PAGE_SIZE, "%u\n", *ui); | 2473 | return snprintf(buf, PAGE_SIZE, "%u\n", *ui); |
2452 | } | 2474 | } |
@@ -2455,7 +2477,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, | |||
2455 | struct ext4_sb_info *sbi, | 2477 | struct ext4_sb_info *sbi, |
2456 | const char *buf, size_t count) | 2478 | const char *buf, size_t count) |
2457 | { | 2479 | { |
2458 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); | 2480 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); |
2459 | unsigned long t; | 2481 | unsigned long t; |
2460 | int ret; | 2482 | int ret; |
2461 | 2483 | ||
@@ -2504,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a, | |||
2504 | return count; | 2526 | return count; |
2505 | } | 2527 | } |
2506 | 2528 | ||
2529 | static ssize_t sbi_deprecated_show(struct ext4_attr *a, | ||
2530 | struct ext4_sb_info *sbi, char *buf) | ||
2531 | { | ||
2532 | return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); | ||
2533 | } | ||
2534 | |||
2507 | #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ | 2535 | #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ |
2508 | static struct ext4_attr ext4_attr_##_name = { \ | 2536 | static struct ext4_attr ext4_attr_##_name = { \ |
2509 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | 2537 | .attr = {.name = __stringify(_name), .mode = _mode }, \ |
2510 | .show = _show, \ | 2538 | .show = _show, \ |
2511 | .store = _store, \ | 2539 | .store = _store, \ |
2512 | .offset = offsetof(struct ext4_sb_info, _elname), \ | 2540 | .u = { \ |
2541 | .offset = offsetof(struct ext4_sb_info, _elname),\ | ||
2542 | }, \ | ||
2513 | } | 2543 | } |
2514 | #define EXT4_ATTR(name, mode, show, store) \ | 2544 | #define EXT4_ATTR(name, mode, show, store) \ |
2515 | static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) | 2545 | static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) |
@@ -2520,6 +2550,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) | |||
2520 | #define EXT4_RW_ATTR_SBI_UI(name, elname) \ | 2550 | #define EXT4_RW_ATTR_SBI_UI(name, elname) \ |
2521 | EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) | 2551 | EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) |
2522 | #define ATTR_LIST(name) &ext4_attr_##name.attr | 2552 | #define ATTR_LIST(name) &ext4_attr_##name.attr |
2553 | #define EXT4_DEPRECATED_ATTR(_name, _val) \ | ||
2554 | static struct ext4_attr ext4_attr_##_name = { \ | ||
2555 | .attr = {.name = __stringify(_name), .mode = 0444 }, \ | ||
2556 | .show = sbi_deprecated_show, \ | ||
2557 | .u = { \ | ||
2558 | .deprecated_val = _val, \ | ||
2559 | }, \ | ||
2560 | } | ||
2523 | 2561 | ||
2524 | EXT4_RO_ATTR(delayed_allocation_blocks); | 2562 | EXT4_RO_ATTR(delayed_allocation_blocks); |
2525 | EXT4_RO_ATTR(session_write_kbytes); | 2563 | EXT4_RO_ATTR(session_write_kbytes); |
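
The union introduced above lets a single ext4_attr carry either a live field offset into the superblock info, or, for retired tunables like max_writeback_mb_bump, a fixed value to report back. A compact userspace model of that dispatch (struct layout and names are illustrative, not the kernel's):

#include <stddef.h>
#include <stdio.h>

struct sb_info {
	unsigned int stream_req;
};

struct attr {
	const char *name;
	int deprecated;
	union {
		size_t offset;          /* where the live value lives */
		int deprecated_val;     /* fixed value for retired knobs */
	} u;
};

static int attr_show(const struct attr *a, const struct sb_info *sbi)
{
	if (a->deprecated)
		return a->u.deprecated_val;
	return *(const unsigned int *)((const char *)sbi + a->u.offset);
}

int main(void)
{
	struct sb_info sbi = { .stream_req = 16 };
	struct attr live = { "mb_stream_req", 0,
			     { .offset = offsetof(struct sb_info, stream_req) } };
	struct attr dead = { "max_writeback_mb_bump", 1,
			     { .deprecated_val = 128 } };

	printf("%s=%d %s=%d\n", live.name, attr_show(&live, &sbi),
	       dead.name, attr_show(&dead, &sbi));
	return 0;
}
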
@@ -2534,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); | |||
2534 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); | 2572 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); |
2535 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); | 2573 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); |
2536 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); | 2574 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); |
2537 | EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); | 2575 | EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); |
2538 | EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); | 2576 | EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); |
2539 | EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); | 2577 | EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); |
2540 | 2578 | ||
@@ -3763,7 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3763 | sbi->s_err_report.data = (unsigned long) sb; | 3801 | sbi->s_err_report.data = (unsigned long) sb; |
3764 | 3802 | ||
3765 | /* Register extent status tree shrinker */ | 3803 | /* Register extent status tree shrinker */ |
3766 | ext4_es_register_shrinker(sb); | 3804 | ext4_es_register_shrinker(sbi); |
3767 | 3805 | ||
3768 | err = percpu_counter_init(&sbi->s_freeclusters_counter, | 3806 | err = percpu_counter_init(&sbi->s_freeclusters_counter, |
3769 | ext4_count_free_clusters(sb)); | 3807 | ext4_count_free_clusters(sb)); |
@@ -3787,7 +3825,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3787 | } | 3825 | } |
3788 | 3826 | ||
3789 | sbi->s_stripe = ext4_get_stripe_size(sbi); | 3827 | sbi->s_stripe = ext4_get_stripe_size(sbi); |
3790 | sbi->s_max_writeback_mb_bump = 128; | ||
3791 | sbi->s_extent_max_zeroout_kb = 32; | 3828 | sbi->s_extent_max_zeroout_kb = 32; |
3792 | 3829 | ||
3793 | /* | 3830 | /* |
@@ -3915,12 +3952,20 @@ no_journal: | |||
3915 | * The maximum number of concurrent work items can be high and | 3952 | * The maximum number of concurrent work items can be high and |
3916 | * concurrency isn't really necessary. Limit it to 1. | 3953 | * concurrency isn't really necessary. Limit it to 1. |
3917 | */ | 3954 | */ |
3918 | EXT4_SB(sb)->dio_unwritten_wq = | 3955 | EXT4_SB(sb)->rsv_conversion_wq = |
3919 | alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); | 3956 | alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); |
3920 | if (!EXT4_SB(sb)->dio_unwritten_wq) { | 3957 | if (!EXT4_SB(sb)->rsv_conversion_wq) { |
3921 | printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); | 3958 | printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); |
3922 | ret = -ENOMEM; | 3959 | ret = -ENOMEM; |
3923 | goto failed_mount_wq; | 3960 | goto failed_mount4; |
3961 | } | ||
3962 | |||
3963 | EXT4_SB(sb)->unrsv_conversion_wq = | ||
3964 | alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); | ||
3965 | if (!EXT4_SB(sb)->unrsv_conversion_wq) { | ||
3966 | printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); | ||
3967 | ret = -ENOMEM; | ||
3968 | goto failed_mount4; | ||
3924 | } | 3969 | } |
3925 | 3970 | ||
3926 | /* | 3971 | /* |
@@ -4074,14 +4119,17 @@ failed_mount4a: | |||
4074 | sb->s_root = NULL; | 4119 | sb->s_root = NULL; |
4075 | failed_mount4: | 4120 | failed_mount4: |
4076 | ext4_msg(sb, KERN_ERR, "mount failed"); | 4121 | ext4_msg(sb, KERN_ERR, "mount failed"); |
4077 | destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); | 4122 | if (EXT4_SB(sb)->rsv_conversion_wq) |
4123 | destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); | ||
4124 | if (EXT4_SB(sb)->unrsv_conversion_wq) | ||
4125 | destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); | ||
4078 | failed_mount_wq: | 4126 | failed_mount_wq: |
4079 | if (sbi->s_journal) { | 4127 | if (sbi->s_journal) { |
4080 | jbd2_journal_destroy(sbi->s_journal); | 4128 | jbd2_journal_destroy(sbi->s_journal); |
4081 | sbi->s_journal = NULL; | 4129 | sbi->s_journal = NULL; |
4082 | } | 4130 | } |
4083 | failed_mount3: | 4131 | failed_mount3: |
4084 | ext4_es_unregister_shrinker(sb); | 4132 | ext4_es_unregister_shrinker(sbi); |
4085 | del_timer(&sbi->s_err_report); | 4133 | del_timer(&sbi->s_err_report); |
4086 | if (sbi->s_flex_groups) | 4134 | if (sbi->s_flex_groups) |
4087 | ext4_kvfree(sbi->s_flex_groups); | 4135 | ext4_kvfree(sbi->s_flex_groups); |
@@ -4517,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | |||
4517 | { | 4565 | { |
4518 | int ret = 0; | 4566 | int ret = 0; |
4519 | tid_t target; | 4567 | tid_t target; |
4568 | bool needs_barrier = false; | ||
4520 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4569 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4521 | 4570 | ||
4522 | trace_ext4_sync_fs(sb, wait); | 4571 | trace_ext4_sync_fs(sb, wait); |
4523 | flush_workqueue(sbi->dio_unwritten_wq); | 4572 | flush_workqueue(sbi->rsv_conversion_wq); |
4573 | flush_workqueue(sbi->unrsv_conversion_wq); | ||
4524 | /* | 4574 | /* |
4525 | * Writeback quota in non-journalled quota case - journalled quota has | 4575 | * Writeback quota in non-journalled quota case - journalled quota has |
4526 | * no dirty dquots | 4576 | * no dirty dquots |
4527 | */ | 4577 | */ |
4528 | dquot_writeback_dquots(sb, -1); | 4578 | dquot_writeback_dquots(sb, -1); |
4579 | /* | ||
4580 | * Data writeback is possible w/o journal transaction, so barrier must | ||
4581 | * be sent at the end of the function. But we can skip it if | ||
4582 | * transaction_commit will do it for us. | ||
4583 | */ | ||
4584 | target = jbd2_get_latest_transaction(sbi->s_journal); | ||
4585 | if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && | ||
4586 | !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) | ||
4587 | needs_barrier = true; | ||
4588 | |||
4529 | if (jbd2_journal_start_commit(sbi->s_journal, &target)) { | 4589 | if (jbd2_journal_start_commit(sbi->s_journal, &target)) { |
4530 | if (wait) | 4590 | if (wait) |
4531 | jbd2_log_wait_commit(sbi->s_journal, target); | 4591 | ret = jbd2_log_wait_commit(sbi->s_journal, target); |
4592 | } | ||
4593 | if (needs_barrier) { | ||
4594 | int err; | ||
4595 | err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); | ||
4596 | if (!ret) | ||
4597 | ret = err; | ||
4532 | } | 4598 | } |
4599 | |||
4600 | return ret; | ||
4601 | } | ||
4602 | |||
4603 | static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) | ||
4604 | { | ||
4605 | int ret = 0; | ||
4606 | |||
4607 | trace_ext4_sync_fs(sb, wait); | ||
4608 | flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); | ||
4609 | flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); | ||
4610 | dquot_writeback_dquots(sb, -1); | ||
4611 | if (wait && test_opt(sb, BARRIER)) | ||
4612 | ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); | ||
4613 | |||
4533 | return ret; | 4614 | return ret; |
4534 | } | 4615 | } |
4535 | 4616 | ||
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 91ff93b0b0f4..ce11d9a92aed 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c | |||
@@ -698,7 +698,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, | |||
698 | get_data_block_ro); | 698 | get_data_block_ro); |
699 | } | 699 | } |
700 | 700 | ||
701 | static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) | 701 | static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, |
702 | unsigned int length) | ||
702 | { | 703 | { |
703 | struct inode *inode = page->mapping->host; | 704 | struct inode *inode = page->mapping->host; |
704 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 705 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3df43b4efd89..74f3c7b03eb2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c | |||
@@ -1205,7 +1205,8 @@ static int f2fs_set_node_page_dirty(struct page *page) | |||
1205 | return 0; | 1205 | return 0; |
1206 | } | 1206 | } |
1207 | 1207 | ||
1208 | static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) | 1208 | static void f2fs_invalidate_node_page(struct page *page, unsigned int offset, |
1209 | unsigned int length) | ||
1209 | { | 1210 | { |
1210 | struct inode *inode = page->mapping->host; | 1211 | struct inode *inode = page->mapping->host; |
1211 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 1212 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 0bad69ed6336..ee48ad37d9c0 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
@@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page, | |||
110 | /* Is the page fully outside i_size? (truncate in progress) */ | 110 | /* Is the page fully outside i_size? (truncate in progress) */ |
111 | offset = i_size & (PAGE_CACHE_SIZE-1); | 111 | offset = i_size & (PAGE_CACHE_SIZE-1); |
112 | if (page->index > end_index || (page->index == end_index && !offset)) { | 112 | if (page->index > end_index || (page->index == end_index && !offset)) { |
113 | page->mapping->a_ops->invalidatepage(page, 0); | 113 | page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); |
114 | goto out; | 114 | goto out; |
115 | } | 115 | } |
116 | return 1; | 116 | return 1; |
@@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, | |||
299 | 299 | ||
300 | /* Is the page fully outside i_size? (truncate in progress) */ | 300 | /* Is the page fully outside i_size? (truncate in progress) */ |
301 | if (page->index > end_index || (page->index == end_index && !offset)) { | 301 | if (page->index > end_index || (page->index == end_index && !offset)) { |
302 | page->mapping->a_ops->invalidatepage(page, 0); | 302 | page->mapping->a_ops->invalidatepage(page, 0, |
303 | PAGE_CACHE_SIZE); | ||
303 | unlock_page(page); | 304 | unlock_page(page); |
304 | continue; | 305 | continue; |
305 | } | 306 | } |
@@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) | |||
943 | unlock_buffer(bh); | 944 | unlock_buffer(bh); |
944 | } | 945 | } |
945 | 946 | ||
946 | static void gfs2_invalidatepage(struct page *page, unsigned long offset) | 947 | static void gfs2_invalidatepage(struct page *page, unsigned int offset, |
948 | unsigned int length) | ||
947 | { | 949 | { |
948 | struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); | 950 | struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); |
951 | unsigned int stop = offset + length; | ||
952 | int partial_page = (offset || length < PAGE_CACHE_SIZE); | ||
949 | struct buffer_head *bh, *head; | 953 | struct buffer_head *bh, *head; |
950 | unsigned long pos = 0; | 954 | unsigned long pos = 0; |
951 | 955 | ||
952 | BUG_ON(!PageLocked(page)); | 956 | BUG_ON(!PageLocked(page)); |
953 | if (offset == 0) | 957 | if (!partial_page) |
954 | ClearPageChecked(page); | 958 | ClearPageChecked(page); |
955 | if (!page_has_buffers(page)) | 959 | if (!page_has_buffers(page)) |
956 | goto out; | 960 | goto out; |
957 | 961 | ||
958 | bh = head = page_buffers(page); | 962 | bh = head = page_buffers(page); |
959 | do { | 963 | do { |
964 | if (pos + bh->b_size > stop) | ||
965 | return; | ||
966 | |||
960 | if (offset <= pos) | 967 | if (offset <= pos) |
961 | gfs2_discard(sdp, bh); | 968 | gfs2_discard(sdp, bh); |
962 | pos += bh->b_size; | 969 | pos += bh->b_size; |
963 | bh = bh->b_this_page; | 970 | bh = bh->b_this_page; |
964 | } while (bh != head); | 971 | } while (bh != head); |
965 | out: | 972 | out: |
966 | if (offset == 0) | 973 | if (!partial_page) |
967 | try_to_release_page(page, 0); | 974 | try_to_release_page(page, 0); |
968 | } | 975 | } |
969 | 976 | ||
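
The gfs2, f2fs, jbd and jbd2 hunks in this patch all move ->invalidatepage() from a bare offset to an (offset, length) pair: buffers beyond offset + length must now survive, and whole-page release only happens when the range covers the full page. A userspace sketch of the new walk, modeled on the gfs2 loop above (buffer sizes and the discard action are illustrative):

#include <stdio.h>

#define PAGE_SIZE 4096

struct buf { unsigned int size; };

static void invalidate_range(struct buf *bufs, int nbufs,
			     unsigned int offset, unsigned int length)
{
	unsigned int stop = offset + length;
	unsigned int pos = 0;
	int partial_page = (offset || length < PAGE_SIZE);

	for (int i = 0; i < nbufs; i++) {
		if (pos + bufs[i].size > stop)
			return;			/* rest of the page survives */
		if (offset <= pos)
			printf("discard buffer at %u\n", pos);
		pos += bufs[i].size;
	}
	if (!partial_page)
		printf("whole page gone, release it\n");
}

int main(void)
{
	struct buf bufs[4] = { {1024}, {1024}, {1024}, {1024} };

	invalidate_range(bufs, 4, 1024, 2048);	/* middle of the page only */
	return 0;
}
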
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index e3e255c0a509..be0c39b66fe0 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c | |||
@@ -2019,16 +2019,20 @@ zap_buffer_unlocked: | |||
2019 | * void journal_invalidatepage() - invalidate a journal page | 2019 | * void journal_invalidatepage() - invalidate a journal page |
2020 | * @journal: journal to use for flush | 2020 | * @journal: journal to use for flush |
2021 | * @page: page to flush | 2021 | * @page: page to flush |
2022 | * @offset: length of page to invalidate. | 2022 | * @offset: offset of the range to invalidate |
2023 | * @length: length of the range to invalidate | ||
2023 | * | 2024 | * |
2024 | * Reap page buffers containing data after offset in page. | 2025 | * Reap page buffers containing data in the specified range of the page. |
2025 | */ | 2026 | */ |
2026 | void journal_invalidatepage(journal_t *journal, | 2027 | void journal_invalidatepage(journal_t *journal, |
2027 | struct page *page, | 2028 | struct page *page, |
2028 | unsigned long offset) | 2029 | unsigned int offset, |
2030 | unsigned int length) | ||
2029 | { | 2031 | { |
2030 | struct buffer_head *head, *bh, *next; | 2032 | struct buffer_head *head, *bh, *next; |
2033 | unsigned int stop = offset + length; | ||
2031 | unsigned int curr_off = 0; | 2034 | unsigned int curr_off = 0; |
2035 | int partial_page = (offset || length < PAGE_CACHE_SIZE); | ||
2032 | int may_free = 1; | 2036 | int may_free = 1; |
2033 | 2037 | ||
2034 | if (!PageLocked(page)) | 2038 | if (!PageLocked(page)) |
@@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal, | |||
2036 | if (!page_has_buffers(page)) | 2040 | if (!page_has_buffers(page)) |
2037 | return; | 2041 | return; |
2038 | 2042 | ||
2043 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | ||
2044 | |||
2039 | /* We will potentially be playing with lists other than just the | 2045 | /* We will potentially be playing with lists other than just the |
2040 | * data lists (especially for journaled data mode), so be | 2046 | * data lists (especially for journaled data mode), so be |
2041 | * cautious in our locking. */ | 2047 | * cautious in our locking. */ |
@@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal, | |||
2045 | unsigned int next_off = curr_off + bh->b_size; | 2051 | unsigned int next_off = curr_off + bh->b_size; |
2046 | next = bh->b_this_page; | 2052 | next = bh->b_this_page; |
2047 | 2053 | ||
2054 | if (next_off > stop) | ||
2055 | return; | ||
2056 | |||
2048 | if (offset <= curr_off) { | 2057 | if (offset <= curr_off) { |
2049 | /* This block is wholly outside the truncation point */ | 2058 | /* This block is wholly outside the truncation point */ |
2050 | lock_buffer(bh); | 2059 | lock_buffer(bh); |
2051 | may_free &= journal_unmap_buffer(journal, bh, | 2060 | may_free &= journal_unmap_buffer(journal, bh, |
2052 | offset > 0); | 2061 | partial_page); |
2053 | unlock_buffer(bh); | 2062 | unlock_buffer(bh); |
2054 | } | 2063 | } |
2055 | curr_off = next_off; | 2064 | curr_off = next_off; |
@@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal, | |||
2057 | 2066 | ||
2058 | } while (bh != head); | 2067 | } while (bh != head); |
2059 | 2068 | ||
2060 | if (!offset) { | 2069 | if (!partial_page) { |
2061 | if (may_free && try_to_free_buffers(page)) | 2070 | if (may_free && try_to_free_buffers(page)) |
2062 | J_ASSERT(!page_has_buffers(page)); | 2071 | J_ASSERT(!page_has_buffers(page)); |
2063 | } | 2072 | } |
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index 69a48c2944da..5a9f5534d57b 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig | |||
@@ -20,7 +20,7 @@ config JBD2 | |||
20 | 20 | ||
21 | config JBD2_DEBUG | 21 | config JBD2_DEBUG |
22 | bool "JBD2 (ext4) debugging support" | 22 | bool "JBD2 (ext4) debugging support" |
23 | depends on JBD2 && DEBUG_FS | 23 | depends on JBD2 |
24 | help | 24 | help |
25 | If you are using the ext4 journaled file system (or | 25 | If you are using the ext4 journaled file system (or |
26 | potentially any other filesystem/device using JBD2), this option | 26 | potentially any other filesystem/device using JBD2), this option |
@@ -29,7 +29,7 @@ config JBD2_DEBUG | |||
29 | By default, the debugging output will be turned off. | 29 | By default, the debugging output will be turned off. |
30 | 30 | ||
31 | If you select Y here, then you will be able to turn on debugging | 31 | If you select Y here, then you will be able to turn on debugging |
32 | with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a | 32 | with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a |
33 | number between 1 and 5. The higher the number, the more debugging | 33 | number between 1 and 5. The higher the number, the more debugging |
34 | output is generated. To turn debugging off again, do | 34 | output is generated. To turn debugging off again, do |
35 | "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". | 35 | "echo 0 > /sys/module/jbd2/parameters/jbd2_debug". |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c78841ee81cf..7f34f4716165 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
120 | int nblocks, space_left; | 120 | int nblocks, space_left; |
121 | /* assert_spin_locked(&journal->j_state_lock); */ | 121 | /* assert_spin_locked(&journal->j_state_lock); */ |
122 | 122 | ||
123 | nblocks = jbd_space_needed(journal); | 123 | nblocks = jbd2_space_needed(journal); |
124 | while (__jbd2_log_space_left(journal) < nblocks) { | 124 | while (jbd2_log_space_left(journal) < nblocks) { |
125 | if (journal->j_flags & JBD2_ABORT) | 125 | if (journal->j_flags & JBD2_ABORT) |
126 | return; | 126 | return; |
127 | write_unlock(&journal->j_state_lock); | 127 | write_unlock(&journal->j_state_lock); |
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
140 | */ | 140 | */ |
141 | write_lock(&journal->j_state_lock); | 141 | write_lock(&journal->j_state_lock); |
142 | spin_lock(&journal->j_list_lock); | 142 | spin_lock(&journal->j_list_lock); |
143 | nblocks = jbd_space_needed(journal); | 143 | nblocks = jbd2_space_needed(journal); |
144 | space_left = __jbd2_log_space_left(journal); | 144 | space_left = jbd2_log_space_left(journal); |
145 | if (space_left < nblocks) { | 145 | if (space_left < nblocks) { |
146 | int chkpt = journal->j_checkpoint_transactions != NULL; | 146 | int chkpt = journal->j_checkpoint_transactions != NULL; |
147 | tid_t tid = 0; | 147 | tid_t tid = 0; |
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
156 | /* We were able to recover space; yay! */ | 156 | /* We were able to recover space; yay! */ |
157 | ; | 157 | ; |
158 | } else if (tid) { | 158 | } else if (tid) { |
159 | /* | ||
160 | * jbd2_journal_commit_transaction() may want | ||
161 | * to take the checkpoint_mutex if JBD2_FLUSHED | ||
162 | * is set. So we need to temporarily drop it. | ||
163 | */ | ||
164 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
159 | jbd2_log_wait_commit(journal, tid); | 165 | jbd2_log_wait_commit(journal, tid); |
166 | write_lock(&journal->j_state_lock); | ||
167 | continue; | ||
160 | } else { | 168 | } else { |
161 | printk(KERN_ERR "%s: needed %d blocks and " | 169 | printk(KERN_ERR "%s: needed %d blocks and " |
162 | "only had %d space available\n", | 170 | "only had %d space available\n", |
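
The checkpoint fix above drops j_checkpoint_mutex before jbd2_log_wait_commit(), because the committing thread may itself need that mutex when JBD2_FLUSHED is set; the caller then retakes the state lock and re-evaluates in the loop. A generic userspace sketch of this drop-before-blocking pattern (pthread names stand in for the kernel locks, and space_left() for jbd2_log_space_left()):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t checkpoint_mutex = PTHREAD_MUTEX_INITIALIZER;

static void wait_for_commit(void)
{
	/* May take checkpoint_mutex internally, so the caller must not
	 * hold it here (deadlock otherwise). */
}

static void wait_for_space(int (*space_left)(void), int needed)
{
	pthread_mutex_lock(&checkpoint_mutex);
	while (space_left() < needed) {
		pthread_mutex_unlock(&checkpoint_mutex);  /* drop before waiting */
		wait_for_commit();
		pthread_mutex_lock(&checkpoint_mutex);    /* retake, re-check */
	}
	pthread_mutex_unlock(&checkpoint_mutex);
}

static int space(void) { static int s = 1; return s += 2; }

int main(void)
{
	wait_for_space(space, 5);
	printf("enough log space\n");
	return 0;
}
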
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) | |||
625 | 633 | ||
626 | __jbd2_journal_drop_transaction(journal, transaction); | 634 | __jbd2_journal_drop_transaction(journal, transaction); |
627 | jbd2_journal_free_transaction(transaction); | 635 | jbd2_journal_free_transaction(transaction); |
628 | |||
629 | /* Just in case anybody was waiting for more transactions to be | ||
630 | checkpointed... */ | ||
631 | wake_up(&journal->j_wait_logspace); | ||
632 | ret = 1; | 636 | ret = 1; |
633 | out: | 637 | out: |
634 | return ret; | 638 | return ret; |
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact | |||
690 | J_ASSERT(transaction->t_state == T_FINISHED); | 694 | J_ASSERT(transaction->t_state == T_FINISHED); |
691 | J_ASSERT(transaction->t_buffers == NULL); | 695 | J_ASSERT(transaction->t_buffers == NULL); |
692 | J_ASSERT(transaction->t_forget == NULL); | 696 | J_ASSERT(transaction->t_forget == NULL); |
693 | J_ASSERT(transaction->t_iobuf_list == NULL); | ||
694 | J_ASSERT(transaction->t_shadow_list == NULL); | 697 | J_ASSERT(transaction->t_shadow_list == NULL); |
695 | J_ASSERT(transaction->t_log_list == NULL); | ||
696 | J_ASSERT(transaction->t_checkpoint_list == NULL); | 698 | J_ASSERT(transaction->t_checkpoint_list == NULL); |
697 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); | 699 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); |
698 | J_ASSERT(atomic_read(&transaction->t_updates) == 0); | 700 | J_ASSERT(atomic_read(&transaction->t_updates) == 0); |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0f53946f13c1..559bec1a37b4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -30,15 +30,22 @@ | |||
30 | #include <trace/events/jbd2.h> | 30 | #include <trace/events/jbd2.h> |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * Default IO end handler for temporary BJ_IO buffer_heads. | 33 | * IO end handler for temporary buffer_heads handling writes to the journal. |
34 | */ | 34 | */ |
35 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | 35 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
36 | { | 36 | { |
37 | struct buffer_head *orig_bh = bh->b_private; | ||
38 | |||
37 | BUFFER_TRACE(bh, ""); | 39 | BUFFER_TRACE(bh, ""); |
38 | if (uptodate) | 40 | if (uptodate) |
39 | set_buffer_uptodate(bh); | 41 | set_buffer_uptodate(bh); |
40 | else | 42 | else |
41 | clear_buffer_uptodate(bh); | 43 | clear_buffer_uptodate(bh); |
44 | if (orig_bh) { | ||
45 | clear_bit_unlock(BH_Shadow, &orig_bh->b_state); | ||
46 | smp_mb__after_clear_bit(); | ||
47 | wake_up_bit(&orig_bh->b_state, BH_Shadow); | ||
48 | } | ||
42 | unlock_buffer(bh); | 49 | unlock_buffer(bh); |
43 | } | 50 | } |
44 | 51 | ||
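
With the new b_private back-pointer above, journal I/O completion clears the shadow flag on the original buffer and wakes any thread sleeping on it. A userspace model of that wake/wait handshake, using a condition variable where the kernel uses clear_bit_unlock()/wake_up_bit() and a wait-on-bit on the waiter's side:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int shadow = 1;			/* models BH_Shadow */

static void *io_completion(void *arg)
{
	pthread_mutex_lock(&lock);
	shadow = 0;			/* clear_bit_unlock(BH_Shadow) */
	pthread_cond_broadcast(&cond);	/* wake_up_bit() */
	pthread_mutex_unlock(&lock);
	return arg;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, io_completion, NULL);
	pthread_mutex_lock(&lock);
	while (shadow)			/* wait_on_bit(BH_Shadow) */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	printf("shadow cleared, buffer reusable\n");
	return 0;
}
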
@@ -85,8 +92,7 @@ nope: | |||
85 | __brelse(bh); | 92 | __brelse(bh); |
86 | } | 93 | } |
87 | 94 | ||
88 | static void jbd2_commit_block_csum_set(journal_t *j, | 95 | static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) |
89 | struct journal_head *descriptor) | ||
90 | { | 96 | { |
91 | struct commit_header *h; | 97 | struct commit_header *h; |
92 | __u32 csum; | 98 | __u32 csum; |
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j, | |||
94 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 100 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
95 | return; | 101 | return; |
96 | 102 | ||
97 | h = (struct commit_header *)(jh2bh(descriptor)->b_data); | 103 | h = (struct commit_header *)(bh->b_data); |
98 | h->h_chksum_type = 0; | 104 | h->h_chksum_type = 0; |
99 | h->h_chksum_size = 0; | 105 | h->h_chksum_size = 0; |
100 | h->h_chksum[0] = 0; | 106 | h->h_chksum[0] = 0; |
101 | csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, | 107 | csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); |
102 | j->j_blocksize); | ||
103 | h->h_chksum[0] = cpu_to_be32(csum); | 108 | h->h_chksum[0] = cpu_to_be32(csum); |
104 | } | 109 | } |
105 | 110 | ||
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal, | |||
116 | struct buffer_head **cbh, | 121 | struct buffer_head **cbh, |
117 | __u32 crc32_sum) | 122 | __u32 crc32_sum) |
118 | { | 123 | { |
119 | struct journal_head *descriptor; | ||
120 | struct commit_header *tmp; | 124 | struct commit_header *tmp; |
121 | struct buffer_head *bh; | 125 | struct buffer_head *bh; |
122 | int ret; | 126 | int ret; |
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal, | |||
127 | if (is_journal_aborted(journal)) | 131 | if (is_journal_aborted(journal)) |
128 | return 0; | 132 | return 0; |
129 | 133 | ||
130 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 134 | bh = jbd2_journal_get_descriptor_buffer(journal); |
131 | if (!descriptor) | 135 | if (!bh) |
132 | return 1; | 136 | return 1; |
133 | 137 | ||
134 | bh = jh2bh(descriptor); | ||
135 | |||
136 | tmp = (struct commit_header *)bh->b_data; | 138 | tmp = (struct commit_header *)bh->b_data; |
137 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 139 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
138 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 140 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal, | |||
146 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; | 148 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; |
147 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); | 149 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); |
148 | } | 150 | } |
149 | jbd2_commit_block_csum_set(journal, descriptor); | 151 | jbd2_commit_block_csum_set(journal, bh); |
150 | 152 | ||
151 | JBUFFER_TRACE(descriptor, "submit commit block"); | 153 | BUFFER_TRACE(bh, "submit commit block"); |
152 | lock_buffer(bh); | 154 | lock_buffer(bh); |
153 | clear_buffer_dirty(bh); | 155 | clear_buffer_dirty(bh); |
154 | set_buffer_uptodate(bh); | 156 | set_buffer_uptodate(bh); |
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal, | |||
180 | if (unlikely(!buffer_uptodate(bh))) | 182 | if (unlikely(!buffer_uptodate(bh))) |
181 | ret = -EIO; | 183 | ret = -EIO; |
182 | put_bh(bh); /* One for getblk() */ | 184 | put_bh(bh); /* One for getblk() */ |
183 | jbd2_journal_put_journal_head(bh2jh(bh)); | ||
184 | 185 | ||
185 | return ret; | 186 | return ret; |
186 | } | 187 | } |
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | |||
321 | } | 322 | } |
322 | 323 | ||
323 | static void jbd2_descr_block_csum_set(journal_t *j, | 324 | static void jbd2_descr_block_csum_set(journal_t *j, |
324 | struct journal_head *descriptor) | 325 | struct buffer_head *bh) |
325 | { | 326 | { |
326 | struct jbd2_journal_block_tail *tail; | 327 | struct jbd2_journal_block_tail *tail; |
327 | __u32 csum; | 328 | __u32 csum; |
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j, | |||
329 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 330 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
330 | return; | 331 | return; |
331 | 332 | ||
332 | tail = (struct jbd2_journal_block_tail *) | 333 | tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - |
333 | (jh2bh(descriptor)->b_data + j->j_blocksize - | ||
334 | sizeof(struct jbd2_journal_block_tail)); | 334 | sizeof(struct jbd2_journal_block_tail)); |
335 | tail->t_checksum = 0; | 335 | tail->t_checksum = 0; |
336 | csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, | 336 | csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); |
337 | j->j_blocksize); | ||
338 | tail->t_checksum = cpu_to_be32(csum); | 337 | tail->t_checksum = cpu_to_be32(csum); |
339 | } | 338 | } |
340 | 339 | ||
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, | |||
343 | { | 342 | { |
344 | struct page *page = bh->b_page; | 343 | struct page *page = bh->b_page; |
345 | __u8 *addr; | 344 | __u8 *addr; |
346 | __u32 csum; | 345 | __u32 csum32; |
347 | 346 | ||
348 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 347 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
349 | return; | 348 | return; |
350 | 349 | ||
351 | sequence = cpu_to_be32(sequence); | 350 | sequence = cpu_to_be32(sequence); |
352 | addr = kmap_atomic(page); | 351 | addr = kmap_atomic(page); |
353 | csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, | 352 | csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, |
354 | sizeof(sequence)); | 353 | sizeof(sequence)); |
355 | csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), | 354 | csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), |
356 | bh->b_size); | 355 | bh->b_size); |
357 | kunmap_atomic(addr); | 356 | kunmap_atomic(addr); |
358 | 357 | ||
359 | tag->t_checksum = cpu_to_be32(csum); | 358 | /* We only have space to store the lower 16 bits of the crc32c. */ |
359 | tag->t_checksum = cpu_to_be16(csum32); | ||
360 | } | 360 | } |
361 | /* | 361 | /* |
362 | * jbd2_journal_commit_transaction | 362 | * jbd2_journal_commit_transaction |
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
368 | { | 368 | { |
369 | struct transaction_stats_s stats; | 369 | struct transaction_stats_s stats; |
370 | transaction_t *commit_transaction; | 370 | transaction_t *commit_transaction; |
371 | struct journal_head *jh, *new_jh, *descriptor; | 371 | struct journal_head *jh; |
372 | struct buffer_head *descriptor; | ||
372 | struct buffer_head **wbuf = journal->j_wbuf; | 373 | struct buffer_head **wbuf = journal->j_wbuf; |
373 | int bufs; | 374 | int bufs; |
374 | int flags; | 375 | int flags; |
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
392 | tid_t first_tid; | 393 | tid_t first_tid; |
393 | int update_tail; | 394 | int update_tail; |
394 | int csum_size = 0; | 395 | int csum_size = 0; |
396 | LIST_HEAD(io_bufs); | ||
397 | LIST_HEAD(log_bufs); | ||
395 | 398 | ||
396 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 399 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
397 | csum_size = sizeof(struct jbd2_journal_block_tail); | 400 | csum_size = sizeof(struct jbd2_journal_block_tail); |
@@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
424 | J_ASSERT(journal->j_committing_transaction == NULL); | 427 | J_ASSERT(journal->j_committing_transaction == NULL); |
425 | 428 | ||
426 | commit_transaction = journal->j_running_transaction; | 429 | commit_transaction = journal->j_running_transaction; |
427 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | ||
428 | 430 | ||
429 | trace_jbd2_start_commit(journal, commit_transaction); | 431 | trace_jbd2_start_commit(journal, commit_transaction); |
430 | jbd_debug(1, "JBD2: starting commit of transaction %d\n", | 432 | jbd_debug(1, "JBD2: starting commit of transaction %d\n", |
431 | commit_transaction->t_tid); | 433 | commit_transaction->t_tid); |
432 | 434 | ||
433 | write_lock(&journal->j_state_lock); | 435 | write_lock(&journal->j_state_lock); |
436 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | ||
434 | commit_transaction->t_state = T_LOCKED; | 437 | commit_transaction->t_state = T_LOCKED; |
435 | 438 | ||
436 | trace_jbd2_commit_locking(journal, commit_transaction); | 439 | trace_jbd2_commit_locking(journal, commit_transaction); |
@@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
520 | */ | 523 | */ |
521 | jbd2_journal_switch_revoke_table(journal); | 524 | jbd2_journal_switch_revoke_table(journal); |
522 | 525 | ||
526 | /* | ||
527 | * Reserved credits cannot be claimed anymore, free them | ||
528 | */ | ||
529 | atomic_sub(atomic_read(&journal->j_reserved_credits), | ||
530 | &commit_transaction->t_outstanding_credits); | ||
531 | |||
523 | trace_jbd2_commit_flushing(journal, commit_transaction); | 532 | trace_jbd2_commit_flushing(journal, commit_transaction); |
524 | stats.run.rs_flushing = jiffies; | 533 | stats.run.rs_flushing = jiffies; |
525 | stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, | 534 | stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, |
@@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
533 | wake_up(&journal->j_wait_transaction_locked); | 542 | wake_up(&journal->j_wait_transaction_locked); |
534 | write_unlock(&journal->j_state_lock); | 543 | write_unlock(&journal->j_state_lock); |
535 | 544 | ||
536 | jbd_debug(3, "JBD2: commit phase 2\n"); | 545 | jbd_debug(3, "JBD2: commit phase 2a\n"); |
537 | 546 | ||
538 | /* | 547 | /* |
539 | * Now start flushing things to disk, in the order they appear | 548 | * Now start flushing things to disk, in the order they appear |
@@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
545 | 554 | ||
546 | blk_start_plug(&plug); | 555 | blk_start_plug(&plug); |
547 | jbd2_journal_write_revoke_records(journal, commit_transaction, | 556 | jbd2_journal_write_revoke_records(journal, commit_transaction, |
548 | WRITE_SYNC); | 557 | &log_bufs, WRITE_SYNC); |
549 | blk_finish_plug(&plug); | 558 | blk_finish_plug(&plug); |
550 | 559 | ||
551 | jbd_debug(3, "JBD2: commit phase 2\n"); | 560 | jbd_debug(3, "JBD2: commit phase 2b\n"); |
552 | 561 | ||
553 | /* | 562 | /* |
554 | * Way to go: we have now written out all of the data for a | 563 | * Way to go: we have now written out all of the data for a |
@@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
571 | atomic_read(&commit_transaction->t_outstanding_credits)); | 580 | atomic_read(&commit_transaction->t_outstanding_credits)); |
572 | 581 | ||
573 | err = 0; | 582 | err = 0; |
574 | descriptor = NULL; | ||
575 | bufs = 0; | 583 | bufs = 0; |
584 | descriptor = NULL; | ||
576 | blk_start_plug(&plug); | 585 | blk_start_plug(&plug); |
577 | while (commit_transaction->t_buffers) { | 586 | while (commit_transaction->t_buffers) { |
578 | 587 | ||
@@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
604 | record the metadata buffer. */ | 613 | record the metadata buffer. */ |
605 | 614 | ||
606 | if (!descriptor) { | 615 | if (!descriptor) { |
607 | struct buffer_head *bh; | ||
608 | |||
609 | J_ASSERT (bufs == 0); | 616 | J_ASSERT (bufs == 0); |
610 | 617 | ||
611 | jbd_debug(4, "JBD2: get descriptor\n"); | 618 | jbd_debug(4, "JBD2: get descriptor\n"); |
@@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
616 | continue; | 623 | continue; |
617 | } | 624 | } |
618 | 625 | ||
619 | bh = jh2bh(descriptor); | ||
620 | jbd_debug(4, "JBD2: got buffer %llu (%p)\n", | 626 | jbd_debug(4, "JBD2: got buffer %llu (%p)\n", |
621 | (unsigned long long)bh->b_blocknr, bh->b_data); | 627 | (unsigned long long)descriptor->b_blocknr, |
622 | header = (journal_header_t *)&bh->b_data[0]; | 628 | descriptor->b_data); |
629 | header = (journal_header_t *)descriptor->b_data; | ||
623 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 630 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
624 | header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); | 631 | header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); |
625 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 632 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
626 | 633 | ||
627 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 634 | tagp = &descriptor->b_data[sizeof(journal_header_t)]; |
628 | space_left = bh->b_size - sizeof(journal_header_t); | 635 | space_left = descriptor->b_size - |
636 | sizeof(journal_header_t); | ||
629 | first_tag = 1; | 637 | first_tag = 1; |
630 | set_buffer_jwrite(bh); | 638 | set_buffer_jwrite(descriptor); |
631 | set_buffer_dirty(bh); | 639 | set_buffer_dirty(descriptor); |
632 | wbuf[bufs++] = bh; | 640 | wbuf[bufs++] = descriptor; |
633 | 641 | ||
634 | /* Record it so that we can wait for IO | 642 | /* Record it so that we can wait for IO |
635 | completion later */ | 643 | completion later */ |
636 | BUFFER_TRACE(bh, "ph3: file as descriptor"); | 644 | BUFFER_TRACE(descriptor, "ph3: file as descriptor"); |
637 | jbd2_journal_file_buffer(descriptor, commit_transaction, | 645 | jbd2_file_log_bh(&log_bufs, descriptor); |
638 | BJ_LogCtl); | ||
639 | } | 646 | } |
640 | 647 | ||
641 | /* Where is the buffer to be written? */ | 648 | /* Where is the buffer to be written? */ |
@@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
658 | 665 | ||
659 | /* Bump b_count to prevent truncate from stumbling over | 666 | /* Bump b_count to prevent truncate from stumbling over |
660 | the shadowed buffer! @@@ This can go if we ever get | 667 | the shadowed buffer! @@@ This can go if we ever get |
661 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ | 668 | rid of the shadow pairing of buffers. */ |
662 | atomic_inc(&jh2bh(jh)->b_count); | 669 | atomic_inc(&jh2bh(jh)->b_count); |
663 | 670 | ||
664 | /* Make a temporary IO buffer with which to write it out | ||
665 | (this will requeue both the metadata buffer and the | ||
666 | temporary IO buffer). new_bh goes on BJ_IO*/ | ||
667 | |||
668 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | ||
669 | /* | 671 | /* |
670 | * akpm: jbd2_journal_write_metadata_buffer() sets | 672 | * Make a temporary IO buffer with which to write it out |
671 | * new_bh->b_transaction to commit_transaction. | 673 | * (this will requeue the metadata buffer to BJ_Shadow). |
672 | * We need to clean this up before we release new_bh | ||
673 | * (which is of type BJ_IO) | ||
674 | */ | 674 | */ |
675 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | ||
675 | JBUFFER_TRACE(jh, "ph3: write metadata"); | 676 | JBUFFER_TRACE(jh, "ph3: write metadata"); |
676 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, | 677 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, |
677 | jh, &new_jh, blocknr); | 678 | jh, &wbuf[bufs], blocknr); |
678 | if (flags < 0) { | 679 | if (flags < 0) { |
679 | jbd2_journal_abort(journal, flags); | 680 | jbd2_journal_abort(journal, flags); |
680 | continue; | 681 | continue; |
681 | } | 682 | } |
682 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); | 683 | jbd2_file_log_bh(&io_bufs, wbuf[bufs]); |
683 | wbuf[bufs++] = jh2bh(new_jh); | ||
684 | 684 | ||
685 | /* Record the new block's tag in the current descriptor | 685 | /* Record the new block's tag in the current descriptor |
686 | buffer */ | 686 | buffer */ |
@@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
694 | tag = (journal_block_tag_t *) tagp; | 694 | tag = (journal_block_tag_t *) tagp; |
695 | write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); | 695 | write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); |
696 | tag->t_flags = cpu_to_be16(tag_flag); | 696 | tag->t_flags = cpu_to_be16(tag_flag); |
697 | jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), | 697 | jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], |
698 | commit_transaction->t_tid); | 698 | commit_transaction->t_tid); |
699 | tagp += tag_bytes; | 699 | tagp += tag_bytes; |
700 | space_left -= tag_bytes; | 700 | space_left -= tag_bytes; |
701 | bufs++; | ||
701 | 702 | ||
702 | if (first_tag) { | 703 | if (first_tag) { |
703 | memcpy (tagp, journal->j_uuid, 16); | 704 | memcpy (tagp, journal->j_uuid, 16); |
@@ -809,7 +810,7 @@ start_journal_io: | |||
809 | the log. Before we can commit it, wait for the IO so far to | 810 | the log. Before we can commit it, wait for the IO so far to |
810 | complete. Control buffers being written are on the | 811 | complete. Control buffers being written are on the |
811 | transaction's t_log_list queue, and metadata buffers are on | 812 | transaction's t_log_list queue, and metadata buffers are on |
812 | the t_iobuf_list queue. | 813 | the io_bufs list. |
813 | 814 | ||
814 | Wait for the buffers in reverse order. That way we are | 815 | Wait for the buffers in reverse order. That way we are |
815 | less likely to be woken up until all IOs have completed, and | 816 | less likely to be woken up until all IOs have completed, and |
@@ -818,47 +819,33 @@ start_journal_io: | |||
818 | 819 | ||
819 | jbd_debug(3, "JBD2: commit phase 3\n"); | 820 | jbd_debug(3, "JBD2: commit phase 3\n"); |
820 | 821 | ||
821 | /* | 822 | while (!list_empty(&io_bufs)) { |
822 | * akpm: these are BJ_IO, and j_list_lock is not needed. | 823 | struct buffer_head *bh = list_entry(io_bufs.prev, |
823 | * See __journal_try_to_free_buffer. | 824 | struct buffer_head, |
824 | */ | 825 | b_assoc_buffers); |
825 | wait_for_iobuf: | ||
826 | while (commit_transaction->t_iobuf_list != NULL) { | ||
827 | struct buffer_head *bh; | ||
828 | 826 | ||
829 | jh = commit_transaction->t_iobuf_list->b_tprev; | 827 | wait_on_buffer(bh); |
830 | bh = jh2bh(jh); | 828 | cond_resched(); |
831 | if (buffer_locked(bh)) { | ||
832 | wait_on_buffer(bh); | ||
833 | goto wait_for_iobuf; | ||
834 | } | ||
835 | if (cond_resched()) | ||
836 | goto wait_for_iobuf; | ||
837 | 829 | ||
838 | if (unlikely(!buffer_uptodate(bh))) | 830 | if (unlikely(!buffer_uptodate(bh))) |
839 | err = -EIO; | 831 | err = -EIO; |
840 | 832 | jbd2_unfile_log_bh(bh); | |
841 | clear_buffer_jwrite(bh); | ||
842 | |||
843 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); | ||
844 | jbd2_journal_unfile_buffer(journal, jh); | ||
845 | 833 | ||
846 | /* | 834 | /* |
847 | * ->t_iobuf_list should contain only dummy buffer_heads | 835 | * The list contains temporary buffer heads created by |
848 | * which were created by jbd2_journal_write_metadata_buffer(). | 836 | * jbd2_journal_write_metadata_buffer(). |
849 | */ | 837 | */ |
850 | BUFFER_TRACE(bh, "dumping temporary bh"); | 838 | BUFFER_TRACE(bh, "dumping temporary bh"); |
851 | jbd2_journal_put_journal_head(jh); | ||
852 | __brelse(bh); | 839 | __brelse(bh); |
853 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); | 840 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); |
854 | free_buffer_head(bh); | 841 | free_buffer_head(bh); |
855 | 842 | ||
856 | /* We also have to unlock and free the corresponding | 843 | /* We also have to refile the corresponding shadowed buffer */ |
857 | shadowed buffer */ | ||
858 | jh = commit_transaction->t_shadow_list->b_tprev; | 844 | jh = commit_transaction->t_shadow_list->b_tprev; |
859 | bh = jh2bh(jh); | 845 | bh = jh2bh(jh); |
860 | clear_bit(BH_JWrite, &bh->b_state); | 846 | clear_buffer_jwrite(bh); |
861 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | 847 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); |
848 | J_ASSERT_BH(bh, !buffer_shadow(bh)); | ||
862 | 849 | ||
863 | /* The metadata is now released for reuse, but we need | 850 | /* The metadata is now released for reuse, but we need |
864 | to remember it against this transaction so that when | 851 | to remember it against this transaction so that when |
@@ -866,14 +853,6 @@ wait_for_iobuf: | |||
866 | required. */ | 853 | required. */ |
867 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | 854 | JBUFFER_TRACE(jh, "file as BJ_Forget"); |
868 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); | 855 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); |
869 | /* | ||
870 | * Wake up any transactions which were waiting for this IO to | ||
871 | * complete. The barrier must be here so that changes by | ||
872 | * jbd2_journal_file_buffer() take effect before wake_up_bit() | ||
873 | * does the waitqueue check. | ||
874 | */ | ||
875 | smp_mb(); | ||
876 | wake_up_bit(&bh->b_state, BH_Unshadow); | ||
877 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | 856 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); |
878 | __brelse(bh); | 857 | __brelse(bh); |
879 | } | 858 | } |
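The io_bufs handling above depends on two list helpers this patch adds to include/linux/jbd2.h, outside this excerpt; modulo details they are thin wrappers around the b_assoc_buffers linkage (reusable here because these buffer heads are private to the journal and never attached to an address_space):

	static inline void jbd2_file_log_bh(struct list_head *head,
					    struct buffer_head *bh)
	{
		list_add_tail(&bh->b_assoc_buffers, head);
	}

	static inline void jbd2_unfile_log_bh(struct buffer_head *bh)
	{
		list_del_init(&bh->b_assoc_buffers);
	}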
@@ -883,26 +862,19 @@ wait_for_iobuf: | |||
883 | jbd_debug(3, "JBD2: commit phase 4\n"); | 862 | jbd_debug(3, "JBD2: commit phase 4\n"); |
884 | 863 | ||
885 | /* Here we wait for the revoke record and descriptor record buffers */ | 864 | /* Here we wait for the revoke record and descriptor record buffers */ |
886 | wait_for_ctlbuf: | 865 | while (!list_empty(&log_bufs)) { |
887 | while (commit_transaction->t_log_list != NULL) { | ||
888 | struct buffer_head *bh; | 866 | struct buffer_head *bh; |
889 | 867 | ||
890 | jh = commit_transaction->t_log_list->b_tprev; | 868 | bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); |
891 | bh = jh2bh(jh); | 869 | wait_on_buffer(bh); |
892 | if (buffer_locked(bh)) { | 870 | cond_resched(); |
893 | wait_on_buffer(bh); | ||
894 | goto wait_for_ctlbuf; | ||
895 | } | ||
896 | if (cond_resched()) | ||
897 | goto wait_for_ctlbuf; | ||
898 | 871 | ||
899 | if (unlikely(!buffer_uptodate(bh))) | 872 | if (unlikely(!buffer_uptodate(bh))) |
900 | err = -EIO; | 873 | err = -EIO; |
901 | 874 | ||
902 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); | 875 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); |
903 | clear_buffer_jwrite(bh); | 876 | clear_buffer_jwrite(bh); |
904 | jbd2_journal_unfile_buffer(journal, jh); | 877 | jbd2_unfile_log_bh(bh); |
905 | jbd2_journal_put_journal_head(jh); | ||
906 | __brelse(bh); /* One for getblk */ | 878 | __brelse(bh); /* One for getblk */ |
907 | /* AKPM: bforget here */ | 879 | /* AKPM: bforget here */ |
908 | } | 880 | } |
@@ -952,9 +924,7 @@ wait_for_iobuf: | |||
952 | J_ASSERT(list_empty(&commit_transaction->t_inode_list)); | 924 | J_ASSERT(list_empty(&commit_transaction->t_inode_list)); |
953 | J_ASSERT(commit_transaction->t_buffers == NULL); | 925 | J_ASSERT(commit_transaction->t_buffers == NULL); |
954 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | 926 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); |
955 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | ||
956 | J_ASSERT(commit_transaction->t_shadow_list == NULL); | 927 | J_ASSERT(commit_transaction->t_shadow_list == NULL); |
957 | J_ASSERT(commit_transaction->t_log_list == NULL); | ||
958 | 928 | ||
959 | restart_loop: | 929 | restart_loop: |
960 | /* | 930 | /* |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 95457576e434..02c7ad9d7a41 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache); | |||
103 | static void __journal_abort_soft (journal_t *journal, int errno); | 103 | static void __journal_abort_soft (journal_t *journal, int errno); |
104 | static int jbd2_journal_create_slab(size_t slab_size); | 104 | static int jbd2_journal_create_slab(size_t slab_size); |
105 | 105 | ||
106 | #ifdef CONFIG_JBD2_DEBUG | ||
107 | void __jbd2_debug(int level, const char *file, const char *func, | ||
108 | unsigned int line, const char *fmt, ...) | ||
109 | { | ||
110 | struct va_format vaf; | ||
111 | va_list args; | ||
112 | |||
113 | if (level > jbd2_journal_enable_debug) | ||
114 | return; | ||
115 | va_start(args, fmt); | ||
116 | vaf.fmt = fmt; | ||
117 | vaf.va = &args; | ||
118 | printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); | ||
119 | va_end(args); | ||
120 | } | ||
121 | EXPORT_SYMBOL(__jbd2_debug); | ||
122 | #endif | ||
123 | |||
106 | /* Checksumming functions */ | 124 | /* Checksumming functions */ |
107 | int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) | 125 | int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) |
108 | { | 126 | { |
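Every jbd_debug() call in this diff now funnels into this helper; the matching jbd2.h change (not shown in this excerpt) is essentially the one-liner:

	#define jbd_debug(n, fmt, a...) \
		__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)

so the file/function/line decoration and the format string are emitted in a single printk via %pV rather than in two separate calls.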
@@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal) | |||
310 | * | 328 | * |
311 | * If the source buffer has already been modified by a new transaction | 329 | * If the source buffer has already been modified by a new transaction |
312 | * since we took the last commit snapshot, we use the frozen copy of | 330 | * since we took the last commit snapshot, we use the frozen copy of |
313 | * that data for IO. If we end up using the existing buffer_head's data | 331 | * that data for IO. If we end up using the existing buffer_head's data |
314 | * for the write, then we *have* to lock the buffer to prevent anyone | 332 | * for the write, then we have to make sure nobody modifies it while the |
315 | * else from using and possibly modifying it while the IO is in | 333 | * IO is in progress. do_get_write_access() handles this. |
316 | * progress. | ||
317 | * | 334 | * |
318 | * The function returns a pointer to the buffer_heads to be used for IO. | 335 | * The function returns a pointer to the buffer_head to be used for IO. |
319 | * | 336 | * |
320 | * We assume that the journal has already been locked in this function. | ||
321 | * | 337 | * |
322 | * Return value: | 338 | * Return value: |
323 | * <0: Error | 339 | * <0: Error |
@@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal) | |||
330 | 346 | ||
331 | int jbd2_journal_write_metadata_buffer(transaction_t *transaction, | 347 | int jbd2_journal_write_metadata_buffer(transaction_t *transaction, |
332 | struct journal_head *jh_in, | 348 | struct journal_head *jh_in, |
333 | struct journal_head **jh_out, | 349 | struct buffer_head **bh_out, |
334 | unsigned long long blocknr) | 350 | sector_t blocknr) |
335 | { | 351 | { |
336 | int need_copy_out = 0; | 352 | int need_copy_out = 0; |
337 | int done_copy_out = 0; | 353 | int done_copy_out = 0; |
338 | int do_escape = 0; | 354 | int do_escape = 0; |
339 | char *mapped_data; | 355 | char *mapped_data; |
340 | struct buffer_head *new_bh; | 356 | struct buffer_head *new_bh; |
341 | struct journal_head *new_jh; | ||
342 | struct page *new_page; | 357 | struct page *new_page; |
343 | unsigned int new_offset; | 358 | unsigned int new_offset; |
344 | struct buffer_head *bh_in = jh2bh(jh_in); | 359 | struct buffer_head *bh_in = jh2bh(jh_in); |
@@ -368,14 +383,13 @@ retry_alloc: | |||
368 | 383 | ||
369 | /* keep subsequent assertions sane */ | 384 | /* keep subsequent assertions sane */ |
370 | atomic_set(&new_bh->b_count, 1); | 385 | atomic_set(&new_bh->b_count, 1); |
371 | new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ | ||
372 | 386 | ||
387 | jbd_lock_bh_state(bh_in); | ||
388 | repeat: | ||
373 | /* | 389 | /* |
374 | * If a new transaction has already done a buffer copy-out, then | 390 | * If a new transaction has already done a buffer copy-out, then |
375 | * we use that version of the data for the commit. | 391 | * we use that version of the data for the commit. |
376 | */ | 392 | */ |
377 | jbd_lock_bh_state(bh_in); | ||
378 | repeat: | ||
379 | if (jh_in->b_frozen_data) { | 393 | if (jh_in->b_frozen_data) { |
380 | done_copy_out = 1; | 394 | done_copy_out = 1; |
381 | new_page = virt_to_page(jh_in->b_frozen_data); | 395 | new_page = virt_to_page(jh_in->b_frozen_data); |
@@ -415,7 +429,7 @@ repeat: | |||
415 | jbd_unlock_bh_state(bh_in); | 429 | jbd_unlock_bh_state(bh_in); |
416 | tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); | 430 | tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); |
417 | if (!tmp) { | 431 | if (!tmp) { |
418 | jbd2_journal_put_journal_head(new_jh); | 432 | brelse(new_bh); |
419 | return -ENOMEM; | 433 | return -ENOMEM; |
420 | } | 434 | } |
421 | jbd_lock_bh_state(bh_in); | 435 | jbd_lock_bh_state(bh_in); |
@@ -426,7 +440,7 @@ repeat: | |||
426 | 440 | ||
427 | jh_in->b_frozen_data = tmp; | 441 | jh_in->b_frozen_data = tmp; |
428 | mapped_data = kmap_atomic(new_page); | 442 | mapped_data = kmap_atomic(new_page); |
429 | memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); | 443 | memcpy(tmp, mapped_data + new_offset, bh_in->b_size); |
430 | kunmap_atomic(mapped_data); | 444 | kunmap_atomic(mapped_data); |
431 | 445 | ||
432 | new_page = virt_to_page(tmp); | 446 | new_page = virt_to_page(tmp); |
@@ -452,14 +466,14 @@ repeat: | |||
452 | } | 466 | } |
453 | 467 | ||
454 | set_bh_page(new_bh, new_page, new_offset); | 468 | set_bh_page(new_bh, new_page, new_offset); |
455 | new_jh->b_transaction = NULL; | 469 | new_bh->b_size = bh_in->b_size; |
456 | new_bh->b_size = jh2bh(jh_in)->b_size; | 470 | new_bh->b_bdev = journal->j_dev; |
457 | new_bh->b_bdev = transaction->t_journal->j_dev; | ||
458 | new_bh->b_blocknr = blocknr; | 471 | new_bh->b_blocknr = blocknr; |
472 | new_bh->b_private = bh_in; | ||
459 | set_buffer_mapped(new_bh); | 473 | set_buffer_mapped(new_bh); |
460 | set_buffer_dirty(new_bh); | 474 | set_buffer_dirty(new_bh); |
461 | 475 | ||
462 | *jh_out = new_jh; | 476 | *bh_out = new_bh; |
463 | 477 | ||
464 | /* | 478 | /* |
465 | * The to-be-written buffer needs to get moved to the io queue, | 479 | * The to-be-written buffer needs to get moved to the io queue, |
@@ -470,11 +484,9 @@ repeat: | |||
470 | spin_lock(&journal->j_list_lock); | 484 | spin_lock(&journal->j_list_lock); |
471 | __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); | 485 | __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); |
472 | spin_unlock(&journal->j_list_lock); | 486 | spin_unlock(&journal->j_list_lock); |
487 | set_buffer_shadow(bh_in); | ||
473 | jbd_unlock_bh_state(bh_in); | 488 | jbd_unlock_bh_state(bh_in); |
474 | 489 | ||
475 | JBUFFER_TRACE(new_jh, "file as BJ_IO"); | ||
476 | jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); | ||
477 | |||
478 | return do_escape | (done_copy_out << 1); | 490 | return do_escape | (done_copy_out << 1); |
479 | } | 491 | } |
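As a usage sketch, the commit.c caller shown earlier consumes the return value bit by bit (flags < 0 is an error; the escape bit mirrors the JBD2_FLAG_ESCAPE handling in that loop, not all of which is quoted in this excerpt):

	flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						   jh, &wbuf[bufs], blocknr);
	if (flags < 0) {
		jbd2_journal_abort(journal, flags);
		continue;
	}
	if (flags & 1)		/* do_escape: data began with the magic number */
		tag_flag |= JBD2_FLAG_ESCAPE;
	/* flags & 2 (done_copy_out) only matters within this function */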
480 | 492 | ||
@@ -484,35 +496,6 @@ repeat: | |||
484 | */ | 496 | */ |
485 | 497 | ||
486 | /* | 498 | /* |
487 | * __jbd2_log_space_left: Return the number of free blocks left in the journal. | ||
488 | * | ||
489 | * Called with the journal already locked. | ||
490 | * | ||
491 | * Called under j_state_lock | ||
492 | */ | ||
493 | |||
494 | int __jbd2_log_space_left(journal_t *journal) | ||
495 | { | ||
496 | int left = journal->j_free; | ||
497 | |||
498 | /* assert_spin_locked(&journal->j_state_lock); */ | ||
499 | |||
500 | /* | ||
501 | * Be pessimistic here about the number of those free blocks which | ||
502 | * might be required for log descriptor control blocks. | ||
503 | */ | ||
504 | |||
505 | #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ | ||
506 | |||
507 | left -= MIN_LOG_RESERVED_BLOCKS; | ||
508 | |||
509 | if (left <= 0) | ||
510 | return 0; | ||
511 | left -= (left >> 3); | ||
512 | return left; | ||
513 | } | ||
514 | |||
515 | /* | ||
516 | * Called with j_state_lock locked for writing. | 499 | * Called with j_state_lock locked for writing. |
517 | * Returns true if a transaction commit was started. | 500 | * Returns true if a transaction commit was started. |
518 | */ | 501 | */ |
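__jbd2_log_space_left() does not disappear without a replacement: the series adds a static inline jbd2_log_space_left() to jbd2.h, which add_transaction_credits() in transaction.c (below) calls together with jbd2_space_needed(). A sketch of its likely shape; the constants and clamping here are assumptions, not quotes:

	static inline unsigned long jbd2_log_space_left(journal_t *journal)
	{
		/* Allow for rounding errors */
		unsigned long free = journal->j_free - 32;

		if (journal->j_committing_transaction) {
			unsigned long committing = atomic_read(&journal->
				j_committing_transaction->t_outstanding_credits);

			/* Transaction plus its control blocks */
			free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT);
		}
		return max_t(long, free, 0);
	}

The key difference from the removed helper is that the committing transaction's outstanding credits are accounted for explicitly rather than covered by a pessimistic fudge factor.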
@@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid) | |||
564 | } | 547 | } |
565 | 548 | ||
566 | /* | 549 | /* |
567 | * Force and wait upon a commit if the calling process is not within | 550 | * Force and wait upon any uncommitted transactions. We can only force the running |
568 | * transaction. This is used for forcing out undo-protected data which contains | 551 | * transaction if we don't have an active handle, otherwise, we will deadlock. |
569 | * bitmaps, when the fs is running out of space. | 552 | * Returns: <0 in case of error, |
570 | * | 553 | * 0 if nothing to commit, |
571 | * We can only force the running transaction if we don't have an active handle; | 554 | * 1 if transaction was successfully committed. |
572 | * otherwise, we will deadlock. | ||
573 | * | ||
574 | * Returns true if a transaction was started. | ||
575 | */ | 555 | */ |
576 | int jbd2_journal_force_commit_nested(journal_t *journal) | 556 | static int __jbd2_journal_force_commit(journal_t *journal) |
577 | { | 557 | { |
578 | transaction_t *transaction = NULL; | 558 | transaction_t *transaction = NULL; |
579 | tid_t tid; | 559 | tid_t tid; |
580 | int need_to_start = 0; | 560 | int need_to_start = 0, ret = 0; |
581 | 561 | ||
582 | read_lock(&journal->j_state_lock); | 562 | read_lock(&journal->j_state_lock); |
583 | if (journal->j_running_transaction && !current->journal_info) { | 563 | if (journal->j_running_transaction && !current->journal_info) { |
@@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal) | |||
588 | transaction = journal->j_committing_transaction; | 568 | transaction = journal->j_committing_transaction; |
589 | 569 | ||
590 | if (!transaction) { | 570 | if (!transaction) { |
571 | /* Nothing to commit */ | ||
591 | read_unlock(&journal->j_state_lock); | 572 | read_unlock(&journal->j_state_lock); |
592 | return 0; /* Nothing to retry */ | 573 | return 0; |
593 | } | 574 | } |
594 | |||
595 | tid = transaction->t_tid; | 575 | tid = transaction->t_tid; |
596 | read_unlock(&journal->j_state_lock); | 576 | read_unlock(&journal->j_state_lock); |
597 | if (need_to_start) | 577 | if (need_to_start) |
598 | jbd2_log_start_commit(journal, tid); | 578 | jbd2_log_start_commit(journal, tid); |
599 | jbd2_log_wait_commit(journal, tid); | 579 | ret = jbd2_log_wait_commit(journal, tid); |
600 | return 1; | 580 | if (!ret) |
581 | ret = 1; | ||
582 | |||
583 | return ret; | ||
584 | } | ||
585 | |||
586 | /** | ||
587 | * Force and wait upon a commit if the calling process is not within | ||
588 | * a transaction. This is used for forcing out undo-protected data which contains | ||
589 | * bitmaps, when the fs is running out of space. | ||
590 | * | ||
591 | * @journal: journal to force | ||
592 | * Returns true if progress was made. | ||
593 | */ | ||
594 | int jbd2_journal_force_commit_nested(journal_t *journal) | ||
595 | { | ||
596 | int ret; | ||
597 | |||
598 | ret = __jbd2_journal_force_commit(journal); | ||
599 | return ret > 0; | ||
600 | } | ||
601 | |||
602 | /** | ||
603 | * int journal_force_commit() - force any uncommitted transactions | ||
604 | * @journal: journal to force | ||
605 | * | ||
606 | * Caller wants an unconditional commit. We can only force the running transaction | ||
607 | * if we don't have an active handle, otherwise, we will deadlock. | ||
608 | */ | ||
609 | int jbd2_journal_force_commit(journal_t *journal) | ||
610 | { | ||
611 | int ret; | ||
612 | |||
613 | J_ASSERT(!current->journal_info); | ||
614 | ret = __jbd2_journal_force_commit(journal); | ||
615 | if (ret > 0) | ||
616 | ret = 0; | ||
617 | return ret; | ||
601 | } | 618 | } |
602 | 619 | ||
603 | /* | 620 | /* |
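The refactoring gives the two exported wrappers distinct conventions on top of the same core; roughly:

	/* nested variant: caller only cares whether progress was made */
	if (jbd2_journal_force_commit_nested(journal))
		goto retry;	/* hypothetical caller, e.g. an ENOSPC retry loop */

	/* new unconditional variant: 0 on success, <0 on error */
	err = jbd2_journal_force_commit(journal);

jbd2_journal_force_commit_nested() folds errors into "no progress" (only ret > 0 counts), while jbd2_journal_force_commit() propagates errors from jbd2_log_wait_commit() and insists, via J_ASSERT, that no handle is held.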
@@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, | |||
798 | * But we don't bother doing that, so there will be coherency problems with | 815 | * But we don't bother doing that, so there will be coherency problems with |
799 | * mmaps of blockdevs which hold live JBD-controlled filesystems. | 816 | * mmaps of blockdevs which hold live JBD-controlled filesystems. |
800 | */ | 817 | */ |
801 | struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) | 818 | struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) |
802 | { | 819 | { |
803 | struct buffer_head *bh; | 820 | struct buffer_head *bh; |
804 | unsigned long long blocknr; | 821 | unsigned long long blocknr; |
@@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) | |||
817 | set_buffer_uptodate(bh); | 834 | set_buffer_uptodate(bh); |
818 | unlock_buffer(bh); | 835 | unlock_buffer(bh); |
819 | BUFFER_TRACE(bh, "return this buffer"); | 836 | BUFFER_TRACE(bh, "return this buffer"); |
820 | return jbd2_journal_add_journal_head(bh); | 837 | return bh; |
821 | } | 838 | } |
822 | 839 | ||
823 | /* | 840 | /* |
@@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void) | |||
1062 | return NULL; | 1079 | return NULL; |
1063 | 1080 | ||
1064 | init_waitqueue_head(&journal->j_wait_transaction_locked); | 1081 | init_waitqueue_head(&journal->j_wait_transaction_locked); |
1065 | init_waitqueue_head(&journal->j_wait_logspace); | ||
1066 | init_waitqueue_head(&journal->j_wait_done_commit); | 1082 | init_waitqueue_head(&journal->j_wait_done_commit); |
1067 | init_waitqueue_head(&journal->j_wait_checkpoint); | ||
1068 | init_waitqueue_head(&journal->j_wait_commit); | 1083 | init_waitqueue_head(&journal->j_wait_commit); |
1069 | init_waitqueue_head(&journal->j_wait_updates); | 1084 | init_waitqueue_head(&journal->j_wait_updates); |
1085 | init_waitqueue_head(&journal->j_wait_reserved); | ||
1070 | mutex_init(&journal->j_barrier); | 1086 | mutex_init(&journal->j_barrier); |
1071 | mutex_init(&journal->j_checkpoint_mutex); | 1087 | mutex_init(&journal->j_checkpoint_mutex); |
1072 | spin_lock_init(&journal->j_revoke_lock); | 1088 | spin_lock_init(&journal->j_revoke_lock); |
@@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void) | |||
1076 | journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); | 1092 | journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); |
1077 | journal->j_min_batch_time = 0; | 1093 | journal->j_min_batch_time = 0; |
1078 | journal->j_max_batch_time = 15000; /* 15ms */ | 1094 | journal->j_max_batch_time = 15000; /* 15ms */ |
1095 | atomic_set(&journal->j_reserved_credits, 0); | ||
1079 | 1096 | ||
1080 | /* The journal is marked for error until we succeed with recovery! */ | 1097 | /* The journal is marked for error until we succeed with recovery! */ |
1081 | journal->j_flags = JBD2_ABORT; | 1098 | journal->j_flags = JBD2_ABORT; |
@@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal) | |||
1318 | static void jbd2_write_superblock(journal_t *journal, int write_op) | 1335 | static void jbd2_write_superblock(journal_t *journal, int write_op) |
1319 | { | 1336 | { |
1320 | struct buffer_head *bh = journal->j_sb_buffer; | 1337 | struct buffer_head *bh = journal->j_sb_buffer; |
1338 | journal_superblock_t *sb = journal->j_superblock; | ||
1321 | int ret; | 1339 | int ret; |
1322 | 1340 | ||
1323 | trace_jbd2_write_superblock(journal, write_op); | 1341 | trace_jbd2_write_superblock(journal, write_op); |
@@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) | |||
1339 | clear_buffer_write_io_error(bh); | 1357 | clear_buffer_write_io_error(bh); |
1340 | set_buffer_uptodate(bh); | 1358 | set_buffer_uptodate(bh); |
1341 | } | 1359 | } |
1360 | jbd2_superblock_csum_set(journal, sb); | ||
1342 | get_bh(bh); | 1361 | get_bh(bh); |
1343 | bh->b_end_io = end_buffer_write_sync; | 1362 | bh->b_end_io = end_buffer_write_sync; |
1344 | ret = submit_bh(write_op, bh); | 1363 | ret = submit_bh(write_op, bh); |
@@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal) | |||
1435 | jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", | 1454 | jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", |
1436 | journal->j_errno); | 1455 | journal->j_errno); |
1437 | sb->s_errno = cpu_to_be32(journal->j_errno); | 1456 | sb->s_errno = cpu_to_be32(journal->j_errno); |
1438 | jbd2_superblock_csum_set(journal, sb); | ||
1439 | read_unlock(&journal->j_state_lock); | 1457 | read_unlock(&journal->j_state_lock); |
1440 | 1458 | ||
1441 | jbd2_write_superblock(journal, WRITE_SYNC); | 1459 | jbd2_write_superblock(journal, WRITE_SYNC); |
@@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void) | |||
2325 | #ifdef CONFIG_JBD2_DEBUG | 2343 | #ifdef CONFIG_JBD2_DEBUG |
2326 | atomic_inc(&nr_journal_heads); | 2344 | atomic_inc(&nr_journal_heads); |
2327 | #endif | 2345 | #endif |
2328 | ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); | 2346 | ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); |
2329 | if (!ret) { | 2347 | if (!ret) { |
2330 | jbd_debug(1, "out of memory for journal_head\n"); | 2348 | jbd_debug(1, "out of memory for journal_head\n"); |
2331 | pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); | 2349 | pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); |
2332 | while (!ret) { | 2350 | while (!ret) { |
2333 | yield(); | 2351 | yield(); |
2334 | ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); | 2352 | ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); |
2335 | } | 2353 | } |
2336 | } | 2354 | } |
2337 | return ret; | 2355 | return ret; |
@@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) | |||
2393 | struct journal_head *new_jh = NULL; | 2411 | struct journal_head *new_jh = NULL; |
2394 | 2412 | ||
2395 | repeat: | 2413 | repeat: |
2396 | if (!buffer_jbd(bh)) { | 2414 | if (!buffer_jbd(bh)) |
2397 | new_jh = journal_alloc_journal_head(); | 2415 | new_jh = journal_alloc_journal_head(); |
2398 | memset(new_jh, 0, sizeof(*new_jh)); | ||
2399 | } | ||
2400 | 2416 | ||
2401 | jbd_lock_bh_journal_head(bh); | 2417 | jbd_lock_bh_journal_head(bh); |
2402 | if (buffer_jbd(bh)) { | 2418 | if (buffer_jbd(bh)) { |
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 626846bac32f..d4851464b57e 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c | |||
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) | |||
399 | static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, | 399 | static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, |
400 | void *buf, __u32 sequence) | 400 | void *buf, __u32 sequence) |
401 | { | 401 | { |
402 | __u32 provided, calculated; | 402 | __u32 csum32; |
403 | 403 | ||
404 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 404 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
405 | return 1; | 405 | return 1; |
406 | 406 | ||
407 | sequence = cpu_to_be32(sequence); | 407 | sequence = cpu_to_be32(sequence); |
408 | calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, | 408 | csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, |
409 | sizeof(sequence)); | 409 | sizeof(sequence)); |
410 | calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); | 410 | csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); |
411 | provided = be32_to_cpu(tag->t_checksum); | ||
412 | 411 | ||
413 | return provided == cpu_to_be32(calculated); | 412 | return tag->t_checksum == cpu_to_be16(csum32); |
414 | } | 413 | } |
415 | 414 | ||
416 | static int do_one_pass(journal_t *journal, | 415 | static int do_one_pass(journal_t *journal, |
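The tag checksum fix matters because t_checksum is a __be16 while jbd2_chksum() produces 32 bits: the old code pulled the 16-bit field through be32_to_cpu() and compared it against a be32 of the full checksum, so verification could not reliably match what the commit path stores with cpu_to_be16(csum32) (see jbd2_block_tag_csum_set() at the top of this diff). A minimal illustration:

	__u32 csum32 = 0x89abcdef;		/* computed crc32c */
	__be16 on_disk = cpu_to_be16(csum32);	/* keeps only 0xcdef */
	/* tag->t_checksum == cpu_to_be16(csum32) now compares the same
	 * truncated low 16 bits on both sides */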
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index f30b80b4ce8b..198c9c10276d 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c | |||
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s | |||
122 | 122 | ||
123 | #ifdef __KERNEL__ | 123 | #ifdef __KERNEL__ |
124 | static void write_one_revoke_record(journal_t *, transaction_t *, | 124 | static void write_one_revoke_record(journal_t *, transaction_t *, |
125 | struct journal_head **, int *, | 125 | struct list_head *, |
126 | struct buffer_head **, int *, | ||
126 | struct jbd2_revoke_record_s *, int); | 127 | struct jbd2_revoke_record_s *, int); |
127 | static void flush_descriptor(journal_t *, struct journal_head *, int, int); | 128 | static void flush_descriptor(journal_t *, struct buffer_head *, int, int); |
128 | #endif | 129 | #endif |
129 | 130 | ||
130 | /* Utility functions to maintain the revoke table */ | 131 | /* Utility functions to maintain the revoke table */ |
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) | |||
531 | */ | 532 | */ |
532 | void jbd2_journal_write_revoke_records(journal_t *journal, | 533 | void jbd2_journal_write_revoke_records(journal_t *journal, |
533 | transaction_t *transaction, | 534 | transaction_t *transaction, |
535 | struct list_head *log_bufs, | ||
534 | int write_op) | 536 | int write_op) |
535 | { | 537 | { |
536 | struct journal_head *descriptor; | 538 | struct buffer_head *descriptor; |
537 | struct jbd2_revoke_record_s *record; | 539 | struct jbd2_revoke_record_s *record; |
538 | struct jbd2_revoke_table_s *revoke; | 540 | struct jbd2_revoke_table_s *revoke; |
539 | struct list_head *hash_list; | 541 | struct list_head *hash_list; |
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal, | |||
553 | while (!list_empty(hash_list)) { | 555 | while (!list_empty(hash_list)) { |
554 | record = (struct jbd2_revoke_record_s *) | 556 | record = (struct jbd2_revoke_record_s *) |
555 | hash_list->next; | 557 | hash_list->next; |
556 | write_one_revoke_record(journal, transaction, | 558 | write_one_revoke_record(journal, transaction, log_bufs, |
557 | &descriptor, &offset, | 559 | &descriptor, &offset, |
558 | record, write_op); | 560 | record, write_op); |
559 | count++; | 561 | count++; |
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal, | |||
573 | 575 | ||
574 | static void write_one_revoke_record(journal_t *journal, | 576 | static void write_one_revoke_record(journal_t *journal, |
575 | transaction_t *transaction, | 577 | transaction_t *transaction, |
576 | struct journal_head **descriptorp, | 578 | struct list_head *log_bufs, |
579 | struct buffer_head **descriptorp, | ||
577 | int *offsetp, | 580 | int *offsetp, |
578 | struct jbd2_revoke_record_s *record, | 581 | struct jbd2_revoke_record_s *record, |
579 | int write_op) | 582 | int write_op) |
580 | { | 583 | { |
581 | int csum_size = 0; | 584 | int csum_size = 0; |
582 | struct journal_head *descriptor; | 585 | struct buffer_head *descriptor; |
583 | int offset; | 586 | int offset; |
584 | journal_header_t *header; | 587 | journal_header_t *header; |
585 | 588 | ||
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal, | |||
609 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 612 | descriptor = jbd2_journal_get_descriptor_buffer(journal); |
610 | if (!descriptor) | 613 | if (!descriptor) |
611 | return; | 614 | return; |
612 | header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; | 615 | header = (journal_header_t *)descriptor->b_data; |
613 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 616 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
614 | header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); | 617 | header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); |
615 | header->h_sequence = cpu_to_be32(transaction->t_tid); | 618 | header->h_sequence = cpu_to_be32(transaction->t_tid); |
616 | 619 | ||
617 | /* Record it so that we can wait for IO completion later */ | 620 | /* Record it so that we can wait for IO completion later */ |
618 | JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); | 621 | BUFFER_TRACE(descriptor, "file in log_bufs"); |
619 | jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); | 622 | jbd2_file_log_bh(log_bufs, descriptor); |
620 | 623 | ||
621 | offset = sizeof(jbd2_journal_revoke_header_t); | 624 | offset = sizeof(jbd2_journal_revoke_header_t); |
622 | *descriptorp = descriptor; | 625 | *descriptorp = descriptor; |
623 | } | 626 | } |
624 | 627 | ||
625 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { | 628 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { |
626 | * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = | 629 | * ((__be64 *)(&descriptor->b_data[offset])) = |
627 | cpu_to_be64(record->blocknr); | 630 | cpu_to_be64(record->blocknr); |
628 | offset += 8; | 631 | offset += 8; |
629 | 632 | ||
630 | } else { | 633 | } else { |
631 | * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = | 634 | * ((__be32 *)(&descriptor->b_data[offset])) = |
632 | cpu_to_be32(record->blocknr); | 635 | cpu_to_be32(record->blocknr); |
633 | offset += 4; | 636 | offset += 4; |
634 | } | 637 | } |
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal, | |||
636 | *offsetp = offset; | 639 | *offsetp = offset; |
637 | } | 640 | } |
638 | 641 | ||
639 | static void jbd2_revoke_csum_set(journal_t *j, | 642 | static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) |
640 | struct journal_head *descriptor) | ||
641 | { | 643 | { |
642 | struct jbd2_journal_revoke_tail *tail; | 644 | struct jbd2_journal_revoke_tail *tail; |
643 | __u32 csum; | 645 | __u32 csum; |
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j, | |||
645 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 647 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
646 | return; | 648 | return; |
647 | 649 | ||
648 | tail = (struct jbd2_journal_revoke_tail *) | 650 | tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - |
649 | (jh2bh(descriptor)->b_data + j->j_blocksize - | ||
650 | sizeof(struct jbd2_journal_revoke_tail)); | 651 | sizeof(struct jbd2_journal_revoke_tail)); |
651 | tail->r_checksum = 0; | 652 | tail->r_checksum = 0; |
652 | csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, | 653 | csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); |
653 | j->j_blocksize); | ||
654 | tail->r_checksum = cpu_to_be32(csum); | 654 | tail->r_checksum = cpu_to_be32(csum); |
655 | } | 655 | } |
656 | 656 | ||
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j, | |||
662 | */ | 662 | */ |
663 | 663 | ||
664 | static void flush_descriptor(journal_t *journal, | 664 | static void flush_descriptor(journal_t *journal, |
665 | struct journal_head *descriptor, | 665 | struct buffer_head *descriptor, |
666 | int offset, int write_op) | 666 | int offset, int write_op) |
667 | { | 667 | { |
668 | jbd2_journal_revoke_header_t *header; | 668 | jbd2_journal_revoke_header_t *header; |
669 | struct buffer_head *bh = jh2bh(descriptor); | ||
670 | 669 | ||
671 | if (is_journal_aborted(journal)) { | 670 | if (is_journal_aborted(journal)) { |
672 | put_bh(bh); | 671 | put_bh(descriptor); |
673 | return; | 672 | return; |
674 | } | 673 | } |
675 | 674 | ||
676 | header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; | 675 | header = (jbd2_journal_revoke_header_t *)descriptor->b_data; |
677 | header->r_count = cpu_to_be32(offset); | 676 | header->r_count = cpu_to_be32(offset); |
678 | jbd2_revoke_csum_set(journal, descriptor); | 677 | jbd2_revoke_csum_set(journal, descriptor); |
679 | 678 | ||
680 | set_buffer_jwrite(bh); | 679 | set_buffer_jwrite(descriptor); |
681 | BUFFER_TRACE(bh, "write"); | 680 | BUFFER_TRACE(descriptor, "write"); |
682 | set_buffer_dirty(bh); | 681 | set_buffer_dirty(descriptor); |
683 | write_dirty_buffer(bh, write_op); | 682 | write_dirty_buffer(descriptor, write_op); |
684 | } | 683 | } |
685 | #endif | 684 | #endif |
686 | 685 | ||
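After flush_descriptor() has run, a revoke descriptor block carries this layout (a sketch pieced together from write_one_revoke_record() and jbd2_revoke_csum_set() above):

    offset 0                     journal_header_t: JBD2_MAGIC_NUMBER,
                                 JBD2_REVOKE_BLOCK, transaction tid
    revoke header onward         revoked block numbers, __be64 each with
                                 JBD2_FEATURE_INCOMPAT_64BIT, __be32 otherwise
    r_count in the header        bytes used in the block, filled in last
    j_blocksize - sizeof(tail)   jbd2_journal_revoke_tail crc32c of the whole
                                 block (CSUM_V2 only)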
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 10f524c59ea8..7aa9a32573bb 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |||
89 | transaction->t_expires = jiffies + journal->j_commit_interval; | 89 | transaction->t_expires = jiffies + journal->j_commit_interval; |
90 | spin_lock_init(&transaction->t_handle_lock); | 90 | spin_lock_init(&transaction->t_handle_lock); |
91 | atomic_set(&transaction->t_updates, 0); | 91 | atomic_set(&transaction->t_updates, 0); |
92 | atomic_set(&transaction->t_outstanding_credits, 0); | 92 | atomic_set(&transaction->t_outstanding_credits, |
93 | atomic_read(&journal->j_reserved_credits)); | ||
93 | atomic_set(&transaction->t_handle_count, 0); | 94 | atomic_set(&transaction->t_handle_count, 0); |
94 | INIT_LIST_HEAD(&transaction->t_inode_list); | 95 | INIT_LIST_HEAD(&transaction->t_inode_list); |
95 | INIT_LIST_HEAD(&transaction->t_private_list); | 96 | INIT_LIST_HEAD(&transaction->t_private_list); |
@@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction, | |||
141 | } | 142 | } |
142 | 143 | ||
143 | /* | 144 | /* |
145 | * Wait until running transaction passes T_LOCKED state. Also starts the commit | ||
146 | * if needed. The function expects running transaction to exist and releases | ||
147 | * j_state_lock. | ||
148 | */ | ||
149 | static void wait_transaction_locked(journal_t *journal) | ||
150 | __releases(journal->j_state_lock) | ||
151 | { | ||
152 | DEFINE_WAIT(wait); | ||
153 | int need_to_start; | ||
154 | tid_t tid = journal->j_running_transaction->t_tid; | ||
155 | |||
156 | prepare_to_wait(&journal->j_wait_transaction_locked, &wait, | ||
157 | TASK_UNINTERRUPTIBLE); | ||
158 | need_to_start = !tid_geq(journal->j_commit_request, tid); | ||
159 | read_unlock(&journal->j_state_lock); | ||
160 | if (need_to_start) | ||
161 | jbd2_log_start_commit(journal, tid); | ||
162 | schedule(); | ||
163 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
164 | } | ||
165 | |||
166 | static void sub_reserved_credits(journal_t *journal, int blocks) | ||
167 | { | ||
168 | atomic_sub(blocks, &journal->j_reserved_credits); | ||
169 | wake_up(&journal->j_wait_reserved); | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Wait until we can add credits for handle to the running transaction. Called | ||
174 | * with j_state_lock held for reading. Returns 0 if handle joined the running | ||
175 | * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and | ||
176 | * caller must retry. | ||
177 | */ | ||
178 | static int add_transaction_credits(journal_t *journal, int blocks, | ||
179 | int rsv_blocks) | ||
180 | { | ||
181 | transaction_t *t = journal->j_running_transaction; | ||
182 | int needed; | ||
183 | int total = blocks + rsv_blocks; | ||
184 | |||
185 | /* | ||
186 | * If the current transaction is locked down for commit, wait | ||
187 | * for the lock to be released. | ||
188 | */ | ||
189 | if (t->t_state == T_LOCKED) { | ||
190 | wait_transaction_locked(journal); | ||
191 | return 1; | ||
192 | } | ||
193 | |||
194 | /* | ||
195 | * If there is not enough space left in the log to write all | ||
196 | * potential buffers requested by this operation, we need to | ||
197 | * stall pending a log checkpoint to free some more log space. | ||
198 | */ | ||
199 | needed = atomic_add_return(total, &t->t_outstanding_credits); | ||
200 | if (needed > journal->j_max_transaction_buffers) { | ||
201 | /* | ||
202 | * If the current transaction is already too large, | ||
203 | * then start to commit it: we can then go back and | ||
204 | * attach this handle to a new transaction. | ||
205 | */ | ||
206 | atomic_sub(total, &t->t_outstanding_credits); | ||
207 | wait_transaction_locked(journal); | ||
208 | return 1; | ||
209 | } | ||
210 | |||
211 | /* | ||
212 | * The commit code assumes that it can get enough log space | ||
213 | * without forcing a checkpoint. This is *critical* for | ||
214 | * correctness: a checkpoint of a buffer which is also | ||
215 | * associated with a committing transaction creates a deadlock, | ||
216 | * so commit simply cannot force through checkpoints. | ||
217 | * | ||
218 | * We must therefore ensure the necessary space in the journal | ||
219 | * *before* starting to dirty potentially checkpointed buffers | ||
220 | * in the new transaction. | ||
221 | */ | ||
222 | if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { | ||
223 | atomic_sub(total, &t->t_outstanding_credits); | ||
224 | read_unlock(&journal->j_state_lock); | ||
225 | write_lock(&journal->j_state_lock); | ||
226 | if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) | ||
227 | __jbd2_log_wait_for_space(journal); | ||
228 | write_unlock(&journal->j_state_lock); | ||
229 | return 1; | ||
230 | } | ||
231 | |||
232 | /* No reservation? We are done... */ | ||
233 | if (!rsv_blocks) | ||
234 | return 0; | ||
235 | |||
236 | needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); | ||
237 | /* We allow at most half of a transaction to be reserved */ | ||
238 | if (needed > journal->j_max_transaction_buffers / 2) { | ||
239 | sub_reserved_credits(journal, rsv_blocks); | ||
240 | atomic_sub(total, &t->t_outstanding_credits); | ||
241 | read_unlock(&journal->j_state_lock); | ||
242 | wait_event(journal->j_wait_reserved, | ||
243 | atomic_read(&journal->j_reserved_credits) + rsv_blocks | ||
244 | <= journal->j_max_transaction_buffers / 2); | ||
245 | return 1; | ||
246 | } | ||
247 | return 0; | ||
248 | } | ||
249 | |||
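The cap in the final branch is easiest to see with numbers: if j_max_transaction_buffers is, say, 8192 blocks (the exact figure depends on journal size), at most 4096 credits may sit in j_reserved_credits at any time; a reservation pushing past that sleeps on j_wait_reserved until earlier reservations are consumed or freed via sub_reserved_credits(). The same 1/2 limit is what the WARN_ON in start_this_handle() below enforces for individual handles.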
250 | /* | ||
144 | * start_this_handle: Given a handle, deal with any locking or stalling | 251 | * start_this_handle: Given a handle, deal with any locking or stalling |
145 | * needed to make sure that there is enough journal space for the handle | 252 | * needed to make sure that there is enough journal space for the handle |
146 | * to begin. Attach the handle to a transaction and set up the | 253 | * to begin. Attach the handle to a transaction and set up the |
@@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle, | |||
151 | gfp_t gfp_mask) | 258 | gfp_t gfp_mask) |
152 | { | 259 | { |
153 | transaction_t *transaction, *new_transaction = NULL; | 260 | transaction_t *transaction, *new_transaction = NULL; |
154 | tid_t tid; | 261 | int blocks = handle->h_buffer_credits; |
155 | int needed, need_to_start; | 262 | int rsv_blocks = 0; |
156 | int nblocks = handle->h_buffer_credits; | ||
157 | unsigned long ts = jiffies; | 263 | unsigned long ts = jiffies; |
158 | 264 | ||
159 | if (nblocks > journal->j_max_transaction_buffers) { | 265 | /* |
266 | * 1/2 of transaction can be reserved so we can practically handle | ||
267 | * only 1/2 of maximum transaction size per operation | ||
268 | */ | ||
269 | if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) { | ||
160 | printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", | 270 | printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", |
161 | current->comm, nblocks, | 271 | current->comm, blocks, |
162 | journal->j_max_transaction_buffers); | 272 | journal->j_max_transaction_buffers / 2); |
163 | return -ENOSPC; | 273 | return -ENOSPC; |
164 | } | 274 | } |
165 | 275 | ||
276 | if (handle->h_rsv_handle) | ||
277 | rsv_blocks = handle->h_rsv_handle->h_buffer_credits; | ||
278 | |||
166 | alloc_transaction: | 279 | alloc_transaction: |
167 | if (!journal->j_running_transaction) { | 280 | if (!journal->j_running_transaction) { |
168 | new_transaction = kmem_cache_zalloc(transaction_cache, | 281 | new_transaction = kmem_cache_zalloc(transaction_cache, |
@@ -199,8 +312,12 @@ repeat: | |||
199 | return -EROFS; | 312 | return -EROFS; |
200 | } | 313 | } |
201 | 314 | ||
202 | /* Wait on the journal's transaction barrier if necessary */ | 315 | /* |
203 | if (journal->j_barrier_count) { | 316 | * Wait on the journal's transaction barrier if necessary. Specifically |
317 | * we allow reserved handles to proceed because otherwise commit could | ||
318 | * deadlock on page writeback not being able to complete. | ||
319 | */ | ||
320 | if (!handle->h_reserved && journal->j_barrier_count) { | ||
204 | read_unlock(&journal->j_state_lock); | 321 | read_unlock(&journal->j_state_lock); |
205 | wait_event(journal->j_wait_transaction_locked, | 322 | wait_event(journal->j_wait_transaction_locked, |
206 | journal->j_barrier_count == 0); | 323 | journal->j_barrier_count == 0); |
@@ -213,7 +330,7 @@ repeat: | |||
213 | goto alloc_transaction; | 330 | goto alloc_transaction; |
214 | write_lock(&journal->j_state_lock); | 331 | write_lock(&journal->j_state_lock); |
215 | if (!journal->j_running_transaction && | 332 | if (!journal->j_running_transaction && |
216 | !journal->j_barrier_count) { | 333 | (handle->h_reserved || !journal->j_barrier_count)) { |
217 | jbd2_get_transaction(journal, new_transaction); | 334 | jbd2_get_transaction(journal, new_transaction); |
218 | new_transaction = NULL; | 335 | new_transaction = NULL; |
219 | } | 336 | } |
@@ -223,85 +340,18 @@ repeat: | |||
223 | 340 | ||
224 | transaction = journal->j_running_transaction; | 341 | transaction = journal->j_running_transaction; |
225 | 342 | ||
226 | /* | 343 | if (!handle->h_reserved) { |
227 | * If the current transaction is locked down for commit, wait for the | 344 | /* We may have dropped j_state_lock - restart in that case */ |
228 | * lock to be released. | 345 | if (add_transaction_credits(journal, blocks, rsv_blocks)) |
229 | */ | 346 | goto repeat; |
230 | if (transaction->t_state == T_LOCKED) { | 347 | } else { |
231 | DEFINE_WAIT(wait); | ||
232 | |||
233 | prepare_to_wait(&journal->j_wait_transaction_locked, | ||
234 | &wait, TASK_UNINTERRUPTIBLE); | ||
235 | read_unlock(&journal->j_state_lock); | ||
236 | schedule(); | ||
237 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
238 | goto repeat; | ||
239 | } | ||
240 | |||
241 | /* | ||
242 | * If there is not enough space left in the log to write all potential | ||
243 | * buffers requested by this operation, we need to stall pending a log | ||
244 | * checkpoint to free some more log space. | ||
245 | */ | ||
246 | needed = atomic_add_return(nblocks, | ||
247 | &transaction->t_outstanding_credits); | ||
248 | |||
249 | if (needed > journal->j_max_transaction_buffers) { | ||
250 | /* | 348 | /* |
251 | * If the current transaction is already too large, then start | 349 | * We have handle reserved so we are allowed to join T_LOCKED |
252 | * to commit it: we can then go back and attach this handle to | 350 | * transaction and we don't have to check for transaction size |
253 | * a new transaction. | 351 | * and journal space. |
254 | */ | 352 | */ |
255 | DEFINE_WAIT(wait); | 353 | sub_reserved_credits(journal, blocks); |
256 | 354 | handle->h_reserved = 0; | |
257 | jbd_debug(2, "Handle %p starting new commit...\n", handle); | ||
258 | atomic_sub(nblocks, &transaction->t_outstanding_credits); | ||
259 | prepare_to_wait(&journal->j_wait_transaction_locked, &wait, | ||
260 | TASK_UNINTERRUPTIBLE); | ||
261 | tid = transaction->t_tid; | ||
262 | need_to_start = !tid_geq(journal->j_commit_request, tid); | ||
263 | read_unlock(&journal->j_state_lock); | ||
264 | if (need_to_start) | ||
265 | jbd2_log_start_commit(journal, tid); | ||
266 | schedule(); | ||
267 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
268 | goto repeat; | ||
269 | } | ||
270 | |||
271 | /* | ||
272 | * The commit code assumes that it can get enough log space | ||
273 | * without forcing a checkpoint. This is *critical* for | ||
274 | * correctness: a checkpoint of a buffer which is also | ||
275 | * associated with a committing transaction creates a deadlock, | ||
276 | * so commit simply cannot force through checkpoints. | ||
277 | * | ||
278 | * We must therefore ensure the necessary space in the journal | ||
279 | * *before* starting to dirty potentially checkpointed buffers | ||
280 | * in the new transaction. | ||
281 | * | ||
282 | * The worst part is, any transaction currently committing can | ||
283 | * reduce the free space arbitrarily. Be careful to account for | ||
284 | * those buffers when checkpointing. | ||
285 | */ | ||
286 | |||
287 | /* | ||
288 | * @@@ AKPM: This seems rather over-defensive. We're giving commit | ||
289 | * a _lot_ of headroom: 1/4 of the journal plus the size of | ||
290 | * the committing transaction. Really, we only need to give it | ||
291 | * committing_transaction->t_outstanding_credits plus "enough" for | ||
292 | * the log control blocks. | ||
293 | * Also, this test is inconsistent with the matching one in | ||
294 | * jbd2_journal_extend(). | ||
295 | */ | ||
296 | if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { | ||
297 | jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); | ||
298 | atomic_sub(nblocks, &transaction->t_outstanding_credits); | ||
299 | read_unlock(&journal->j_state_lock); | ||
300 | write_lock(&journal->j_state_lock); | ||
301 | if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) | ||
302 | __jbd2_log_wait_for_space(journal); | ||
303 | write_unlock(&journal->j_state_lock); | ||
304 | goto repeat; | ||
305 | } | 355 | } |
306 | 356 | ||
307 | /* OK, account for the buffers that this operation expects to | 357 | /* OK, account for the buffers that this operation expects to |
@@ -309,15 +359,16 @@ repeat: | |||
309 | */ | 359 | */ |
310 | update_t_max_wait(transaction, ts); | 360 | update_t_max_wait(transaction, ts); |
311 | handle->h_transaction = transaction; | 361 | handle->h_transaction = transaction; |
312 | handle->h_requested_credits = nblocks; | 362 | handle->h_requested_credits = blocks; |
313 | handle->h_start_jiffies = jiffies; | 363 | handle->h_start_jiffies = jiffies; |
314 | atomic_inc(&transaction->t_updates); | 364 | atomic_inc(&transaction->t_updates); |
315 | atomic_inc(&transaction->t_handle_count); | 365 | atomic_inc(&transaction->t_handle_count); |
316 | jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", | 366 | jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", |
317 | handle, nblocks, | 367 | handle, blocks, |
318 | atomic_read(&transaction->t_outstanding_credits), | 368 | atomic_read(&transaction->t_outstanding_credits), |
319 | __jbd2_log_space_left(journal)); | 369 | jbd2_log_space_left(journal)); |
320 | read_unlock(&journal->j_state_lock); | 370 | read_unlock(&journal->j_state_lock); |
371 | current->journal_info = handle; | ||
321 | 372 | ||
322 | lock_map_acquire(&handle->h_lockdep_map); | 373 | lock_map_acquire(&handle->h_lockdep_map); |
323 | jbd2_journal_free_transaction(new_transaction); | 374 | jbd2_journal_free_transaction(new_transaction); |
@@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks) | |||
348 | * | 399 | * |
349 | * We make sure that the transaction can guarantee at least nblocks of | 400 | * We make sure that the transaction can guarantee at least nblocks of |
350 | * modified buffers in the log. We block until the log can guarantee | 401 | * modified buffers in the log. We block until the log can guarantee |
351 | * that much space. | 402 | * that much space. Additionally, if rsv_blocks > 0, we also create another |
352 | * | 403 | * handle with rsv_blocks reserved blocks in the journal. This handle is |
353 | * This function is visible to journal users (like ext3fs), so is not | 404 | * stored in h_rsv_handle. It is not attached to any particular transaction |
354 | * called with the journal already locked. | 405 | * and thus doesn't block transaction commit. If the caller uses this reserved |
406 | * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() | ||
407 | * on the parent handle will dispose of the reserved one. A reserved handle | ||
408 | * has to be converted to a normal handle using jbd2_journal_start_reserved() | ||
409 | * before it can be used. | ||
355 | * | 410 | * |
356 | * Return a pointer to a newly allocated handle, or an ERR_PTR() value | 411 | * Return a pointer to a newly allocated handle, or an ERR_PTR() value |
357 | * on failure. | 412 | * on failure. |
358 | */ | 413 | */ |
359 | handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, | 414 | handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, |
360 | unsigned int type, unsigned int line_no) | 415 | gfp_t gfp_mask, unsigned int type, |
416 | unsigned int line_no) | ||
361 | { | 417 | { |
362 | handle_t *handle = journal_current_handle(); | 418 | handle_t *handle = journal_current_handle(); |
363 | int err; | 419 | int err; |
@@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, | |||
374 | handle = new_handle(nblocks); | 430 | handle = new_handle(nblocks); |
375 | if (!handle) | 431 | if (!handle) |
376 | return ERR_PTR(-ENOMEM); | 432 | return ERR_PTR(-ENOMEM); |
433 | if (rsv_blocks) { | ||
434 | handle_t *rsv_handle; | ||
377 | 435 | ||
378 | current->journal_info = handle; | 436 | rsv_handle = new_handle(rsv_blocks); |
437 | if (!rsv_handle) { | ||
438 | jbd2_free_handle(handle); | ||
439 | return ERR_PTR(-ENOMEM); | ||
440 | } | ||
441 | rsv_handle->h_reserved = 1; | ||
442 | rsv_handle->h_journal = journal; | ||
443 | handle->h_rsv_handle = rsv_handle; | ||
444 | } | ||
379 | 445 | ||
380 | err = start_this_handle(journal, handle, gfp_mask); | 446 | err = start_this_handle(journal, handle, gfp_mask); |
381 | if (err < 0) { | 447 | if (err < 0) { |
448 | if (handle->h_rsv_handle) | ||
449 | jbd2_free_handle(handle->h_rsv_handle); | ||
382 | jbd2_free_handle(handle); | 450 | jbd2_free_handle(handle); |
383 | current->journal_info = NULL; | ||
384 | return ERR_PTR(err); | 451 | return ERR_PTR(err); |
385 | } | 452 | } |
386 | handle->h_type = type; | 453 | handle->h_type = type; |
@@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start); | |||
395 | 462 | ||
396 | handle_t *jbd2_journal_start(journal_t *journal, int nblocks) | 463 | handle_t *jbd2_journal_start(journal_t *journal, int nblocks) |
397 | { | 464 | { |
398 | return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); | 465 | return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); |
399 | } | 466 | } |
400 | EXPORT_SYMBOL(jbd2_journal_start); | 467 | EXPORT_SYMBOL(jbd2_journal_start); |
401 | 468 | ||
469 | void jbd2_journal_free_reserved(handle_t *handle) | ||
470 | { | ||
471 | journal_t *journal = handle->h_journal; | ||
472 | |||
473 | WARN_ON(!handle->h_reserved); | ||
474 | sub_reserved_credits(journal, handle->h_buffer_credits); | ||
475 | jbd2_free_handle(handle); | ||
476 | } | ||
477 | EXPORT_SYMBOL(jbd2_journal_free_reserved); | ||
478 | |||
479 | /** | ||
480 | * int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no) - start reserved handle | ||
481 | * @handle: handle to start | ||
482 | * | ||
483 | * Start a handle that has been previously reserved by passing rsv_blocks > 0 | ||
484 | * to jbd2__journal_start(). This attaches @handle to the running transaction | ||
485 | * (or creates one if no transaction is running). Unlike jbd2_journal_start(), | ||
486 | * this function cannot block on journal commit, checkpointing, or similar | ||
487 | * operations. It can block on memory allocation or a frozen journal though. | ||
488 | * | ||
489 | * Return 0 on success, non-zero on error - handle is freed in that case. | ||
490 | */ | ||
491 | int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, | ||
492 | unsigned int line_no) | ||
493 | { | ||
494 | journal_t *journal = handle->h_journal; | ||
495 | int ret = -EIO; | ||
496 | |||
497 | if (WARN_ON(!handle->h_reserved)) { | ||
498 | /* Someone passed in a normal handle? Just stop it. */ | ||
499 | jbd2_journal_stop(handle); | ||
500 | return ret; | ||
501 | } | ||
502 | /* | ||
503 | * The usefulness of mixing reserved and unreserved handles is | ||
504 | * questionable. So far nobody seems to need it, so just error out. | ||
505 | */ | ||
506 | if (WARN_ON(current->journal_info)) { | ||
507 | jbd2_journal_free_reserved(handle); | ||
508 | return ret; | ||
509 | } | ||
510 | |||
511 | handle->h_journal = NULL; | ||
512 | handle->h_type = type; | ||
513 | handle->h_line_no = line_no; | ||
514 | /* | ||
515 | * GFP_NOFS is here because callers are likely from writeback or | ||
516 | * similarly constrained call sites. | ||
517 | */ | ||
518 | ret = start_this_handle(journal, handle, GFP_NOFS); | ||
519 | if (ret < 0) | ||
520 | jbd2_journal_free_reserved(handle); | ||
521 | return ret; | ||
522 | } | ||
523 | EXPORT_SYMBOL(jbd2_journal_start_reserved); | ||
402 | 524 | ||
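The reserved-handle API above is easiest to follow end to end. Below is a minimal caller sketch, not part of the patch: the function name, credit counts (32/8) and error handling are illustrative assumptions; only the jbd2 calls themselves come from the code above.

    static int example_two_phase_op(journal_t *journal)
    {
            handle_t *handle, *rsv;
            int err;

            /* Phase 1: 32 credits now, 8 more reserved for the final phase. */
            handle = jbd2__journal_start(journal, 32, 8, GFP_NOFS, 0, 0);
            if (IS_ERR(handle))
                    return PTR_ERR(handle);
            rsv = handle->h_rsv_handle;

            /* ... dirty metadata under 'handle' ... */

            /* Detach 'rsv' so jbd2_journal_stop() doesn't dispose of it. */
            handle->h_rsv_handle = NULL;
            err = jbd2_journal_stop(handle);
            if (err) {
                    jbd2_journal_free_reserved(rsv);
                    return err;
            }

            /*
             * Phase 2: convert the reserved handle. This cannot block on
             * commit or checkpointing; on error the handle is freed for us.
             */
            err = jbd2_journal_start_reserved(rsv, 0, 0);
            if (err)
                    return err;
            /* ... finish the logical operation under 'rsv' ... */
            return jbd2_journal_stop(rsv);
    }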
403 | /** | 525 | /** |
404 | * int jbd2_journal_extend() - extend buffer credits. | 526 | * int jbd2_journal_extend() - extend buffer credits. |
@@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start); | |||
423 | int jbd2_journal_extend(handle_t *handle, int nblocks) | 545 | int jbd2_journal_extend(handle_t *handle, int nblocks) |
424 | { | 546 | { |
425 | transaction_t *transaction = handle->h_transaction; | 547 | transaction_t *transaction = handle->h_transaction; |
426 | journal_t *journal = transaction->t_journal; | 548 | journal_t *journal; |
427 | int result; | 549 | int result; |
428 | int wanted; | 550 | int wanted; |
429 | 551 | ||
430 | result = -EIO; | 552 | WARN_ON(!transaction); |
431 | if (is_handle_aborted(handle)) | 553 | if (is_handle_aborted(handle)) |
432 | goto out; | 554 | return -EROFS; |
555 | journal = transaction->t_journal; | ||
433 | 556 | ||
434 | result = 1; | 557 | result = 1; |
435 | 558 | ||
436 | read_lock(&journal->j_state_lock); | 559 | read_lock(&journal->j_state_lock); |
437 | 560 | ||
438 | /* Don't extend a locked-down transaction! */ | 561 | /* Don't extend a locked-down transaction! */ |
439 | if (handle->h_transaction->t_state != T_RUNNING) { | 562 | if (transaction->t_state != T_RUNNING) { |
440 | jbd_debug(3, "denied handle %p %d blocks: " | 563 | jbd_debug(3, "denied handle %p %d blocks: " |
441 | "transaction not running\n", handle, nblocks); | 564 | "transaction not running\n", handle, nblocks); |
442 | goto error_out; | 565 | goto error_out; |
443 | } | 566 | } |
444 | 567 | ||
445 | spin_lock(&transaction->t_handle_lock); | 568 | spin_lock(&transaction->t_handle_lock); |
446 | wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; | 569 | wanted = atomic_add_return(nblocks, |
570 | &transaction->t_outstanding_credits); | ||
447 | 571 | ||
448 | if (wanted > journal->j_max_transaction_buffers) { | 572 | if (wanted > journal->j_max_transaction_buffers) { |
449 | jbd_debug(3, "denied handle %p %d blocks: " | 573 | jbd_debug(3, "denied handle %p %d blocks: " |
450 | "transaction too large\n", handle, nblocks); | 574 | "transaction too large\n", handle, nblocks); |
575 | atomic_sub(nblocks, &transaction->t_outstanding_credits); | ||
451 | goto unlock; | 576 | goto unlock; |
452 | } | 577 | } |
453 | 578 | ||
454 | if (wanted > __jbd2_log_space_left(journal)) { | 579 | if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) > |
580 | jbd2_log_space_left(journal)) { | ||
455 | jbd_debug(3, "denied handle %p %d blocks: " | 581 | jbd_debug(3, "denied handle %p %d blocks: " |
456 | "insufficient log space\n", handle, nblocks); | 582 | "insufficient log space\n", handle, nblocks); |
583 | atomic_sub(nblocks, &transaction->t_outstanding_credits); | ||
457 | goto unlock; | 584 | goto unlock; |
458 | } | 585 | } |
459 | 586 | ||
460 | trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, | 587 | trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, |
461 | handle->h_transaction->t_tid, | 588 | transaction->t_tid, |
462 | handle->h_type, handle->h_line_no, | 589 | handle->h_type, handle->h_line_no, |
463 | handle->h_buffer_credits, | 590 | handle->h_buffer_credits, |
464 | nblocks); | 591 | nblocks); |
465 | 592 | ||
466 | handle->h_buffer_credits += nblocks; | 593 | handle->h_buffer_credits += nblocks; |
467 | handle->h_requested_credits += nblocks; | 594 | handle->h_requested_credits += nblocks; |
468 | atomic_add(nblocks, &transaction->t_outstanding_credits); | ||
469 | result = 0; | 595 | result = 0; |
470 | 596 | ||
471 | jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); | 597 | jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); |
@@ -473,7 +599,6 @@ unlock: | |||
473 | spin_unlock(&transaction->t_handle_lock); | 599 | spin_unlock(&transaction->t_handle_lock); |
474 | error_out: | 600 | error_out: |
475 | read_unlock(&journal->j_state_lock); | 601 | read_unlock(&journal->j_state_lock); |
476 | out: | ||
477 | return result; | 602 | return result; |
478 | } | 603 | } |
479 | 604 | ||
@@ -490,19 +615,22 @@ out: | |||
490 | * to a running handle, a call to jbd2_journal_restart will commit the | 615 | * to a running handle, a call to jbd2_journal_restart will commit the |
491 | * handle's transaction so far and reattach the handle to a new | 616 | * handle's transaction so far and reattach the handle to a new |
492 | * transaction capable of guaranteeing the requested number of | 617 | * transaction capable of guaranteeing the requested number of |
493 | * credits. | 618 | * credits. We preserve the reserved handle if one is attached to the |
619 | * passed-in handle. | ||
494 | */ | 620 | */ |
495 | int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) | 621 | int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) |
496 | { | 622 | { |
497 | transaction_t *transaction = handle->h_transaction; | 623 | transaction_t *transaction = handle->h_transaction; |
498 | journal_t *journal = transaction->t_journal; | 624 | journal_t *journal; |
499 | tid_t tid; | 625 | tid_t tid; |
500 | int need_to_start, ret; | 626 | int need_to_start, ret; |
501 | 627 | ||
628 | WARN_ON(!transaction); | ||
502 | /* If we've had an abort of any type, don't even think about | 629 | /* If we've had an abort of any type, don't even think about |
503 | * actually doing the restart! */ | 630 | * actually doing the restart! */ |
504 | if (is_handle_aborted(handle)) | 631 | if (is_handle_aborted(handle)) |
505 | return 0; | 632 | return 0; |
633 | journal = transaction->t_journal; | ||
506 | 634 | ||
507 | /* | 635 | /* |
508 | * First unlink the handle from its current transaction, and start the | 636 | * First unlink the handle from its current transaction, and start the |
@@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) | |||
515 | spin_lock(&transaction->t_handle_lock); | 643 | spin_lock(&transaction->t_handle_lock); |
516 | atomic_sub(handle->h_buffer_credits, | 644 | atomic_sub(handle->h_buffer_credits, |
517 | &transaction->t_outstanding_credits); | 645 | &transaction->t_outstanding_credits); |
646 | if (handle->h_rsv_handle) { | ||
647 | sub_reserved_credits(journal, | ||
648 | handle->h_rsv_handle->h_buffer_credits); | ||
649 | } | ||
518 | if (atomic_dec_and_test(&transaction->t_updates)) | 650 | if (atomic_dec_and_test(&transaction->t_updates)) |
519 | wake_up(&journal->j_wait_updates); | 651 | wake_up(&journal->j_wait_updates); |
652 | tid = transaction->t_tid; | ||
520 | spin_unlock(&transaction->t_handle_lock); | 653 | spin_unlock(&transaction->t_handle_lock); |
654 | handle->h_transaction = NULL; | ||
655 | current->journal_info = NULL; | ||
521 | 656 | ||
522 | jbd_debug(2, "restarting handle %p\n", handle); | 657 | jbd_debug(2, "restarting handle %p\n", handle); |
523 | tid = transaction->t_tid; | ||
524 | need_to_start = !tid_geq(journal->j_commit_request, tid); | 658 | need_to_start = !tid_geq(journal->j_commit_request, tid); |
525 | read_unlock(&journal->j_state_lock); | 659 | read_unlock(&journal->j_state_lock); |
526 | if (need_to_start) | 660 | if (need_to_start) |
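For context, the usual caller pattern around jbd2_journal_extend() and the restart primitive above is sketched below (illustrative, not part of the patch): extension is tried in place first, and the handle is restarted when the running transaction cannot grow. Note that the extend path now also charges for log control blocks via JBD2_CONTROL_BLOCKS_SHIFT when checking free log space.

    static int example_extend_or_restart(handle_t *handle, int needed)
    {
            int err;

            /* jbd2_journal_extend() returns 0 on success and > 0 if denied. */
            err = jbd2_journal_extend(handle, needed);
            if (err > 0) {
                    /* Commit work so far; reattach with the full credit count. */
                    err = jbd2_journal_restart(handle, needed);
            }
            return err;     /* 0 on success, negative on failure */
    }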
@@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal) | |||
557 | write_lock(&journal->j_state_lock); | 691 | write_lock(&journal->j_state_lock); |
558 | ++journal->j_barrier_count; | 692 | ++journal->j_barrier_count; |
559 | 693 | ||
694 | /* Wait until there are no reserved handles */ | ||
695 | if (atomic_read(&journal->j_reserved_credits)) { | ||
696 | write_unlock(&journal->j_state_lock); | ||
697 | wait_event(journal->j_wait_reserved, | ||
698 | atomic_read(&journal->j_reserved_credits) == 0); | ||
699 | write_lock(&journal->j_state_lock); | ||
700 | } | ||
701 | |||
560 | /* Wait until there are no running updates */ | 702 | /* Wait until there are no running updates */ |
561 | while (1) { | 703 | while (1) { |
562 | transaction_t *transaction = journal->j_running_transaction; | 704 | transaction_t *transaction = journal->j_running_transaction; |
@@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh) | |||
619 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); | 761 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); |
620 | } | 762 | } |
621 | 763 | ||
764 | static int sleep_on_shadow_bh(void *word) | ||
765 | { | ||
766 | io_schedule(); | ||
767 | return 0; | ||
768 | } | ||
769 | |||
622 | /* | 770 | /* |
623 | * If the buffer is already part of the current transaction, then there | 771 | * If the buffer is already part of the current transaction, then there |
624 | * is nothing we need to do. If it is already part of a prior | 772 | * is nothing we need to do. If it is already part of a prior |
@@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, | |||
634 | int force_copy) | 782 | int force_copy) |
635 | { | 783 | { |
636 | struct buffer_head *bh; | 784 | struct buffer_head *bh; |
637 | transaction_t *transaction; | 785 | transaction_t *transaction = handle->h_transaction; |
638 | journal_t *journal; | 786 | journal_t *journal; |
639 | int error; | 787 | int error; |
640 | char *frozen_buffer = NULL; | 788 | char *frozen_buffer = NULL; |
641 | int need_copy = 0; | 789 | int need_copy = 0; |
642 | unsigned long start_lock, time_lock; | 790 | unsigned long start_lock, time_lock; |
643 | 791 | ||
792 | WARN_ON(!transaction); | ||
644 | if (is_handle_aborted(handle)) | 793 | if (is_handle_aborted(handle)) |
645 | return -EROFS; | 794 | return -EROFS; |
646 | |||
647 | transaction = handle->h_transaction; | ||
648 | journal = transaction->t_journal; | 795 | journal = transaction->t_journal; |
649 | 796 | ||
650 | jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); | 797 | jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); |
@@ -754,41 +901,29 @@ repeat: | |||
754 | * journaled. If the primary copy is already going to | 901 | * journaled. If the primary copy is already going to |
755 | * disk then we cannot do copy-out here. */ | 902 | * disk then we cannot do copy-out here. */ |
756 | 903 | ||
757 | if (jh->b_jlist == BJ_Shadow) { | 904 | if (buffer_shadow(bh)) { |
758 | DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); | ||
759 | wait_queue_head_t *wqh; | ||
760 | |||
761 | wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); | ||
762 | |||
763 | JBUFFER_TRACE(jh, "on shadow: sleep"); | 905 | JBUFFER_TRACE(jh, "on shadow: sleep"); |
764 | jbd_unlock_bh_state(bh); | 906 | jbd_unlock_bh_state(bh); |
765 | /* commit wakes up all shadow buffers after IO */ | 907 | wait_on_bit(&bh->b_state, BH_Shadow, |
766 | for ( ; ; ) { | 908 | sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); |
767 | prepare_to_wait(wqh, &wait.wait, | ||
768 | TASK_UNINTERRUPTIBLE); | ||
769 | if (jh->b_jlist != BJ_Shadow) | ||
770 | break; | ||
771 | schedule(); | ||
772 | } | ||
773 | finish_wait(wqh, &wait.wait); | ||
774 | goto repeat; | 909 | goto repeat; |
775 | } | 910 | } |
776 | 911 | ||
777 | /* Only do the copy if the currently-owning transaction | 912 | /* |
778 | * still needs it. If it is on the Forget list, the | 913 | * Only do the copy if the currently-owning transaction still |
779 | * committing transaction is past that stage. The | 914 | * needs it. If the buffer isn't on the BJ_Metadata list, the |
780 | * buffer had better remain locked during the kmalloc, | 915 | * committing transaction is past that stage (here we rely on |
781 | * but that should be true --- we hold the journal lock | 916 | * the fact that BH_Shadow is set under the bh_state lock |
782 | * still and the buffer is already on the BUF_JOURNAL | 917 | * together with refiling to the BJ_Shadow list, so at this |
783 | * list so won't be flushed. | 918 | * point we know the buffer doesn't have BH_Shadow set). |
784 | * | 919 | * |
785 | * Subtle point, though: if this is a get_undo_access, | 920 | * Subtle point, though: if this is a get_undo_access, |
786 | * then we will be relying on the frozen_data to contain | 921 | * then we will be relying on the frozen_data to contain |
787 | * the new value of the committed_data record after the | 922 | * the new value of the committed_data record after the |
788 | * transaction, so we HAVE to force the frozen_data copy | 923 | * transaction, so we HAVE to force the frozen_data copy |
789 | * in that case. */ | 924 | * in that case. |
790 | 925 | */ | |
791 | if (jh->b_jlist != BJ_Forget || force_copy) { | 926 | if (jh->b_jlist == BJ_Metadata || force_copy) { |
792 | JBUFFER_TRACE(jh, "generate frozen data"); | 927 | JBUFFER_TRACE(jh, "generate frozen data"); |
793 | if (!frozen_buffer) { | 928 | if (!frozen_buffer) { |
794 | JBUFFER_TRACE(jh, "allocate memory for buffer"); | 929 | JBUFFER_TRACE(jh, "allocate memory for buffer"); |
@@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) | |||
915 | int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) | 1050 | int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) |
916 | { | 1051 | { |
917 | transaction_t *transaction = handle->h_transaction; | 1052 | transaction_t *transaction = handle->h_transaction; |
918 | journal_t *journal = transaction->t_journal; | 1053 | journal_t *journal; |
919 | struct journal_head *jh = jbd2_journal_add_journal_head(bh); | 1054 | struct journal_head *jh = jbd2_journal_add_journal_head(bh); |
920 | int err; | 1055 | int err; |
921 | 1056 | ||
922 | jbd_debug(5, "journal_head %p\n", jh); | 1057 | jbd_debug(5, "journal_head %p\n", jh); |
1058 | WARN_ON(!transaction); | ||
923 | err = -EROFS; | 1059 | err = -EROFS; |
924 | if (is_handle_aborted(handle)) | 1060 | if (is_handle_aborted(handle)) |
925 | goto out; | 1061 | goto out; |
1062 | journal = transaction->t_journal; | ||
926 | err = 0; | 1063 | err = 0; |
927 | 1064 | ||
928 | JBUFFER_TRACE(jh, "entry"); | 1065 | JBUFFER_TRACE(jh, "entry"); |
@@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, | |||
1128 | int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) | 1265 | int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) |
1129 | { | 1266 | { |
1130 | transaction_t *transaction = handle->h_transaction; | 1267 | transaction_t *transaction = handle->h_transaction; |
1131 | journal_t *journal = transaction->t_journal; | 1268 | journal_t *journal; |
1132 | struct journal_head *jh; | 1269 | struct journal_head *jh; |
1133 | int ret = 0; | 1270 | int ret = 0; |
1134 | 1271 | ||
1272 | WARN_ON(!transaction); | ||
1135 | if (is_handle_aborted(handle)) | 1273 | if (is_handle_aborted(handle)) |
1136 | goto out; | 1274 | return -EROFS; |
1275 | journal = transaction->t_journal; | ||
1137 | jh = jbd2_journal_grab_journal_head(bh); | 1276 | jh = jbd2_journal_grab_journal_head(bh); |
1138 | if (!jh) { | 1277 | if (!jh) { |
1139 | ret = -EUCLEAN; | 1278 | ret = -EUCLEAN; |
@@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) | |||
1227 | 1366 | ||
1228 | JBUFFER_TRACE(jh, "file as BJ_Metadata"); | 1367 | JBUFFER_TRACE(jh, "file as BJ_Metadata"); |
1229 | spin_lock(&journal->j_list_lock); | 1368 | spin_lock(&journal->j_list_lock); |
1230 | __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); | 1369 | __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); |
1231 | spin_unlock(&journal->j_list_lock); | 1370 | spin_unlock(&journal->j_list_lock); |
1232 | out_unlock_bh: | 1371 | out_unlock_bh: |
1233 | jbd_unlock_bh_state(bh); | 1372 | jbd_unlock_bh_state(bh); |
@@ -1258,12 +1397,17 @@ out: | |||
1258 | int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) | 1397 | int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) |
1259 | { | 1398 | { |
1260 | transaction_t *transaction = handle->h_transaction; | 1399 | transaction_t *transaction = handle->h_transaction; |
1261 | journal_t *journal = transaction->t_journal; | 1400 | journal_t *journal; |
1262 | struct journal_head *jh; | 1401 | struct journal_head *jh; |
1263 | int drop_reserve = 0; | 1402 | int drop_reserve = 0; |
1264 | int err = 0; | 1403 | int err = 0; |
1265 | int was_modified = 0; | 1404 | int was_modified = 0; |
1266 | 1405 | ||
1406 | WARN_ON(!transaction); | ||
1407 | if (is_handle_aborted(handle)) | ||
1408 | return -EROFS; | ||
1409 | journal = transaction->t_journal; | ||
1410 | |||
1267 | BUFFER_TRACE(bh, "entry"); | 1411 | BUFFER_TRACE(bh, "entry"); |
1268 | 1412 | ||
1269 | jbd_lock_bh_state(bh); | 1413 | jbd_lock_bh_state(bh); |
@@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) | |||
1290 | */ | 1434 | */ |
1291 | jh->b_modified = 0; | 1435 | jh->b_modified = 0; |
1292 | 1436 | ||
1293 | if (jh->b_transaction == handle->h_transaction) { | 1437 | if (jh->b_transaction == transaction) { |
1294 | J_ASSERT_JH(jh, !jh->b_frozen_data); | 1438 | J_ASSERT_JH(jh, !jh->b_frozen_data); |
1295 | 1439 | ||
1296 | /* If we are forgetting a buffer which is already part | 1440 | /* If we are forgetting a buffer which is already part |
@@ -1385,19 +1529,21 @@ drop: | |||
1385 | int jbd2_journal_stop(handle_t *handle) | 1529 | int jbd2_journal_stop(handle_t *handle) |
1386 | { | 1530 | { |
1387 | transaction_t *transaction = handle->h_transaction; | 1531 | transaction_t *transaction = handle->h_transaction; |
1388 | journal_t *journal = transaction->t_journal; | 1532 | journal_t *journal; |
1389 | int err, wait_for_commit = 0; | 1533 | int err = 0, wait_for_commit = 0; |
1390 | tid_t tid; | 1534 | tid_t tid; |
1391 | pid_t pid; | 1535 | pid_t pid; |
1392 | 1536 | ||
1537 | if (!transaction) | ||
1538 | goto free_and_exit; | ||
1539 | journal = transaction->t_journal; | ||
1540 | |||
1393 | J_ASSERT(journal_current_handle() == handle); | 1541 | J_ASSERT(journal_current_handle() == handle); |
1394 | 1542 | ||
1395 | if (is_handle_aborted(handle)) | 1543 | if (is_handle_aborted(handle)) |
1396 | err = -EIO; | 1544 | err = -EIO; |
1397 | else { | 1545 | else |
1398 | J_ASSERT(atomic_read(&transaction->t_updates) > 0); | 1546 | J_ASSERT(atomic_read(&transaction->t_updates) > 0); |
1399 | err = 0; | ||
1400 | } | ||
1401 | 1547 | ||
1402 | if (--handle->h_ref > 0) { | 1548 | if (--handle->h_ref > 0) { |
1403 | jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, | 1549 | jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, |
@@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle) | |||
1407 | 1553 | ||
1408 | jbd_debug(4, "Handle %p going down\n", handle); | 1554 | jbd_debug(4, "Handle %p going down\n", handle); |
1409 | trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, | 1555 | trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, |
1410 | handle->h_transaction->t_tid, | 1556 | transaction->t_tid, |
1411 | handle->h_type, handle->h_line_no, | 1557 | handle->h_type, handle->h_line_no, |
1412 | jiffies - handle->h_start_jiffies, | 1558 | jiffies - handle->h_start_jiffies, |
1413 | handle->h_sync, handle->h_requested_credits, | 1559 | handle->h_sync, handle->h_requested_credits, |
@@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle) | |||
1518 | 1664 | ||
1519 | lock_map_release(&handle->h_lockdep_map); | 1665 | lock_map_release(&handle->h_lockdep_map); |
1520 | 1666 | ||
1667 | if (handle->h_rsv_handle) | ||
1668 | jbd2_journal_free_reserved(handle->h_rsv_handle); | ||
1669 | free_and_exit: | ||
1521 | jbd2_free_handle(handle); | 1670 | jbd2_free_handle(handle); |
1522 | return err; | 1671 | return err; |
1523 | } | 1672 | } |
1524 | 1673 | ||
1525 | /** | ||
1526 | * int jbd2_journal_force_commit() - force any uncommitted transactions | ||
1527 | * @journal: journal to force | ||
1528 | * | ||
1529 | * For synchronous operations: force any uncommitted transactions | ||
1530 | * to disk. May seem kludgy, but it reuses all the handle batching | ||
1531 | * code in a very simple manner. | ||
1532 | */ | ||
1533 | int jbd2_journal_force_commit(journal_t *journal) | ||
1534 | { | ||
1535 | handle_t *handle; | ||
1536 | int ret; | ||
1537 | |||
1538 | handle = jbd2_journal_start(journal, 1); | ||
1539 | if (IS_ERR(handle)) { | ||
1540 | ret = PTR_ERR(handle); | ||
1541 | } else { | ||
1542 | handle->h_sync = 1; | ||
1543 | ret = jbd2_journal_stop(handle); | ||
1544 | } | ||
1545 | return ret; | ||
1546 | } | ||
1547 | |||
1548 | /* | 1674 | /* |
1549 | * | 1675 | * |
1550 | * List management code snippets: various functions for manipulating the | 1676 | * List management code snippets: various functions for manipulating the |
@@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | |||
1601 | * Remove a buffer from the appropriate transaction list. | 1727 | * Remove a buffer from the appropriate transaction list. |
1602 | * | 1728 | * |
1603 | * Note that this function can *change* the value of | 1729 | * Note that this function can *change* the value of |
1604 | * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, | 1730 | * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or |
1605 | * t_log_list or t_reserved_list. If the caller is holding onto a copy of one | 1731 | * t_reserved_list. If the caller is holding onto a copy of one of these |
1606 | * of these pointers, it could go bad. Generally the caller needs to re-read | 1732 | * pointers, it could go bad. Generally the caller needs to re-read the |
1607 | * the pointer from the transaction_t. | 1733 | * pointer from the transaction_t. |
1608 | * | 1734 | * |
1609 | * Called under j_list_lock. | 1735 | * Called under j_list_lock. |
1610 | */ | 1736 | */ |
@@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |||
1634 | case BJ_Forget: | 1760 | case BJ_Forget: |
1635 | list = &transaction->t_forget; | 1761 | list = &transaction->t_forget; |
1636 | break; | 1762 | break; |
1637 | case BJ_IO: | ||
1638 | list = &transaction->t_iobuf_list; | ||
1639 | break; | ||
1640 | case BJ_Shadow: | 1763 | case BJ_Shadow: |
1641 | list = &transaction->t_shadow_list; | 1764 | list = &transaction->t_shadow_list; |
1642 | break; | 1765 | break; |
1643 | case BJ_LogCtl: | ||
1644 | list = &transaction->t_log_list; | ||
1645 | break; | ||
1646 | case BJ_Reserved: | 1766 | case BJ_Reserved: |
1647 | list = &transaction->t_reserved_list; | 1767 | list = &transaction->t_reserved_list; |
1648 | break; | 1768 | break; |
@@ -2034,18 +2154,23 @@ zap_buffer_unlocked: | |||
2034 | * void jbd2_journal_invalidatepage() | 2154 | * void jbd2_journal_invalidatepage() |
2035 | * @journal: journal to use for flush... | 2155 | * @journal: journal to use for flush... |
2036 | * @page: page to flush | 2156 | * @page: page to flush |
2037 | * @offset: length of page to invalidate. | 2157 | * @offset: start of the range to invalidate |
2158 | * @length: length of the range to invalidate | ||
2038 | * | 2159 | * |
2039 | * Reap page buffers containing data after offset in page. Can return -EBUSY | 2160 | * Reap page buffers containing data in the specified range of the page. |
2040 | * if buffers are part of the committing transaction and the page is straddling | 2161 | * Can return -EBUSY if buffers are part of the committing transaction and |
2041 | * i_size. Caller then has to wait for current commit and try again. | 2162 | * the page is straddling i_size. Caller then has to wait for current commit |
2163 | * and try again. | ||
2042 | */ | 2164 | */ |
2043 | int jbd2_journal_invalidatepage(journal_t *journal, | 2165 | int jbd2_journal_invalidatepage(journal_t *journal, |
2044 | struct page *page, | 2166 | struct page *page, |
2045 | unsigned long offset) | 2167 | unsigned int offset, |
2168 | unsigned int length) | ||
2046 | { | 2169 | { |
2047 | struct buffer_head *head, *bh, *next; | 2170 | struct buffer_head *head, *bh, *next; |
2171 | unsigned int stop = offset + length; | ||
2048 | unsigned int curr_off = 0; | 2172 | unsigned int curr_off = 0; |
2173 | int partial_page = (offset || length < PAGE_CACHE_SIZE); | ||
2049 | int may_free = 1; | 2174 | int may_free = 1; |
2050 | int ret = 0; | 2175 | int ret = 0; |
2051 | 2176 | ||
@@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal, | |||
2054 | if (!page_has_buffers(page)) | 2179 | if (!page_has_buffers(page)) |
2055 | return 0; | 2180 | return 0; |
2056 | 2181 | ||
2182 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | ||
2183 | |||
2057 | /* We will potentially be playing with lists other than just the | 2184 | /* We will potentially be playing with lists other than just the |
2058 | * data lists (especially for journaled data mode), so be | 2185 | * data lists (especially for journaled data mode), so be |
2059 | * cautious in our locking. */ | 2186 | * cautious in our locking. */ |
@@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal, | |||
2063 | unsigned int next_off = curr_off + bh->b_size; | 2190 | unsigned int next_off = curr_off + bh->b_size; |
2064 | next = bh->b_this_page; | 2191 | next = bh->b_this_page; |
2065 | 2192 | ||
2193 | if (next_off > stop) | ||
2194 | return 0; | ||
2195 | |||
2066 | if (offset <= curr_off) { | 2196 | if (offset <= curr_off) { |
2067 | /* This block is wholly outside the truncation point */ | 2197 | /* This block is wholly outside the truncation point */ |
2068 | lock_buffer(bh); | 2198 | lock_buffer(bh); |
2069 | ret = journal_unmap_buffer(journal, bh, offset > 0); | 2199 | ret = journal_unmap_buffer(journal, bh, partial_page); |
2070 | unlock_buffer(bh); | 2200 | unlock_buffer(bh); |
2071 | if (ret < 0) | 2201 | if (ret < 0) |
2072 | return ret; | 2202 | return ret; |
@@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal, | |||
2077 | 2207 | ||
2078 | } while (bh != head); | 2208 | } while (bh != head); |
2079 | 2209 | ||
2080 | if (!offset) { | 2210 | if (!partial_page) { |
2081 | if (may_free && try_to_free_buffers(page)) | 2211 | if (may_free && try_to_free_buffers(page)) |
2082 | J_ASSERT(!page_has_buffers(page)); | 2212 | J_ASSERT(!page_has_buffers(page)); |
2083 | } | 2213 | } |
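The -EBUSY contract described above implies a retry loop in the caller. A sketch of that protocol, modeled loosely on what ext4 does (the wrapper name is hypothetical; note the page lock is dropped before waiting for the commit):

    static void example_invalidate_retry(journal_t *journal, struct page *page,
                                         unsigned int offset, unsigned int length)
    {
            tid_t tid;
            int ret;

            while (1) {
                    lock_page(page);
                    ret = jbd2_journal_invalidatepage(journal, page, offset,
                                                      length);
                    unlock_page(page);
                    if (ret != -EBUSY)
                            return;
                    /*
                     * Buffers are part of the committing transaction: wait
                     * for that commit to finish and try again.
                     */
                    tid = 0;
                    read_lock(&journal->j_state_lock);
                    if (journal->j_committing_transaction)
                            tid = journal->j_committing_transaction->t_tid;
                    read_unlock(&journal->j_state_lock);
                    if (tid)
                            jbd2_log_wait_commit(journal, tid);
            }
    }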
@@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |||
2138 | case BJ_Forget: | 2268 | case BJ_Forget: |
2139 | list = &transaction->t_forget; | 2269 | list = &transaction->t_forget; |
2140 | break; | 2270 | break; |
2141 | case BJ_IO: | ||
2142 | list = &transaction->t_iobuf_list; | ||
2143 | break; | ||
2144 | case BJ_Shadow: | 2271 | case BJ_Shadow: |
2145 | list = &transaction->t_shadow_list; | 2272 | list = &transaction->t_shadow_list; |
2146 | break; | 2273 | break; |
2147 | case BJ_LogCtl: | ||
2148 | list = &transaction->t_log_list; | ||
2149 | break; | ||
2150 | case BJ_Reserved: | 2274 | case BJ_Reserved: |
2151 | list = &transaction->t_reserved_list; | 2275 | list = &transaction->t_reserved_list; |
2152 | break; | 2276 | break; |
@@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) | |||
2248 | int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) | 2372 | int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) |
2249 | { | 2373 | { |
2250 | transaction_t *transaction = handle->h_transaction; | 2374 | transaction_t *transaction = handle->h_transaction; |
2251 | journal_t *journal = transaction->t_journal; | 2375 | journal_t *journal; |
2252 | 2376 | ||
2377 | WARN_ON(!transaction); | ||
2253 | if (is_handle_aborted(handle)) | 2378 | if (is_handle_aborted(handle)) |
2254 | return -EIO; | 2379 | return -EROFS; |
2380 | journal = transaction->t_journal; | ||
2255 | 2381 | ||
2256 | jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, | 2382 | jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, |
2257 | transaction->t_tid); | 2383 | transaction->t_tid); |
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 6740d34cd82b..9e3aaff11f89 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c | |||
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) | |||
571 | return ret; | 571 | return ret; |
572 | } | 572 | } |
573 | 573 | ||
574 | static void metapage_invalidatepage(struct page *page, unsigned long offset) | 574 | static void metapage_invalidatepage(struct page *page, unsigned int offset, |
575 | unsigned int length) | ||
575 | { | 576 | { |
576 | BUG_ON(offset); | 577 | BUG_ON(offset || length < PAGE_CACHE_SIZE); |
577 | 578 | ||
578 | BUG_ON(PageWriteback(page)); | 579 | BUG_ON(PageWriteback(page)); |
579 | 580 | ||
diff --git a/fs/logfs/file.c b/fs/logfs/file.c index c2219a6dd3c8..57914fc32b62 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c | |||
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc) | |||
159 | return __logfs_writepage(page); | 159 | return __logfs_writepage(page); |
160 | } | 160 | } |
161 | 161 | ||
162 | static void logfs_invalidatepage(struct page *page, unsigned long offset) | 162 | static void logfs_invalidatepage(struct page *page, unsigned int offset, |
163 | unsigned int length) | ||
163 | { | 164 | { |
164 | struct logfs_block *block = logfs_block(page); | 165 | struct logfs_block *block = logfs_block(page); |
165 | 166 | ||
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 038da0991794..d448a777166b 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c | |||
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb) | |||
884 | return area; | 884 | return area; |
885 | } | 885 | } |
886 | 886 | ||
887 | static void map_invalidatepage(struct page *page, unsigned long l) | 887 | static void map_invalidatepage(struct page *page, unsigned int o, |
888 | unsigned int l) | ||
888 | { | 889 | { |
889 | return; | 890 | return; |
890 | } | 891 | } |
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a87a44f84113..6b4a79f4ad1d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c | |||
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, | |||
451 | * - Called if either PG_private or PG_fscache is set on the page | 451 | * - Called if either PG_private or PG_fscache is set on the page |
452 | * - Caller holds page lock | 452 | * - Caller holds page lock |
453 | */ | 453 | */ |
454 | static void nfs_invalidate_page(struct page *page, unsigned long offset) | 454 | static void nfs_invalidate_page(struct page *page, unsigned int offset, |
455 | unsigned int length) | ||
455 | { | 456 | { |
456 | dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); | 457 | dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", |
458 | page, offset, length); | ||
457 | 459 | ||
458 | if (offset != 0) | 460 | if (offset != 0 || length < PAGE_CACHE_SIZE) |
459 | return; | 461 | return; |
460 | /* Cancel any unstarted writes on this page */ | 462 | /* Cancel any unstarted writes on this page */ |
461 | nfs_wb_page_cancel(page_file_mapping(page)->host, page); | 463 | nfs_wb_page_cancel(page_file_mapping(page)->host, page); |
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fa9c05f97af4..d267ea6aa1a0 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c | |||
@@ -1372,7 +1372,7 @@ retry_writepage: | |||
1372 | * The page may have dirty, unmapped buffers. Make them | 1372 | * The page may have dirty, unmapped buffers. Make them |
1373 | * freeable here, so the page does not leak. | 1373 | * freeable here, so the page does not leak. |
1374 | */ | 1374 | */ |
1375 | block_invalidatepage(page, 0); | 1375 | block_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
1376 | unlock_page(page); | 1376 | unlock_page(page); |
1377 | ntfs_debug("Write outside i_size - truncated?"); | 1377 | ntfs_debug("Write outside i_size - truncated?"); |
1378 | return 0; | 1378 | return 0; |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 20dfec72e903..79736a28d84f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
603 | * from ext3. PageChecked() bits have been removed as OCFS2 does not | 603 | * from ext3. PageChecked() bits have been removed as OCFS2 does not |
604 | * do journalled data. | 604 | * do journalled data. |
605 | */ | 605 | */ |
606 | static void ocfs2_invalidatepage(struct page *page, unsigned long offset) | 606 | static void ocfs2_invalidatepage(struct page *page, unsigned int offset, |
607 | unsigned int length) | ||
607 | { | 608 | { |
608 | journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; | 609 | journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; |
609 | 610 | ||
610 | jbd2_journal_invalidatepage(journal, page, offset); | 611 | jbd2_journal_invalidatepage(journal, page, offset, length); |
611 | } | 612 | } |
612 | 613 | ||
613 | static int ocfs2_releasepage(struct page *page, gfp_t wait) | 614 | static int ocfs2_releasepage(struct page *page, gfp_t wait) |
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f844533792ee..0048cc16a6a8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c | |||
@@ -2975,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) | |||
2975 | } | 2975 | } |
2976 | 2976 | ||
2977 | /* clm -- taken from fs/buffer.c:block_invalidate_page */ | 2977 | /* clm -- taken from fs/buffer.c:block_invalidate_page */ |
2978 | static void reiserfs_invalidatepage(struct page *page, unsigned long offset) | 2978 | static void reiserfs_invalidatepage(struct page *page, unsigned int offset, |
2979 | unsigned int length) | ||
2979 | { | 2980 | { |
2980 | struct buffer_head *head, *bh, *next; | 2981 | struct buffer_head *head, *bh, *next; |
2981 | struct inode *inode = page->mapping->host; | 2982 | struct inode *inode = page->mapping->host; |
2982 | unsigned int curr_off = 0; | 2983 | unsigned int curr_off = 0; |
2984 | unsigned int stop = offset + length; | ||
2985 | int partial_page = (offset || length < PAGE_CACHE_SIZE); | ||
2983 | int ret = 1; | 2986 | int ret = 1; |
2984 | 2987 | ||
2985 | BUG_ON(!PageLocked(page)); | 2988 | BUG_ON(!PageLocked(page)); |
2986 | 2989 | ||
2987 | if (offset == 0) | 2990 | if (!partial_page) |
2988 | ClearPageChecked(page); | 2991 | ClearPageChecked(page); |
2989 | 2992 | ||
2990 | if (!page_has_buffers(page)) | 2993 | if (!page_has_buffers(page)) |
@@ -2996,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset) | |||
2996 | unsigned int next_off = curr_off + bh->b_size; | 2999 | unsigned int next_off = curr_off + bh->b_size; |
2997 | next = bh->b_this_page; | 3000 | next = bh->b_this_page; |
2998 | 3001 | ||
3002 | if (next_off > stop) | ||
3003 | goto out; | ||
3004 | |||
2999 | /* | 3005 | /* |
3000 | * is this block fully invalidated? | 3006 | * is this block fully invalidated? |
3001 | */ | 3007 | */ |
@@ -3014,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset) | |||
3014 | * The get_block cached value has been unconditionally invalidated, | 3020 | * The get_block cached value has been unconditionally invalidated, |
3015 | * so real IO is not possible anymore. | 3021 | * so real IO is not possible anymore. |
3016 | */ | 3022 | */ |
3017 | if (!offset && ret) { | 3023 | if (!partial_page && ret) { |
3018 | ret = try_to_release_page(page, 0); | 3024 | ret = try_to_release_page(page, 0); |
3019 | /* maybe should BUG_ON(!ret); - neilb */ | 3025 | /* maybe should BUG_ON(!ret); - neilb */ |
3020 | } | 3026 | } |
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 14374530784c..123c79b7261e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c | |||
@@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) | |||
1277 | return err; | 1277 | return err; |
1278 | } | 1278 | } |
1279 | 1279 | ||
1280 | static void ubifs_invalidatepage(struct page *page, unsigned long offset) | 1280 | static void ubifs_invalidatepage(struct page *page, unsigned int offset, |
1281 | unsigned int length) | ||
1281 | { | 1282 | { |
1282 | struct inode *inode = page->mapping->host; | 1283 | struct inode *inode = page->mapping->host; |
1283 | struct ubifs_info *c = inode->i_sb->s_fs_info; | 1284 | struct ubifs_info *c = inode->i_sb->s_fs_info; |
1284 | 1285 | ||
1285 | ubifs_assert(PagePrivate(page)); | 1286 | ubifs_assert(PagePrivate(page)); |
1286 | if (offset) | 1287 | if (offset || length < PAGE_CACHE_SIZE) |
1287 | /* Partial page remains dirty */ | 1288 | /* Partial page remains dirty */ |
1288 | return; | 1289 | return; |
1289 | 1290 | ||
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 41a695048be7..596ec71da00e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
@@ -843,10 +843,12 @@ xfs_cluster_write( | |||
843 | STATIC void | 843 | STATIC void |
844 | xfs_vm_invalidatepage( | 844 | xfs_vm_invalidatepage( |
845 | struct page *page, | 845 | struct page *page, |
846 | unsigned long offset) | 846 | unsigned int offset, |
847 | unsigned int length) | ||
847 | { | 848 | { |
848 | trace_xfs_invalidatepage(page->mapping->host, page, offset); | 849 | trace_xfs_invalidatepage(page->mapping->host, page, offset, |
849 | block_invalidatepage(page, offset); | 850 | length); |
851 | block_invalidatepage(page, offset, length); | ||
850 | } | 852 | } |
851 | 853 | ||
852 | /* | 854 | /* |
@@ -910,7 +912,7 @@ next_buffer: | |||
910 | 912 | ||
911 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 913 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
912 | out_invalidate: | 914 | out_invalidate: |
913 | xfs_vm_invalidatepage(page, 0); | 915 | xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
914 | return; | 916 | return; |
915 | } | 917 | } |
916 | 918 | ||
@@ -940,7 +942,7 @@ xfs_vm_writepage( | |||
940 | int count = 0; | 942 | int count = 0; |
941 | int nonblocking = 0; | 943 | int nonblocking = 0; |
942 | 944 | ||
943 | trace_xfs_writepage(inode, page, 0); | 945 | trace_xfs_writepage(inode, page, 0, 0); |
944 | 946 | ||
945 | ASSERT(page_has_buffers(page)); | 947 | ASSERT(page_has_buffers(page)); |
946 | 948 | ||
@@ -1171,7 +1173,7 @@ xfs_vm_releasepage( | |||
1171 | { | 1173 | { |
1172 | int delalloc, unwritten; | 1174 | int delalloc, unwritten; |
1173 | 1175 | ||
1174 | trace_xfs_releasepage(page->mapping->host, page, 0); | 1176 | trace_xfs_releasepage(page->mapping->host, page, 0, 0); |
1175 | 1177 | ||
1176 | xfs_count_page_state(page, &delalloc, &unwritten); | 1178 | xfs_count_page_state(page, &delalloc, &unwritten); |
1177 | 1179 | ||
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index aa4db3307d36..a04701de6bbd 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
@@ -974,14 +974,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read); | |||
974 | DEFINE_RW_EVENT(xfs_file_splice_write); | 974 | DEFINE_RW_EVENT(xfs_file_splice_write); |
975 | 975 | ||
976 | DECLARE_EVENT_CLASS(xfs_page_class, | 976 | DECLARE_EVENT_CLASS(xfs_page_class, |
977 | TP_PROTO(struct inode *inode, struct page *page, unsigned long off), | 977 | TP_PROTO(struct inode *inode, struct page *page, unsigned long off, |
978 | TP_ARGS(inode, page, off), | 978 | unsigned int len), |
979 | TP_ARGS(inode, page, off, len), | ||
979 | TP_STRUCT__entry( | 980 | TP_STRUCT__entry( |
980 | __field(dev_t, dev) | 981 | __field(dev_t, dev) |
981 | __field(xfs_ino_t, ino) | 982 | __field(xfs_ino_t, ino) |
982 | __field(pgoff_t, pgoff) | 983 | __field(pgoff_t, pgoff) |
983 | __field(loff_t, size) | 984 | __field(loff_t, size) |
984 | __field(unsigned long, offset) | 985 | __field(unsigned long, offset) |
986 | __field(unsigned int, length) | ||
985 | __field(int, delalloc) | 987 | __field(int, delalloc) |
986 | __field(int, unwritten) | 988 | __field(int, unwritten) |
987 | ), | 989 | ), |
@@ -995,24 +997,27 @@ DECLARE_EVENT_CLASS(xfs_page_class, | |||
995 | __entry->pgoff = page_offset(page); | 997 | __entry->pgoff = page_offset(page); |
996 | __entry->size = i_size_read(inode); | 998 | __entry->size = i_size_read(inode); |
997 | __entry->offset = off; | 999 | __entry->offset = off; |
1000 | __entry->length = len; | ||
998 | __entry->delalloc = delalloc; | 1001 | __entry->delalloc = delalloc; |
999 | __entry->unwritten = unwritten; | 1002 | __entry->unwritten = unwritten; |
1000 | ), | 1003 | ), |
1001 | TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " | 1004 | TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " |
1002 | "delalloc %d unwritten %d", | 1005 | "length %x delalloc %d unwritten %d", |
1003 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1006 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1004 | __entry->ino, | 1007 | __entry->ino, |
1005 | __entry->pgoff, | 1008 | __entry->pgoff, |
1006 | __entry->size, | 1009 | __entry->size, |
1007 | __entry->offset, | 1010 | __entry->offset, |
1011 | __entry->length, | ||
1008 | __entry->delalloc, | 1012 | __entry->delalloc, |
1009 | __entry->unwritten) | 1013 | __entry->unwritten) |
1010 | ) | 1014 | ) |
1011 | 1015 | ||
1012 | #define DEFINE_PAGE_EVENT(name) \ | 1016 | #define DEFINE_PAGE_EVENT(name) \ |
1013 | DEFINE_EVENT(xfs_page_class, name, \ | 1017 | DEFINE_EVENT(xfs_page_class, name, \ |
1014 | TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ | 1018 | TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ |
1015 | TP_ARGS(inode, page, off)) | 1019 | unsigned int len), \ |
1020 | TP_ARGS(inode, page, off, len)) | ||
1016 | DEFINE_PAGE_EVENT(xfs_writepage); | 1021 | DEFINE_PAGE_EVENT(xfs_writepage); |
1017 | DEFINE_PAGE_EVENT(xfs_releasepage); | 1022 | DEFINE_PAGE_EVENT(xfs_releasepage); |
1018 | DEFINE_PAGE_EVENT(xfs_invalidatepage); | 1023 | DEFINE_PAGE_EVENT(xfs_invalidatepage); |
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 9e52b0626b39..f5a3b838ddb0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -198,7 +198,8 @@ extern int buffer_heads_over_limit; | |||
198 | * Generic address_space_operations implementations for buffer_head-backed | 198 | * Generic address_space_operations implementations for buffer_head-backed |
199 | * address_spaces. | 199 | * address_spaces. |
200 | */ | 200 | */ |
201 | void block_invalidatepage(struct page *page, unsigned long offset); | 201 | void block_invalidatepage(struct page *page, unsigned int offset, |
202 | unsigned int length); | ||
202 | int block_write_full_page(struct page *page, get_block_t *get_block, | 203 | int block_write_full_page(struct page *page, get_block_t *get_block, |
203 | struct writeback_control *wbc); | 204 | struct writeback_control *wbc); |
204 | int block_write_full_page_endio(struct page *page, get_block_t *get_block, | 205 | int block_write_full_page_endio(struct page *page, get_block_t *get_block, |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 7c30e3a62baf..f8a5240541b7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -364,7 +364,7 @@ struct address_space_operations { | |||
364 | 364 | ||
365 | /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ | 365 | /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ |
366 | sector_t (*bmap)(struct address_space *, sector_t); | 366 | sector_t (*bmap)(struct address_space *, sector_t); |
367 | void (*invalidatepage) (struct page *, unsigned long); | 367 | void (*invalidatepage) (struct page *, unsigned int, unsigned int); |
368 | int (*releasepage) (struct page *, gfp_t); | 368 | int (*releasepage) (struct page *, gfp_t); |
369 | void (*freepage)(struct page *); | 369 | void (*freepage)(struct page *); |
370 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | 370 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, |
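With the widened prototype, a minimal buffer_head-backed implementation simply forwards the range to block_invalidatepage(); a sketch (the filesystem name is illustrative):

    static void myfs_invalidatepage(struct page *page, unsigned int offset,
                                    unsigned int length)
    {
            /* Drop only the buffers inside [offset, offset + length). */
            block_invalidatepage(page, offset, length);
    }

    static const struct address_space_operations myfs_aops = {
            .invalidatepage = myfs_invalidatepage,
            /* ... remaining methods elided ... */
    };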
diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 7e0b622503c4..8685d1be12c7 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h | |||
@@ -27,7 +27,6 @@ | |||
27 | #include <linux/buffer_head.h> | 27 | #include <linux/buffer_head.h> |
28 | #include <linux/journal-head.h> | 28 | #include <linux/journal-head.h> |
29 | #include <linux/stddef.h> | 29 | #include <linux/stddef.h> |
30 | #include <linux/bit_spinlock.h> | ||
31 | #include <linux/mutex.h> | 30 | #include <linux/mutex.h> |
32 | #include <linux/timer.h> | 31 | #include <linux/timer.h> |
33 | #include <linux/lockdep.h> | 32 | #include <linux/lockdep.h> |
@@ -244,6 +243,31 @@ typedef struct journal_superblock_s | |||
244 | 243 | ||
245 | #include <linux/fs.h> | 244 | #include <linux/fs.h> |
246 | #include <linux/sched.h> | 245 | #include <linux/sched.h> |
246 | |||
247 | enum jbd_state_bits { | ||
248 | BH_JBD /* Has an attached ext3 journal_head */ | ||
249 | = BH_PrivateStart, | ||
250 | BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ | ||
251 | BH_Freed, /* Has been freed (truncated) */ | ||
252 | BH_Revoked, /* Has been revoked from the log */ | ||
253 | BH_RevokeValid, /* Revoked flag is valid */ | ||
254 | BH_JBDDirty, /* Is dirty but journaled */ | ||
255 | BH_State, /* Pins most journal_head state */ | ||
256 | BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ | ||
257 | BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ | ||
258 | BH_JBDPrivateStart, /* First bit available for private use by FS */ | ||
259 | }; | ||
260 | |||
261 | BUFFER_FNS(JBD, jbd) | ||
262 | BUFFER_FNS(JWrite, jwrite) | ||
263 | BUFFER_FNS(JBDDirty, jbddirty) | ||
264 | TAS_BUFFER_FNS(JBDDirty, jbddirty) | ||
265 | BUFFER_FNS(Revoked, revoked) | ||
266 | TAS_BUFFER_FNS(Revoked, revoked) | ||
267 | BUFFER_FNS(RevokeValid, revokevalid) | ||
268 | TAS_BUFFER_FNS(RevokeValid, revokevalid) | ||
269 | BUFFER_FNS(Freed, freed) | ||
270 | |||
247 | #include <linux/jbd_common.h> | 271 | #include <linux/jbd_common.h> |
248 | 272 | ||
249 | #define J_ASSERT(assert) BUG_ON(!(assert)) | 273 | #define J_ASSERT(assert) BUG_ON(!(assert)) |
@@ -840,7 +864,7 @@ extern void journal_release_buffer (handle_t *, struct buffer_head *); | |||
840 | extern int journal_forget (handle_t *, struct buffer_head *); | 864 | extern int journal_forget (handle_t *, struct buffer_head *); |
841 | extern void journal_sync_buffer (struct buffer_head *); | 865 | extern void journal_sync_buffer (struct buffer_head *); |
842 | extern void journal_invalidatepage(journal_t *, | 866 | extern void journal_invalidatepage(journal_t *, |
843 | struct page *, unsigned long); | 867 | struct page *, unsigned int, unsigned int); |
844 | extern int journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); | 868 | extern int journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); |
845 | extern int journal_stop(handle_t *); | 869 | extern int journal_stop(handle_t *); |
846 | extern int journal_flush (journal_t *); | 870 | extern int journal_flush (journal_t *); |
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 6e051f472edb..d5b50a19463c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -26,7 +26,6 @@ | |||
26 | #include <linux/buffer_head.h> | 26 | #include <linux/buffer_head.h> |
27 | #include <linux/journal-head.h> | 27 | #include <linux/journal-head.h> |
28 | #include <linux/stddef.h> | 28 | #include <linux/stddef.h> |
29 | #include <linux/bit_spinlock.h> | ||
30 | #include <linux/mutex.h> | 29 | #include <linux/mutex.h> |
31 | #include <linux/timer.h> | 30 | #include <linux/timer.h> |
32 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
@@ -57,17 +56,13 @@ | |||
57 | */ | 56 | */ |
58 | #define JBD2_EXPENSIVE_CHECKING | 57 | #define JBD2_EXPENSIVE_CHECKING |
59 | extern ushort jbd2_journal_enable_debug; | 58 | extern ushort jbd2_journal_enable_debug; |
59 | void __jbd2_debug(int level, const char *file, const char *func, | ||
60 | unsigned int line, const char *fmt, ...); | ||
60 | 61 | ||
61 | #define jbd_debug(n, f, a...) \ | 62 | #define jbd_debug(n, fmt, a...) \ |
62 | do { \ | 63 | __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) |
63 | if ((n) <= jbd2_journal_enable_debug) { \ | ||
64 | printk (KERN_DEBUG "(%s, %d): %s: ", \ | ||
65 | __FILE__, __LINE__, __func__); \ | ||
66 | printk (f, ## a); \ | ||
67 | } \ | ||
68 | } while (0) | ||
69 | #else | 64 | #else |
70 | #define jbd_debug(f, a...) /**/ | 65 | #define jbd_debug(n, fmt, a...) /**/ |
71 | #endif | 66 | #endif |
72 | 67 | ||
73 | extern void *jbd2_alloc(size_t size, gfp_t flags); | 68 | extern void *jbd2_alloc(size_t size, gfp_t flags); |
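The out-of-line __jbd2_debug() declared above keeps the level check in one place and lets jbd_debug() stay a thin macro. Its body is not shown in this excerpt (it lands in fs/jbd2/journal.c); a plausible implementation consistent with the declaration, using the kernel's %pV va_format mechanism, would be:

    void __jbd2_debug(int level, const char *file, const char *func,
                      unsigned int line, const char *fmt, ...)
    {
            struct va_format vaf;
            va_list args;

            if (level > jbd2_journal_enable_debug)
                    return;
            va_start(args, fmt);
            vaf.fmt = fmt;
            vaf.va = &args;
            /* Callers embed the trailing newline in fmt, as before. */
            printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
            va_end(args);
    }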
@@ -302,6 +297,34 @@ typedef struct journal_superblock_s | |||
302 | 297 | ||
303 | #include <linux/fs.h> | 298 | #include <linux/fs.h> |
304 | #include <linux/sched.h> | 299 | #include <linux/sched.h> |
300 | |||
301 | enum jbd_state_bits { | ||
302 | BH_JBD /* Has an attached ext3 journal_head */ | ||
303 | = BH_PrivateStart, | ||
304 | BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ | ||
305 | BH_Freed, /* Has been freed (truncated) */ | ||
306 | BH_Revoked, /* Has been revoked from the log */ | ||
307 | BH_RevokeValid, /* Revoked flag is valid */ | ||
308 | BH_JBDDirty, /* Is dirty but journaled */ | ||
309 | BH_State, /* Pins most journal_head state */ | ||
310 | BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ | ||
311 | BH_Shadow, /* IO on shadow buffer is running */ | ||
312 | BH_Verified, /* Metadata block has been verified ok */ | ||
313 | BH_JBDPrivateStart, /* First bit available for private use by FS */ | ||
314 | }; | ||
315 | |||
316 | BUFFER_FNS(JBD, jbd) | ||
317 | BUFFER_FNS(JWrite, jwrite) | ||
318 | BUFFER_FNS(JBDDirty, jbddirty) | ||
319 | TAS_BUFFER_FNS(JBDDirty, jbddirty) | ||
320 | BUFFER_FNS(Revoked, revoked) | ||
321 | TAS_BUFFER_FNS(Revoked, revoked) | ||
322 | BUFFER_FNS(RevokeValid, revokevalid) | ||
323 | TAS_BUFFER_FNS(RevokeValid, revokevalid) | ||
324 | BUFFER_FNS(Freed, freed) | ||
325 | BUFFER_FNS(Shadow, shadow) | ||
326 | BUFFER_FNS(Verified, verified) | ||
327 | |||
305 | #include <linux/jbd_common.h> | 328 | #include <linux/jbd_common.h> |
306 | 329 | ||
307 | #define J_ASSERT(assert) BUG_ON(!(assert)) | 330 | #define J_ASSERT(assert) BUG_ON(!(assert)) |
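For reference, BUFFER_FNS() from include/linux/buffer_head.h generates the standard bit helpers, so BUFFER_FNS(Shadow, shadow) above expands to roughly the following; this is where the buffer_shadow() test used in do_get_write_access() earlier in this patch comes from (sketch of the generic macro output):

    static inline void set_buffer_shadow(struct buffer_head *bh)
    {
            set_bit(BH_Shadow, &bh->b_state);
    }

    static inline void clear_buffer_shadow(struct buffer_head *bh)
    {
            clear_bit(BH_Shadow, &bh->b_state);
    }

    static inline int buffer_shadow(const struct buffer_head *bh)
    {
            return test_bit(BH_Shadow, &bh->b_state);
    }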
@@ -382,8 +405,15 @@ struct jbd2_revoke_table_s; | |||
382 | 405 | ||
383 | struct jbd2_journal_handle | 406 | struct jbd2_journal_handle |
384 | { | 407 | { |
385 | /* Which compound transaction is this update a part of? */ | 408 | union { |
386 | transaction_t *h_transaction; | 409 | /* Which compound transaction is this update a part of? */ |
410 | transaction_t *h_transaction; | ||
411 | /* Which journal the handle belongs to - used iff h_reserved is set */ | ||
412 | journal_t *h_journal; | ||
413 | }; | ||
414 | |||
415 | /* Handle reserved for finishing the logical operation */ | ||
416 | handle_t *h_rsv_handle; | ||
387 | 417 | ||
388 | /* Number of remaining buffers we are allowed to dirty: */ | 418 | /* Number of remaining buffers we are allowed to dirty: */ |
389 | int h_buffer_credits; | 419 | int h_buffer_credits; |
@@ -398,6 +428,7 @@ struct jbd2_journal_handle | |||
398 | /* Flags [no locking] */ | 428 | /* Flags [no locking] */ |
399 | unsigned int h_sync: 1; /* sync-on-close */ | 429 | unsigned int h_sync: 1; /* sync-on-close */ |
400 | unsigned int h_jdata: 1; /* force data journaling */ | 430 | unsigned int h_jdata: 1; /* force data journaling */ |
431 | unsigned int h_reserved: 1; /* handle with reserved credits */ | ||
401 | unsigned int h_aborted: 1; /* fatal error on handle */ | 432 | unsigned int h_aborted: 1; /* fatal error on handle */ |
402 | unsigned int h_type: 8; /* for handle statistics */ | 433 | unsigned int h_type: 8; /* for handle statistics */ |
403 | unsigned int h_line_no: 16; /* for handle statistics */ | 434 | unsigned int h_line_no: 16; /* for handle statistics */ |
@@ -524,12 +555,6 @@ struct transaction_s | |||
524 | struct journal_head *t_checkpoint_io_list; | 555 | struct journal_head *t_checkpoint_io_list; |
525 | 556 | ||
526 | /* | 557 | /* |
527 | * Doubly-linked circular list of temporary buffers currently undergoing | ||
528 | * IO in the log [j_list_lock] | ||
529 | */ | ||
530 | struct journal_head *t_iobuf_list; | ||
531 | |||
532 | /* | ||
533 | * Doubly-linked circular list of metadata buffers being shadowed by log | 558 | * Doubly-linked circular list of metadata buffers being shadowed by log |
534 | * IO. The IO buffers on the iobuf list and the shadow buffers on this | 559 | * IO. The IO buffers on the iobuf list and the shadow buffers on this |
535 | * list match each other one for one at all times. [j_list_lock] | 560 | * list match each other one for one at all times. [j_list_lock] |
@@ -537,12 +562,6 @@ struct transaction_s | |||
537 | struct journal_head *t_shadow_list; | 562 | struct journal_head *t_shadow_list; |
538 | 563 | ||
539 | /* | 564 | /* |
540 | * Doubly-linked circular list of control buffers being written to the | ||
541 | * log. [j_list_lock] | ||
542 | */ | ||
543 | struct journal_head *t_log_list; | ||
544 | |||
545 | /* | ||
546 | * List of inodes whose data we've modified in data=ordered mode. | 565 | * List of inodes whose data we've modified in data=ordered mode. |
547 | * [j_list_lock] | 566 | * [j_list_lock] |
548 | */ | 567 | */ |
@@ -671,11 +690,10 @@ jbd2_time_diff(unsigned long start, unsigned long end) | |||
671 | * waiting for checkpointing | 690 | * waiting for checkpointing |
672 | * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction | 691 | * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction |
673 | * to start committing, or for a barrier lock to be released | 692 | * to start committing, or for a barrier lock to be released |
674 | * @j_wait_logspace: Wait queue for waiting for checkpointing to complete | ||
675 | * @j_wait_done_commit: Wait queue for waiting for commit to complete | 693 | * @j_wait_done_commit: Wait queue for waiting for commit to complete |
676 | * @j_wait_checkpoint: Wait queue to trigger checkpointing | ||
677 | * @j_wait_commit: Wait queue to trigger commit | 694 | * @j_wait_commit: Wait queue to trigger commit |
678 | * @j_wait_updates: Wait queue to wait for updates to complete | 695 | * @j_wait_updates: Wait queue to wait for updates to complete |
696 | * @j_wait_reserved: Wait queue to wait for reserved buffer credits to drop | ||
679 | * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints | 697 | * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints |
680 | * @j_head: Journal head - identifies the first unused block in the journal | 698 | * @j_head: Journal head - identifies the first unused block in the journal |
681 | * @j_tail: Journal tail - identifies the oldest still-used block in the | 699 | * @j_tail: Journal tail - identifies the oldest still-used block in the |
@@ -689,6 +707,7 @@ jbd2_time_diff(unsigned long start, unsigned long end) | |||
689 | * journal | 707 | * journal |
690 | * @j_fs_dev: Device which holds the client fs. For internal journal this will | 708 | * @j_fs_dev: Device which holds the client fs. For internal journal this will |
691 | * be equal to j_dev | 709 | * be equal to j_dev |
710 | * @j_reserved_credits: Number of buffers reserved from the running transaction | ||
692 | * @j_maxlen: Total maximum capacity of the journal region on disk. | 711 | * @j_maxlen: Total maximum capacity of the journal region on disk. |
693 | * @j_list_lock: Protects the buffer lists and internal buffer state. | 712 | * @j_list_lock: Protects the buffer lists and internal buffer state. |
694 | * @j_inode: Optional inode where we store the journal. If present, all journal | 713 | * @j_inode: Optional inode where we store the journal. If present, all journal |
@@ -778,21 +797,18 @@ struct journal_s | |||
778 | */ | 797 | */ |
779 | wait_queue_head_t j_wait_transaction_locked; | 798 | wait_queue_head_t j_wait_transaction_locked; |
780 | 799 | ||
781 | /* Wait queue for waiting for checkpointing to complete */ | ||
782 | wait_queue_head_t j_wait_logspace; | ||
783 | |||
784 | /* Wait queue for waiting for commit to complete */ | 800 | /* Wait queue for waiting for commit to complete */ |
785 | wait_queue_head_t j_wait_done_commit; | 801 | wait_queue_head_t j_wait_done_commit; |
786 | 802 | ||
787 | /* Wait queue to trigger checkpointing */ | ||
788 | wait_queue_head_t j_wait_checkpoint; | ||
789 | |||
790 | /* Wait queue to trigger commit */ | 803 | /* Wait queue to trigger commit */ |
791 | wait_queue_head_t j_wait_commit; | 804 | wait_queue_head_t j_wait_commit; |
792 | 805 | ||
793 | /* Wait queue to wait for updates to complete */ | 806 | /* Wait queue to wait for updates to complete */ |
794 | wait_queue_head_t j_wait_updates; | 807 | wait_queue_head_t j_wait_updates; |
795 | 808 | ||
809 | /* Wait queue to wait for reserved buffer credits to drop */ | ||
810 | wait_queue_head_t j_wait_reserved; | ||
811 | |||
796 | /* Semaphore for locking against concurrent checkpoints */ | 812 | /* Semaphore for locking against concurrent checkpoints */ |
797 | struct mutex j_checkpoint_mutex; | 813 | struct mutex j_checkpoint_mutex; |
798 | 814 | ||
@@ -847,6 +863,9 @@ struct journal_s | |||
847 | /* Total maximum capacity of the journal region on disk. */ | 863 | /* Total maximum capacity of the journal region on disk. */ |
848 | unsigned int j_maxlen; | 864 | unsigned int j_maxlen; |
849 | 865 | ||
866 | /* Number of buffers reserved from the running transaction */ | ||
867 | atomic_t j_reserved_credits; | ||
868 | |||
850 | /* | 869 | /* |
851 | * Protects the buffer lists and internal buffer state. | 870 | * Protects the buffer lists and internal buffer state. |
852 | */ | 871 | */ |
@@ -991,9 +1010,17 @@ extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, i | |||
991 | extern void __journal_free_buffer(struct journal_head *bh); | 1010 | extern void __journal_free_buffer(struct journal_head *bh); |
992 | extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); | 1011 | extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); |
993 | extern void __journal_clean_data_list(transaction_t *transaction); | 1012 | extern void __journal_clean_data_list(transaction_t *transaction); |
1013 | static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) | ||
1014 | { | ||
1015 | list_add_tail(&bh->b_assoc_buffers, head); | ||
1016 | } | ||
1017 | static inline void jbd2_unfile_log_bh(struct buffer_head *bh) | ||
1018 | { | ||
1019 | list_del_init(&bh->b_assoc_buffers); | ||
1020 | } | ||
994 | 1021 | ||
995 | /* Log buffer allocation */ | 1022 | /* Log buffer allocation */ |
996 | extern struct journal_head * jbd2_journal_get_descriptor_buffer(journal_t *); | 1023 | struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); |
997 | int jbd2_journal_next_log_block(journal_t *, unsigned long long *); | 1024 | int jbd2_journal_next_log_block(journal_t *, unsigned long long *); |
998 | int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, | 1025 | int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, |
999 | unsigned long *block); | 1026 | unsigned long *block); |
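With the journal_head-based IO and log-control lists gone, commit code can carry log buffers on an ordinary list_head threaded through b_assoc_buffers. An illustrative sketch (the local list name is hypothetical):

	LIST_HEAD(log_bufs);			/* hypothetical local list */

	jbd2_file_log_bh(&log_bufs, bh);	/* queue bh for log IO */
	/* ... submit the IO and wait for it ... */
	jbd2_unfile_log_bh(bh);			/* unthread it afterwards */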
@@ -1039,11 +1066,10 @@ extern void jbd2_buffer_abort_trigger(struct journal_head *jh, | |||
1039 | struct jbd2_buffer_trigger_type *triggers); | 1066 | struct jbd2_buffer_trigger_type *triggers); |
1040 | 1067 | ||
1041 | /* Buffer IO */ | 1068 | /* Buffer IO */ |
1042 | extern int | 1069 | extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, |
1043 | jbd2_journal_write_metadata_buffer(transaction_t *transaction, | 1070 | struct journal_head *jh_in, |
1044 | struct journal_head *jh_in, | 1071 | struct buffer_head **bh_out, |
1045 | struct journal_head **jh_out, | 1072 | sector_t blocknr); |
1046 | unsigned long long blocknr); | ||
1047 | 1073 | ||
1048 | /* Transaction locking */ | 1074 | /* Transaction locking */ |
1049 | extern void __wait_on_journal (journal_t *); | 1075 | extern void __wait_on_journal (journal_t *); |
@@ -1076,10 +1102,14 @@ static inline handle_t *journal_current_handle(void) | |||
1076 | */ | 1102 | */ |
1077 | 1103 | ||
1078 | extern handle_t *jbd2_journal_start(journal_t *, int nblocks); | 1104 | extern handle_t *jbd2_journal_start(journal_t *, int nblocks); |
1079 | extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask, | 1105 | extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, |
1080 | unsigned int type, unsigned int line_no); | 1106 | gfp_t gfp_mask, unsigned int type, |
1107 | unsigned int line_no); | ||
1081 | extern int jbd2_journal_restart(handle_t *, int nblocks); | 1108 | extern int jbd2_journal_restart(handle_t *, int nblocks); |
1082 | extern int jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask); | 1109 | extern int jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask); |
1110 | extern int jbd2_journal_start_reserved(handle_t *handle, | ||
1111 | unsigned int type, unsigned int line_no); | ||
1112 | extern void jbd2_journal_free_reserved(handle_t *handle); | ||
1083 | extern int jbd2_journal_extend (handle_t *, int nblocks); | 1113 | extern int jbd2_journal_extend (handle_t *, int nblocks); |
1084 | extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); | 1114 | extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); |
1085 | extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); | 1115 | extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); |
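A plausible lifecycle for the reserved-credit interface, assuming only what this header shows (the zero type and line_no arguments are illustrative): jbd2__journal_start() sets rsv_blocks aside and hangs the pre-reserved handle off h_rsv_handle; jbd2_journal_start_reserved() later binds those credits to the running transaction, while jbd2_journal_free_reserved() discards a reservation that turns out to be unneeded.

	/* Sketch under the assumptions above; error handling abbreviated. */
	handle = jbd2__journal_start(journal, blocks, rsv_blocks,
				     GFP_NOFS, 0 /* type */, 0 /* line_no */);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	rsv = handle->h_rsv_handle;	/* reserved for the final step */

	/* ... the main part of the operation runs under "handle" ... */

	/* Either bind the reserved credits to the running transaction: */
	err = jbd2_journal_start_reserved(rsv, 0 /* type */, 0 /* line_no */);
	/* ... or, if the final step is never needed:
	 * jbd2_journal_free_reserved(rsv); */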
@@ -1090,7 +1120,7 @@ extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); | |||
1090 | extern int jbd2_journal_forget (handle_t *, struct buffer_head *); | 1120 | extern int jbd2_journal_forget (handle_t *, struct buffer_head *); |
1091 | extern void journal_sync_buffer (struct buffer_head *); | 1121 | extern void journal_sync_buffer (struct buffer_head *); |
1092 | extern int jbd2_journal_invalidatepage(journal_t *, | 1122 | extern int jbd2_journal_invalidatepage(journal_t *, |
1093 | struct page *, unsigned long); | 1123 | struct page *, unsigned int, unsigned int); |
1094 | extern int jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); | 1124 | extern int jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); |
1095 | extern int jbd2_journal_stop(handle_t *); | 1125 | extern int jbd2_journal_stop(handle_t *); |
1096 | extern int jbd2_journal_flush (journal_t *); | 1126 | extern int jbd2_journal_flush (journal_t *); |
@@ -1125,6 +1155,7 @@ extern void jbd2_journal_ack_err (journal_t *); | |||
1125 | extern int jbd2_journal_clear_err (journal_t *); | 1155 | extern int jbd2_journal_clear_err (journal_t *); |
1126 | extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); | 1156 | extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); |
1127 | extern int jbd2_journal_force_commit(journal_t *); | 1157 | extern int jbd2_journal_force_commit(journal_t *); |
1158 | extern int jbd2_journal_force_commit_nested(journal_t *); | ||
1128 | extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); | 1159 | extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); |
1129 | extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, | 1160 | extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, |
1130 | struct jbd2_inode *inode, loff_t new_size); | 1161 | struct jbd2_inode *inode, loff_t new_size); |
@@ -1178,8 +1209,10 @@ extern int jbd2_journal_init_revoke_caches(void); | |||
1178 | extern void jbd2_journal_destroy_revoke(journal_t *); | 1209 | extern void jbd2_journal_destroy_revoke(journal_t *); |
1179 | extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); | 1210 | extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); |
1180 | extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); | 1211 | extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); |
1181 | extern void jbd2_journal_write_revoke_records(journal_t *, | 1212 | extern void jbd2_journal_write_revoke_records(journal_t *journal, |
1182 | transaction_t *, int); | 1213 | transaction_t *transaction, |
1214 | struct list_head *log_bufs, | ||
1215 | int write_op); | ||
1183 | 1216 | ||
1184 | /* Recovery revoke support */ | 1217 | /* Recovery revoke support */ |
1185 | extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); | 1218 | extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); |
@@ -1195,11 +1228,9 @@ extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); | |||
1195 | * transitions on demand. | 1228 | * transitions on demand. |
1196 | */ | 1229 | */ |
1197 | 1230 | ||
1198 | int __jbd2_log_space_left(journal_t *); /* Called with journal locked */ | ||
1199 | int jbd2_log_start_commit(journal_t *journal, tid_t tid); | 1231 | int jbd2_log_start_commit(journal_t *journal, tid_t tid); |
1200 | int __jbd2_log_start_commit(journal_t *journal, tid_t tid); | 1232 | int __jbd2_log_start_commit(journal_t *journal, tid_t tid); |
1201 | int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); | 1233 | int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); |
1202 | int jbd2_journal_force_commit_nested(journal_t *journal); | ||
1203 | int jbd2_log_wait_commit(journal_t *journal, tid_t tid); | 1234 | int jbd2_log_wait_commit(journal_t *journal, tid_t tid); |
1204 | int jbd2_complete_transaction(journal_t *journal, tid_t tid); | 1235 | int jbd2_complete_transaction(journal_t *journal, tid_t tid); |
1205 | int jbd2_log_do_checkpoint(journal_t *journal); | 1236 | int jbd2_log_do_checkpoint(journal_t *journal); |
@@ -1235,7 +1266,7 @@ static inline int is_journal_aborted(journal_t *journal) | |||
1235 | 1266 | ||
1236 | static inline int is_handle_aborted(handle_t *handle) | 1267 | static inline int is_handle_aborted(handle_t *handle) |
1237 | { | 1268 | { |
1238 | if (handle->h_aborted) | 1269 | if (handle->h_aborted || !handle->h_transaction) |
1239 | return 1; | 1270 | return 1; |
1240 | return is_journal_aborted(handle->h_transaction->t_journal); | 1271 | return is_journal_aborted(handle->h_transaction->t_journal); |
1241 | } | 1272 | } |
@@ -1266,16 +1297,37 @@ extern int jbd2_journal_blocks_per_page(struct inode *inode); | |||
1266 | extern size_t journal_tag_bytes(journal_t *journal); | 1297 | extern size_t journal_tag_bytes(journal_t *journal); |
1267 | 1298 | ||
1268 | /* | 1299 | /* |
1300 | * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for | ||
1301 | * transaction control blocks. | ||
1302 | */ | ||
1303 | #define JBD2_CONTROL_BLOCKS_SHIFT 5 | ||
1304 | |||
1305 | /* | ||
1269 | * Return the minimum number of blocks which must be free in the journal | 1306 | * Return the minimum number of blocks which must be free in the journal |
1270 | * before a new transaction may be started. Must be called under j_state_lock. | 1307 | * before a new transaction may be started. Must be called under j_state_lock. |
1271 | */ | 1308 | */ |
1272 | static inline int jbd_space_needed(journal_t *journal) | 1309 | static inline int jbd2_space_needed(journal_t *journal) |
1273 | { | 1310 | { |
1274 | int nblocks = journal->j_max_transaction_buffers; | 1311 | int nblocks = journal->j_max_transaction_buffers; |
1275 | if (journal->j_committing_transaction) | 1312 | return nblocks + (nblocks >> JBD2_CONTROL_BLOCKS_SHIFT); |
1276 | nblocks += atomic_read(&journal->j_committing_transaction-> | 1313 | } |
1277 | t_outstanding_credits); | 1314 | |
1278 | return nblocks; | 1315 | /* |
1316 | * Return number of free blocks in the log. Must be called under j_state_lock. | ||
1317 | */ | ||
1318 | static inline unsigned long jbd2_log_space_left(journal_t *journal) | ||
1319 | { | ||
1320 | /* Allow for rounding errors */ | ||
1321 | unsigned long free = journal->j_free - 32; | ||
1322 | |||
1323 | if (journal->j_committing_transaction) { | ||
1324 | unsigned long committing = atomic_read(&journal-> | ||
1325 | j_committing_transaction->t_outstanding_credits); | ||
1326 | |||
1327 | /* Transaction + control blocks */ | ||
1328 | free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT); | ||
1329 | } | ||
1330 | return free; | ||
1279 | } | 1331 | } |
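With JBD2_CONTROL_BLOCKS_SHIFT == 5, the control-block overhead is one block per 32 transaction blocks. Worked numbers (illustrative only):

	nblocks = 8192;				/* j_max_transaction_buffers */
	needed  = nblocks + (nblocks >> 5);	/* 8192 + 256 = 8448 */

	free  = journal->j_free - 32;		/* fixed rounding allowance */
	free -= committing + (committing >> 5);	/* e.g. 1024 + 32 = 1056 */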
1280 | 1332 | ||
1281 | /* | 1333 | /* |
@@ -1286,11 +1338,9 @@ static inline int jbd_space_needed(journal_t *journal) | |||
1286 | #define BJ_None 0 /* Not journaled */ | 1338 | #define BJ_None 0 /* Not journaled */ |
1287 | #define BJ_Metadata 1 /* Normal journaled metadata */ | 1339 | #define BJ_Metadata 1 /* Normal journaled metadata */ |
1288 | #define BJ_Forget 2 /* Buffer superseded by this transaction */ | 1340 | #define BJ_Forget 2 /* Buffer superseded by this transaction */ |
1289 | #define BJ_IO 3 /* Buffer is for temporary IO use */ | 1341 | #define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ |
1290 | #define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ | 1342 | #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ |
1291 | #define BJ_LogCtl 5 /* Buffer contains log descriptors */ | 1343 | #define BJ_Types 5 |
1292 | #define BJ_Reserved 6 /* Buffer is reserved for access by journal */ | ||
1293 | #define BJ_Types 7 | ||
1294 | 1344 | ||
1295 | extern int jbd_blocks_per_page(struct inode *inode); | 1345 | extern int jbd_blocks_per_page(struct inode *inode); |
1296 | 1346 | ||
@@ -1319,6 +1369,19 @@ static inline u32 jbd2_chksum(journal_t *journal, u32 crc, | |||
1319 | return *(u32 *)desc.ctx; | 1369 | return *(u32 *)desc.ctx; |
1320 | } | 1370 | } |
1321 | 1371 | ||
1372 | /* Return most recent uncommitted transaction */ | ||
1373 | static inline tid_t jbd2_get_latest_transaction(journal_t *journal) | ||
1374 | { | ||
1375 | tid_t tid; | ||
1376 | |||
1377 | read_lock(&journal->j_state_lock); | ||
1378 | tid = journal->j_commit_request; | ||
1379 | if (journal->j_running_transaction) | ||
1380 | tid = journal->j_running_transaction->t_tid; | ||
1381 | read_unlock(&journal->j_state_lock); | ||
1382 | return tid; | ||
1383 | } | ||
1384 | |||
1322 | #ifdef __KERNEL__ | 1385 | #ifdef __KERNEL__ |
1323 | 1386 | ||
1324 | #define buffer_trace_init(bh) do {} while (0) | 1387 | #define buffer_trace_init(bh) do {} while (0) |
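jbd2_get_latest_transaction() pairs naturally with the commit primitives declared earlier in this header; a hedged fsync-style sketch:

	tid_t tid = jbd2_get_latest_transaction(journal);

	jbd2_log_start_commit(journal, tid);	/* kick a commit if needed */
	jbd2_log_wait_commit(journal, tid);	/* block until it completes */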
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h index 6133679bc4c0..3dc53432355f 100644 --- a/include/linux/jbd_common.h +++ b/include/linux/jbd_common.h | |||
@@ -1,31 +1,7 @@ | |||
1 | #ifndef _LINUX_JBD_STATE_H | 1 | #ifndef _LINUX_JBD_STATE_H |
2 | #define _LINUX_JBD_STATE_H | 2 | #define _LINUX_JBD_STATE_H |
3 | 3 | ||
4 | enum jbd_state_bits { | 4 | #include <linux/bit_spinlock.h> |
5 | BH_JBD /* Has an attached ext3 journal_head */ | ||
6 | = BH_PrivateStart, | ||
7 | BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ | ||
8 | BH_Freed, /* Has been freed (truncated) */ | ||
9 | BH_Revoked, /* Has been revoked from the log */ | ||
10 | BH_RevokeValid, /* Revoked flag is valid */ | ||
11 | BH_JBDDirty, /* Is dirty but journaled */ | ||
12 | BH_State, /* Pins most journal_head state */ | ||
13 | BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ | ||
14 | BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ | ||
15 | BH_Verified, /* Metadata block has been verified ok */ | ||
16 | BH_JBDPrivateStart, /* First bit available for private use by FS */ | ||
17 | }; | ||
18 | |||
19 | BUFFER_FNS(JBD, jbd) | ||
20 | BUFFER_FNS(JWrite, jwrite) | ||
21 | BUFFER_FNS(JBDDirty, jbddirty) | ||
22 | TAS_BUFFER_FNS(JBDDirty, jbddirty) | ||
23 | BUFFER_FNS(Revoked, revoked) | ||
24 | TAS_BUFFER_FNS(Revoked, revoked) | ||
25 | BUFFER_FNS(RevokeValid, revokevalid) | ||
26 | TAS_BUFFER_FNS(RevokeValid, revokevalid) | ||
27 | BUFFER_FNS(Freed, freed) | ||
28 | BUFFER_FNS(Verified, verified) | ||
29 | 5 | ||
30 | static inline struct buffer_head *jh2bh(struct journal_head *jh) | 6 | static inline struct buffer_head *jh2bh(struct journal_head *jh) |
31 | { | 7 | { |
diff --git a/include/linux/mm.h b/include/linux/mm.h index e0c8528a41a4..66d881f1d576 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1041,7 +1041,8 @@ int get_kernel_page(unsigned long start, int write, struct page **pages); | |||
1041 | struct page *get_dump_page(unsigned long addr); | 1041 | struct page *get_dump_page(unsigned long addr); |
1042 | 1042 | ||
1043 | extern int try_to_release_page(struct page * page, gfp_t gfp_mask); | 1043 | extern int try_to_release_page(struct page * page, gfp_t gfp_mask); |
1044 | extern void do_invalidatepage(struct page *page, unsigned long offset); | 1044 | extern void do_invalidatepage(struct page *page, unsigned int offset, |
1045 | unsigned int length); | ||
1045 | 1046 | ||
1046 | int __set_page_dirty_nobuffers(struct page *page); | 1047 | int __set_page_dirty_nobuffers(struct page *page); |
1047 | int __set_page_dirty_no_writeback(struct page *page); | 1048 | int __set_page_dirty_no_writeback(struct page *page); |
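The added length parameter lets callers invalidate a sub-range of a page instead of everything from offset to the page end. Illustrative calls (the byte values are examples; the whole-page form matches the mm/readahead.c conversion below):

	do_invalidatepage(page, 0, PAGE_CACHE_SIZE);	/* whole page */
	do_invalidatepage(page, 512, 1024);		/* bytes 512..1535 only */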
diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h index 15d11a39be47..6797b9de90ed 100644 --- a/include/trace/events/ext3.h +++ b/include/trace/events/ext3.h | |||
@@ -290,13 +290,14 @@ DEFINE_EVENT(ext3__page_op, ext3_releasepage, | |||
290 | ); | 290 | ); |
291 | 291 | ||
292 | TRACE_EVENT(ext3_invalidatepage, | 292 | TRACE_EVENT(ext3_invalidatepage, |
293 | TP_PROTO(struct page *page, unsigned long offset), | 293 | TP_PROTO(struct page *page, unsigned int offset, unsigned int length), |
294 | 294 | ||
295 | TP_ARGS(page, offset), | 295 | TP_ARGS(page, offset, length), |
296 | 296 | ||
297 | TP_STRUCT__entry( | 297 | TP_STRUCT__entry( |
298 | __field( pgoff_t, index ) | 298 | __field( pgoff_t, index ) |
299 | __field( unsigned long, offset ) | 299 | __field( unsigned int, offset ) |
300 | __field( unsigned int, length ) | ||
300 | __field( ino_t, ino ) | 301 | __field( ino_t, ino ) |
301 | __field( dev_t, dev ) | 302 | __field( dev_t, dev ) |
302 | 303 | ||
@@ -305,14 +306,15 @@ TRACE_EVENT(ext3_invalidatepage, | |||
305 | TP_fast_assign( | 306 | TP_fast_assign( |
306 | __entry->index = page->index; | 307 | __entry->index = page->index; |
307 | __entry->offset = offset; | 308 | __entry->offset = offset; |
309 | __entry->length = length; | ||
308 | __entry->ino = page->mapping->host->i_ino; | 310 | __entry->ino = page->mapping->host->i_ino; |
309 | __entry->dev = page->mapping->host->i_sb->s_dev; | 311 | __entry->dev = page->mapping->host->i_sb->s_dev; |
310 | ), | 312 | ), |
311 | 313 | ||
312 | TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", | 314 | TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u", |
313 | MAJOR(__entry->dev), MINOR(__entry->dev), | 315 | MAJOR(__entry->dev), MINOR(__entry->dev), |
314 | (unsigned long) __entry->ino, | 316 | (unsigned long) __entry->ino, |
315 | __entry->index, __entry->offset) | 317 | __entry->index, __entry->offset, __entry->length) |
316 | ); | 318 | ); |
317 | 319 | ||
318 | TRACE_EVENT(ext3_discard_blocks, | 320 | TRACE_EVENT(ext3_discard_blocks, |
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 8ee15b97cd38..2068db241f22 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h | |||
@@ -19,6 +19,57 @@ struct extent_status; | |||
19 | 19 | ||
20 | #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) | 20 | #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) |
21 | 21 | ||
22 | #define show_mballoc_flags(flags) __print_flags(flags, "|", \ | ||
23 | { EXT4_MB_HINT_MERGE, "HINT_MERGE" }, \ | ||
24 | { EXT4_MB_HINT_RESERVED, "HINT_RESV" }, \ | ||
25 | { EXT4_MB_HINT_METADATA, "HINT_MDATA" }, \ | ||
26 | { EXT4_MB_HINT_FIRST, "HINT_FIRST" }, \ | ||
27 | { EXT4_MB_HINT_BEST, "HINT_BEST" }, \ | ||
28 | { EXT4_MB_HINT_DATA, "HINT_DATA" }, \ | ||
29 | { EXT4_MB_HINT_NOPREALLOC, "HINT_NOPREALLOC" }, \ | ||
30 | { EXT4_MB_HINT_GROUP_ALLOC, "HINT_GRP_ALLOC" }, \ | ||
31 | { EXT4_MB_HINT_GOAL_ONLY, "HINT_GOAL_ONLY" }, \ | ||
32 | { EXT4_MB_HINT_TRY_GOAL, "HINT_TRY_GOAL" }, \ | ||
33 | { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" }, \ | ||
34 | { EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \ | ||
35 | { EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \ | ||
36 | { EXT4_MB_USE_RESERVED, "USE_RESV" }) | ||
37 | |||
38 | #define show_map_flags(flags) __print_flags(flags, "|", \ | ||
39 | { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ | ||
40 | { EXT4_GET_BLOCKS_UNINIT_EXT, "UNINIT" }, \ | ||
41 | { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ | ||
42 | { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ | ||
43 | { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ | ||
44 | { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ | ||
45 | { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ | ||
46 | { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \ | ||
47 | { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" }, \ | ||
48 | { EXT4_GET_BLOCKS_NO_PUT_HOLE, "NO_PUT_HOLE" }) | ||
49 | |||
50 | #define show_mflags(flags) __print_flags(flags, "", \ | ||
51 | { EXT4_MAP_NEW, "N" }, \ | ||
52 | { EXT4_MAP_MAPPED, "M" }, \ | ||
53 | { EXT4_MAP_UNWRITTEN, "U" }, \ | ||
54 | { EXT4_MAP_BOUNDARY, "B" }, \ | ||
55 | { EXT4_MAP_UNINIT, "u" }, \ | ||
56 | { EXT4_MAP_FROM_CLUSTER, "C" }) | ||
57 | |||
58 | #define show_free_flags(flags) __print_flags(flags, "|", \ | ||
59 | { EXT4_FREE_BLOCKS_METADATA, "METADATA" }, \ | ||
60 | { EXT4_FREE_BLOCKS_FORGET, "FORGET" }, \ | ||
61 | { EXT4_FREE_BLOCKS_VALIDATED, "VALIDATED" }, \ | ||
62 | { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE, "NO_QUOTA" }, \ | ||
63 | { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\ | ||
64 | { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER, "LAST_CLUSTER" }) | ||
65 | |||
66 | #define show_extent_status(status) __print_flags(status, "", \ | ||
67 | { (1 << 3), "W" }, \ | ||
68 | { (1 << 2), "U" }, \ | ||
69 | { (1 << 1), "D" }, \ | ||
70 | { (1 << 0), "H" }) | ||
71 | |||
72 | |||
22 | TRACE_EVENT(ext4_free_inode, | 73 | TRACE_EVENT(ext4_free_inode, |
23 | TP_PROTO(struct inode *inode), | 74 | TP_PROTO(struct inode *inode), |
24 | 75 | ||
@@ -281,7 +332,7 @@ DEFINE_EVENT(ext4__write_end, ext4_da_write_end, | |||
281 | TP_ARGS(inode, pos, len, copied) | 332 | TP_ARGS(inode, pos, len, copied) |
282 | ); | 333 | ); |
283 | 334 | ||
284 | TRACE_EVENT(ext4_da_writepages, | 335 | TRACE_EVENT(ext4_writepages, |
285 | TP_PROTO(struct inode *inode, struct writeback_control *wbc), | 336 | TP_PROTO(struct inode *inode, struct writeback_control *wbc), |
286 | 337 | ||
287 | TP_ARGS(inode, wbc), | 338 | TP_ARGS(inode, wbc), |
@@ -324,46 +375,62 @@ TRACE_EVENT(ext4_da_writepages, | |||
324 | ); | 375 | ); |
325 | 376 | ||
326 | TRACE_EVENT(ext4_da_write_pages, | 377 | TRACE_EVENT(ext4_da_write_pages, |
327 | TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), | 378 | TP_PROTO(struct inode *inode, pgoff_t first_page, |
379 | struct writeback_control *wbc), | ||
328 | 380 | ||
329 | TP_ARGS(inode, mpd), | 381 | TP_ARGS(inode, first_page, wbc), |
330 | 382 | ||
331 | TP_STRUCT__entry( | 383 | TP_STRUCT__entry( |
332 | __field( dev_t, dev ) | 384 | __field( dev_t, dev ) |
333 | __field( ino_t, ino ) | 385 | __field( ino_t, ino ) |
334 | __field( __u64, b_blocknr ) | 386 | __field( pgoff_t, first_page ) |
335 | __field( __u32, b_size ) | 387 | __field( long, nr_to_write ) |
336 | __field( __u32, b_state ) | 388 | __field( int, sync_mode ) |
337 | __field( unsigned long, first_page ) | ||
338 | __field( int, io_done ) | ||
339 | __field( int, pages_written ) | ||
340 | __field( int, sync_mode ) | ||
341 | ), | 389 | ), |
342 | 390 | ||
343 | TP_fast_assign( | 391 | TP_fast_assign( |
344 | __entry->dev = inode->i_sb->s_dev; | 392 | __entry->dev = inode->i_sb->s_dev; |
345 | __entry->ino = inode->i_ino; | 393 | __entry->ino = inode->i_ino; |
346 | __entry->b_blocknr = mpd->b_blocknr; | 394 | __entry->first_page = first_page; |
347 | __entry->b_size = mpd->b_size; | 395 | __entry->nr_to_write = wbc->nr_to_write; |
348 | __entry->b_state = mpd->b_state; | 396 | __entry->sync_mode = wbc->sync_mode; |
349 | __entry->first_page = mpd->first_page; | ||
350 | __entry->io_done = mpd->io_done; | ||
351 | __entry->pages_written = mpd->pages_written; | ||
352 | __entry->sync_mode = mpd->wbc->sync_mode; | ||
353 | ), | 397 | ), |
354 | 398 | ||
355 | TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x " | 399 | TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld " |
356 | "first_page %lu io_done %d pages_written %d sync_mode %d", | 400 | "sync_mode %d", |
357 | MAJOR(__entry->dev), MINOR(__entry->dev), | 401 | MAJOR(__entry->dev), MINOR(__entry->dev), |
358 | (unsigned long) __entry->ino, | 402 | (unsigned long) __entry->ino, __entry->first_page, |
359 | __entry->b_blocknr, __entry->b_size, | 403 | __entry->nr_to_write, __entry->sync_mode) |
360 | __entry->b_state, __entry->first_page, | ||
361 | __entry->io_done, __entry->pages_written, | ||
362 | __entry->sync_mode | ||
363 | ) | ||
364 | ); | 404 | ); |
365 | 405 | ||
366 | TRACE_EVENT(ext4_da_writepages_result, | 406 | TRACE_EVENT(ext4_da_write_pages_extent, |
407 | TP_PROTO(struct inode *inode, struct ext4_map_blocks *map), | ||
408 | |||
409 | TP_ARGS(inode, map), | ||
410 | |||
411 | TP_STRUCT__entry( | ||
412 | __field( dev_t, dev ) | ||
413 | __field( ino_t, ino ) | ||
414 | __field( __u64, lblk ) | ||
415 | __field( __u32, len ) | ||
416 | __field( __u32, flags ) | ||
417 | ), | ||
418 | |||
419 | TP_fast_assign( | ||
420 | __entry->dev = inode->i_sb->s_dev; | ||
421 | __entry->ino = inode->i_ino; | ||
422 | __entry->lblk = map->m_lblk; | ||
423 | __entry->len = map->m_len; | ||
424 | __entry->flags = map->m_flags; | ||
425 | ), | ||
426 | |||
427 | TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s", | ||
428 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
429 | (unsigned long) __entry->ino, __entry->lblk, __entry->len, | ||
430 | show_mflags(__entry->flags)) | ||
431 | ); | ||
432 | |||
433 | TRACE_EVENT(ext4_writepages_result, | ||
367 | TP_PROTO(struct inode *inode, struct writeback_control *wbc, | 434 | TP_PROTO(struct inode *inode, struct writeback_control *wbc, |
368 | int ret, int pages_written), | 435 | int ret, int pages_written), |
369 | 436 | ||
@@ -444,16 +511,16 @@ DEFINE_EVENT(ext4__page_op, ext4_releasepage, | |||
444 | ); | 511 | ); |
445 | 512 | ||
446 | DECLARE_EVENT_CLASS(ext4_invalidatepage_op, | 513 | DECLARE_EVENT_CLASS(ext4_invalidatepage_op, |
447 | TP_PROTO(struct page *page, unsigned long offset), | 514 | TP_PROTO(struct page *page, unsigned int offset, unsigned int length), |
448 | 515 | ||
449 | TP_ARGS(page, offset), | 516 | TP_ARGS(page, offset, length), |
450 | 517 | ||
451 | TP_STRUCT__entry( | 518 | TP_STRUCT__entry( |
452 | __field( dev_t, dev ) | 519 | __field( dev_t, dev ) |
453 | __field( ino_t, ino ) | 520 | __field( ino_t, ino ) |
454 | __field( pgoff_t, index ) | 521 | __field( pgoff_t, index ) |
455 | __field( unsigned long, offset ) | 522 | __field( unsigned int, offset ) |
456 | 523 | __field( unsigned int, length ) | |
457 | ), | 524 | ), |
458 | 525 | ||
459 | TP_fast_assign( | 526 | TP_fast_assign( |
@@ -461,24 +528,26 @@ DECLARE_EVENT_CLASS(ext4_invalidatepage_op, | |||
461 | __entry->ino = page->mapping->host->i_ino; | 528 | __entry->ino = page->mapping->host->i_ino; |
462 | __entry->index = page->index; | 529 | __entry->index = page->index; |
463 | __entry->offset = offset; | 530 | __entry->offset = offset; |
531 | __entry->length = length; | ||
464 | ), | 532 | ), |
465 | 533 | ||
466 | TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", | 534 | TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u", |
467 | MAJOR(__entry->dev), MINOR(__entry->dev), | 535 | MAJOR(__entry->dev), MINOR(__entry->dev), |
468 | (unsigned long) __entry->ino, | 536 | (unsigned long) __entry->ino, |
469 | (unsigned long) __entry->index, __entry->offset) | 537 | (unsigned long) __entry->index, |
538 | __entry->offset, __entry->length) | ||
470 | ); | 539 | ); |
471 | 540 | ||
472 | DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage, | 541 | DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage, |
473 | TP_PROTO(struct page *page, unsigned long offset), | 542 | TP_PROTO(struct page *page, unsigned int offset, unsigned int length), |
474 | 543 | ||
475 | TP_ARGS(page, offset) | 544 | TP_ARGS(page, offset, length) |
476 | ); | 545 | ); |
477 | 546 | ||
478 | DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage, | 547 | DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage, |
479 | TP_PROTO(struct page *page, unsigned long offset), | 548 | TP_PROTO(struct page *page, unsigned int offset, unsigned int length), |
480 | 549 | ||
481 | TP_ARGS(page, offset) | 550 | TP_ARGS(page, offset, length) |
482 | ); | 551 | ); |
483 | 552 | ||
484 | TRACE_EVENT(ext4_discard_blocks, | 553 | TRACE_EVENT(ext4_discard_blocks, |
@@ -673,10 +742,10 @@ TRACE_EVENT(ext4_request_blocks, | |||
673 | __entry->flags = ar->flags; | 742 | __entry->flags = ar->flags; |
674 | ), | 743 | ), |
675 | 744 | ||
676 | TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu " | 745 | TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu " |
677 | "lleft %u lright %u pleft %llu pright %llu ", | 746 | "lleft %u lright %u pleft %llu pright %llu ", |
678 | MAJOR(__entry->dev), MINOR(__entry->dev), | 747 | MAJOR(__entry->dev), MINOR(__entry->dev), |
679 | (unsigned long) __entry->ino, __entry->flags, | 748 | (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), |
680 | __entry->len, __entry->logical, __entry->goal, | 749 | __entry->len, __entry->logical, __entry->goal, |
681 | __entry->lleft, __entry->lright, __entry->pleft, | 750 | __entry->lleft, __entry->lright, __entry->pleft, |
682 | __entry->pright) | 751 | __entry->pright) |
@@ -715,10 +784,10 @@ TRACE_EVENT(ext4_allocate_blocks, | |||
715 | __entry->flags = ar->flags; | 784 | __entry->flags = ar->flags; |
716 | ), | 785 | ), |
717 | 786 | ||
718 | TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u " | 787 | TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u " |
719 | "goal %llu lleft %u lright %u pleft %llu pright %llu", | 788 | "goal %llu lleft %u lright %u pleft %llu pright %llu", |
720 | MAJOR(__entry->dev), MINOR(__entry->dev), | 789 | MAJOR(__entry->dev), MINOR(__entry->dev), |
721 | (unsigned long) __entry->ino, __entry->flags, | 790 | (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), |
722 | __entry->len, __entry->block, __entry->logical, | 791 | __entry->len, __entry->block, __entry->logical, |
723 | __entry->goal, __entry->lleft, __entry->lright, | 792 | __entry->goal, __entry->lleft, __entry->lright, |
724 | __entry->pleft, __entry->pright) | 793 | __entry->pleft, __entry->pright) |
@@ -748,11 +817,11 @@ TRACE_EVENT(ext4_free_blocks, | |||
748 | __entry->mode = inode->i_mode; | 817 | __entry->mode = inode->i_mode; |
749 | ), | 818 | ), |
750 | 819 | ||
751 | TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", | 820 | TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s", |
752 | MAJOR(__entry->dev), MINOR(__entry->dev), | 821 | MAJOR(__entry->dev), MINOR(__entry->dev), |
753 | (unsigned long) __entry->ino, | 822 | (unsigned long) __entry->ino, |
754 | __entry->mode, __entry->block, __entry->count, | 823 | __entry->mode, __entry->block, __entry->count, |
755 | __entry->flags) | 824 | show_free_flags(__entry->flags)) |
756 | ); | 825 | ); |
757 | 826 | ||
758 | TRACE_EVENT(ext4_sync_file_enter, | 827 | TRACE_EVENT(ext4_sync_file_enter, |
@@ -903,7 +972,7 @@ TRACE_EVENT(ext4_mballoc_alloc, | |||
903 | ), | 972 | ), |
904 | 973 | ||
905 | TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " | 974 | TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " |
906 | "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " | 975 | "result %u/%d/%u@%u blks %u grps %u cr %u flags %s " |
907 | "tail %u broken %u", | 976 | "tail %u broken %u", |
908 | MAJOR(__entry->dev), MINOR(__entry->dev), | 977 | MAJOR(__entry->dev), MINOR(__entry->dev), |
909 | (unsigned long) __entry->ino, | 978 | (unsigned long) __entry->ino, |
@@ -914,7 +983,7 @@ TRACE_EVENT(ext4_mballoc_alloc, | |||
914 | __entry->result_group, __entry->result_start, | 983 | __entry->result_group, __entry->result_start, |
915 | __entry->result_len, __entry->result_logical, | 984 | __entry->result_len, __entry->result_logical, |
916 | __entry->found, __entry->groups, __entry->cr, | 985 | __entry->found, __entry->groups, __entry->cr, |
917 | __entry->flags, __entry->tail, | 986 | show_mballoc_flags(__entry->flags), __entry->tail, |
918 | __entry->buddy ? 1 << __entry->buddy : 0) | 987 | __entry->buddy ? 1 << __entry->buddy : 0) |
919 | ); | 988 | ); |
920 | 989 | ||
@@ -1528,10 +1597,10 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_enter, | |||
1528 | __entry->flags = flags; | 1597 | __entry->flags = flags; |
1529 | ), | 1598 | ), |
1530 | 1599 | ||
1531 | TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u", | 1600 | TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s", |
1532 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1601 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1533 | (unsigned long) __entry->ino, | 1602 | (unsigned long) __entry->ino, |
1534 | __entry->lblk, __entry->len, __entry->flags) | 1603 | __entry->lblk, __entry->len, show_map_flags(__entry->flags)) |
1535 | ); | 1604 | ); |
1536 | 1605 | ||
1537 | DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, | 1606 | DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, |
@@ -1549,47 +1618,53 @@ DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, | |||
1549 | ); | 1618 | ); |
1550 | 1619 | ||
1551 | DECLARE_EVENT_CLASS(ext4__map_blocks_exit, | 1620 | DECLARE_EVENT_CLASS(ext4__map_blocks_exit, |
1552 | TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), | 1621 | TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, |
1622 | int ret), | ||
1553 | 1623 | ||
1554 | TP_ARGS(inode, map, ret), | 1624 | TP_ARGS(inode, flags, map, ret), |
1555 | 1625 | ||
1556 | TP_STRUCT__entry( | 1626 | TP_STRUCT__entry( |
1557 | __field( dev_t, dev ) | 1627 | __field( dev_t, dev ) |
1558 | __field( ino_t, ino ) | 1628 | __field( ino_t, ino ) |
1629 | __field( unsigned int, flags ) | ||
1559 | __field( ext4_fsblk_t, pblk ) | 1630 | __field( ext4_fsblk_t, pblk ) |
1560 | __field( ext4_lblk_t, lblk ) | 1631 | __field( ext4_lblk_t, lblk ) |
1561 | __field( unsigned int, len ) | 1632 | __field( unsigned int, len ) |
1562 | __field( unsigned int, flags ) | 1633 | __field( unsigned int, mflags ) |
1563 | __field( int, ret ) | 1634 | __field( int, ret ) |
1564 | ), | 1635 | ), |
1565 | 1636 | ||
1566 | TP_fast_assign( | 1637 | TP_fast_assign( |
1567 | __entry->dev = inode->i_sb->s_dev; | 1638 | __entry->dev = inode->i_sb->s_dev; |
1568 | __entry->ino = inode->i_ino; | 1639 | __entry->ino = inode->i_ino; |
1640 | __entry->flags = flags; | ||
1569 | __entry->pblk = map->m_pblk; | 1641 | __entry->pblk = map->m_pblk; |
1570 | __entry->lblk = map->m_lblk; | 1642 | __entry->lblk = map->m_lblk; |
1571 | __entry->len = map->m_len; | 1643 | __entry->len = map->m_len; |
1572 | __entry->flags = map->m_flags; | 1644 | __entry->mflags = map->m_flags; |
1573 | __entry->ret = ret; | 1645 | __entry->ret = ret; |
1574 | ), | 1646 | ), |
1575 | 1647 | ||
1576 | TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d", | 1648 | TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u " |
1649 | "mflags %s ret %d", | ||
1577 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1650 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1578 | (unsigned long) __entry->ino, | 1651 | (unsigned long) __entry->ino, |
1579 | __entry->lblk, __entry->pblk, | 1652 | show_map_flags(__entry->flags), __entry->lblk, __entry->pblk, |
1580 | __entry->len, __entry->flags, __entry->ret) | 1653 | __entry->len, show_mflags(__entry->mflags), __entry->ret) |
1581 | ); | 1654 | ); |
1582 | 1655 | ||
1583 | DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, | 1656 | DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, |
1584 | TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), | 1657 | TP_PROTO(struct inode *inode, unsigned flags, |
1658 | struct ext4_map_blocks *map, int ret), | ||
1585 | 1659 | ||
1586 | TP_ARGS(inode, map, ret) | 1660 | TP_ARGS(inode, flags, map, ret) |
1587 | ); | 1661 | ); |
1588 | 1662 | ||
1589 | DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, | 1663 | DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, |
1590 | TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), | 1664 | TP_PROTO(struct inode *inode, unsigned flags, |
1665 | struct ext4_map_blocks *map, int ret), | ||
1591 | 1666 | ||
1592 | TP_ARGS(inode, map, ret) | 1667 | TP_ARGS(inode, flags, map, ret) |
1593 | ); | 1668 | ); |
1594 | 1669 | ||
1595 | TRACE_EVENT(ext4_ext_load_extent, | 1670 | TRACE_EVENT(ext4_ext_load_extent, |
@@ -1638,25 +1713,50 @@ TRACE_EVENT(ext4_load_inode, | |||
1638 | ); | 1713 | ); |
1639 | 1714 | ||
1640 | TRACE_EVENT(ext4_journal_start, | 1715 | TRACE_EVENT(ext4_journal_start, |
1641 | TP_PROTO(struct super_block *sb, int nblocks, unsigned long IP), | 1716 | TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks, |
1717 | unsigned long IP), | ||
1642 | 1718 | ||
1643 | TP_ARGS(sb, nblocks, IP), | 1719 | TP_ARGS(sb, blocks, rsv_blocks, IP), |
1644 | 1720 | ||
1645 | TP_STRUCT__entry( | 1721 | TP_STRUCT__entry( |
1646 | __field( dev_t, dev ) | 1722 | __field( dev_t, dev ) |
1647 | __field(unsigned long, ip ) | 1723 | __field(unsigned long, ip ) |
1648 | __field( int, nblocks ) | 1724 | __field( int, blocks ) |
1725 | __field( int, rsv_blocks ) | ||
1649 | ), | 1726 | ), |
1650 | 1727 | ||
1651 | TP_fast_assign( | 1728 | TP_fast_assign( |
1652 | __entry->dev = sb->s_dev; | 1729 | __entry->dev = sb->s_dev; |
1653 | __entry->ip = IP; | 1730 | __entry->ip = IP; |
1654 | __entry->nblocks = nblocks; | 1731 | __entry->blocks = blocks; |
1732 | __entry->rsv_blocks = rsv_blocks; | ||
1655 | ), | 1733 | ), |
1656 | 1734 | ||
1657 | TP_printk("dev %d,%d nblocks %d caller %pF", | 1735 | TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pF", |
1658 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1736 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1659 | __entry->nblocks, (void *)__entry->ip) | 1737 | __entry->blocks, __entry->rsv_blocks, (void *)__entry->ip) |
1738 | ); | ||
1739 | |||
1740 | TRACE_EVENT(ext4_journal_start_reserved, | ||
1741 | TP_PROTO(struct super_block *sb, int blocks, unsigned long IP), | ||
1742 | |||
1743 | TP_ARGS(sb, blocks, IP), | ||
1744 | |||
1745 | TP_STRUCT__entry( | ||
1746 | __field( dev_t, dev ) | ||
1747 | __field(unsigned long, ip ) | ||
1748 | __field( int, blocks ) | ||
1749 | ), | ||
1750 | |||
1751 | TP_fast_assign( | ||
1752 | __entry->dev = sb->s_dev; | ||
1753 | __entry->ip = IP; | ||
1754 | __entry->blocks = blocks; | ||
1755 | ), | ||
1756 | |||
1757 | TP_printk("dev %d,%d blocks, %d caller %pF", | ||
1758 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
1759 | __entry->blocks, (void *)__entry->ip) | ||
1660 | ); | 1760 | ); |
1661 | 1761 | ||
1662 | DECLARE_EVENT_CLASS(ext4__trim, | 1762 | DECLARE_EVENT_CLASS(ext4__trim, |
@@ -1736,12 +1836,12 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, | |||
1736 | __entry->newblk = newblock; | 1836 | __entry->newblk = newblock; |
1737 | ), | 1837 | ), |
1738 | 1838 | ||
1739 | TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x " | 1839 | TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s " |
1740 | "allocated %d newblock %llu", | 1840 | "allocated %d newblock %llu", |
1741 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1841 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1742 | (unsigned long) __entry->ino, | 1842 | (unsigned long) __entry->ino, |
1743 | (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, | 1843 | (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, |
1744 | __entry->len, __entry->flags, | 1844 | __entry->len, show_map_flags(__entry->flags), |
1745 | (unsigned int) __entry->allocated, | 1845 | (unsigned int) __entry->allocated, |
1746 | (unsigned long long) __entry->newblk) | 1846 | (unsigned long long) __entry->newblk) |
1747 | ); | 1847 | ); |
@@ -1769,10 +1869,10 @@ TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, | |||
1769 | __entry->ret = ret; | 1869 | __entry->ret = ret; |
1770 | ), | 1870 | ), |
1771 | 1871 | ||
1772 | TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %u ret %d", | 1872 | TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d", |
1773 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1873 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1774 | __entry->lblk, (unsigned long long) __entry->pblk, | 1874 | __entry->lblk, (unsigned long long) __entry->pblk, |
1775 | __entry->len, __entry->flags, __entry->ret) | 1875 | __entry->len, show_mflags(__entry->flags), __entry->ret) |
1776 | ); | 1876 | ); |
1777 | 1877 | ||
1778 | TRACE_EVENT(ext4_ext_put_in_cache, | 1878 | TRACE_EVENT(ext4_ext_put_in_cache, |
@@ -1926,7 +2026,7 @@ TRACE_EVENT(ext4_ext_show_extent, | |||
1926 | TRACE_EVENT(ext4_remove_blocks, | 2026 | TRACE_EVENT(ext4_remove_blocks, |
1927 | TP_PROTO(struct inode *inode, struct ext4_extent *ex, | 2027 | TP_PROTO(struct inode *inode, struct ext4_extent *ex, |
1928 | ext4_lblk_t from, ext4_fsblk_t to, | 2028 | ext4_lblk_t from, ext4_fsblk_t to, |
1929 | ext4_fsblk_t partial_cluster), | 2029 | long long partial_cluster), |
1930 | 2030 | ||
1931 | TP_ARGS(inode, ex, from, to, partial_cluster), | 2031 | TP_ARGS(inode, ex, from, to, partial_cluster), |
1932 | 2032 | ||
@@ -1935,7 +2035,7 @@ TRACE_EVENT(ext4_remove_blocks, | |||
1935 | __field( ino_t, ino ) | 2035 | __field( ino_t, ino ) |
1936 | __field( ext4_lblk_t, from ) | 2036 | __field( ext4_lblk_t, from ) |
1937 | __field( ext4_lblk_t, to ) | 2037 | __field( ext4_lblk_t, to ) |
1938 | __field( ext4_fsblk_t, partial ) | 2038 | __field( long long, partial ) |
1939 | __field( ext4_fsblk_t, ee_pblk ) | 2039 | __field( ext4_fsblk_t, ee_pblk ) |
1940 | __field( ext4_lblk_t, ee_lblk ) | 2040 | __field( ext4_lblk_t, ee_lblk ) |
1941 | __field( unsigned short, ee_len ) | 2041 | __field( unsigned short, ee_len ) |
@@ -1953,7 +2053,7 @@ TRACE_EVENT(ext4_remove_blocks, | |||
1953 | ), | 2053 | ), |
1954 | 2054 | ||
1955 | TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" | 2055 | TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" |
1956 | "from %u to %u partial_cluster %u", | 2056 | "from %u to %u partial_cluster %lld", |
1957 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2057 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1958 | (unsigned long) __entry->ino, | 2058 | (unsigned long) __entry->ino, |
1959 | (unsigned) __entry->ee_lblk, | 2059 | (unsigned) __entry->ee_lblk, |
@@ -1961,19 +2061,20 @@ TRACE_EVENT(ext4_remove_blocks, | |||
1961 | (unsigned short) __entry->ee_len, | 2061 | (unsigned short) __entry->ee_len, |
1962 | (unsigned) __entry->from, | 2062 | (unsigned) __entry->from, |
1963 | (unsigned) __entry->to, | 2063 | (unsigned) __entry->to, |
1964 | (unsigned) __entry->partial) | 2064 | (long long) __entry->partial) |
1965 | ); | 2065 | ); |
1966 | 2066 | ||
1967 | TRACE_EVENT(ext4_ext_rm_leaf, | 2067 | TRACE_EVENT(ext4_ext_rm_leaf, |
1968 | TP_PROTO(struct inode *inode, ext4_lblk_t start, | 2068 | TP_PROTO(struct inode *inode, ext4_lblk_t start, |
1969 | struct ext4_extent *ex, ext4_fsblk_t partial_cluster), | 2069 | struct ext4_extent *ex, |
2070 | long long partial_cluster), | ||
1970 | 2071 | ||
1971 | TP_ARGS(inode, start, ex, partial_cluster), | 2072 | TP_ARGS(inode, start, ex, partial_cluster), |
1972 | 2073 | ||
1973 | TP_STRUCT__entry( | 2074 | TP_STRUCT__entry( |
1974 | __field( dev_t, dev ) | 2075 | __field( dev_t, dev ) |
1975 | __field( ino_t, ino ) | 2076 | __field( ino_t, ino ) |
1976 | __field( ext4_fsblk_t, partial ) | 2077 | __field( long long, partial ) |
1977 | __field( ext4_lblk_t, start ) | 2078 | __field( ext4_lblk_t, start ) |
1978 | __field( ext4_lblk_t, ee_lblk ) | 2079 | __field( ext4_lblk_t, ee_lblk ) |
1979 | __field( ext4_fsblk_t, ee_pblk ) | 2080 | __field( ext4_fsblk_t, ee_pblk ) |
@@ -1991,14 +2092,14 @@ TRACE_EVENT(ext4_ext_rm_leaf, | |||
1991 | ), | 2092 | ), |
1992 | 2093 | ||
1993 | TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" | 2094 | TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" |
1994 | "partial_cluster %u", | 2095 | "partial_cluster %lld", |
1995 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2096 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1996 | (unsigned long) __entry->ino, | 2097 | (unsigned long) __entry->ino, |
1997 | (unsigned) __entry->start, | 2098 | (unsigned) __entry->start, |
1998 | (unsigned) __entry->ee_lblk, | 2099 | (unsigned) __entry->ee_lblk, |
1999 | (unsigned long long) __entry->ee_pblk, | 2100 | (unsigned long long) __entry->ee_pblk, |
2000 | (unsigned short) __entry->ee_len, | 2101 | (unsigned short) __entry->ee_len, |
2001 | (unsigned) __entry->partial) | 2102 | (long long) __entry->partial) |
2002 | ); | 2103 | ); |
2003 | 2104 | ||
2004 | TRACE_EVENT(ext4_ext_rm_idx, | 2105 | TRACE_EVENT(ext4_ext_rm_idx, |
@@ -2025,14 +2126,16 @@ TRACE_EVENT(ext4_ext_rm_idx, | |||
2025 | ); | 2126 | ); |
2026 | 2127 | ||
2027 | TRACE_EVENT(ext4_ext_remove_space, | 2128 | TRACE_EVENT(ext4_ext_remove_space, |
2028 | TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth), | 2129 | TP_PROTO(struct inode *inode, ext4_lblk_t start, |
2130 | ext4_lblk_t end, int depth), | ||
2029 | 2131 | ||
2030 | TP_ARGS(inode, start, depth), | 2132 | TP_ARGS(inode, start, end, depth), |
2031 | 2133 | ||
2032 | TP_STRUCT__entry( | 2134 | TP_STRUCT__entry( |
2033 | __field( dev_t, dev ) | 2135 | __field( dev_t, dev ) |
2034 | __field( ino_t, ino ) | 2136 | __field( ino_t, ino ) |
2035 | __field( ext4_lblk_t, start ) | 2137 | __field( ext4_lblk_t, start ) |
2138 | __field( ext4_lblk_t, end ) | ||
2036 | __field( int, depth ) | 2139 | __field( int, depth ) |
2037 | ), | 2140 | ), |
2038 | 2141 | ||
@@ -2040,28 +2143,31 @@ TRACE_EVENT(ext4_ext_remove_space, | |||
2040 | __entry->dev = inode->i_sb->s_dev; | 2143 | __entry->dev = inode->i_sb->s_dev; |
2041 | __entry->ino = inode->i_ino; | 2144 | __entry->ino = inode->i_ino; |
2042 | __entry->start = start; | 2145 | __entry->start = start; |
2146 | __entry->end = end; | ||
2043 | __entry->depth = depth; | 2147 | __entry->depth = depth; |
2044 | ), | 2148 | ), |
2045 | 2149 | ||
2046 | TP_printk("dev %d,%d ino %lu since %u depth %d", | 2150 | TP_printk("dev %d,%d ino %lu since %u end %u depth %d", |
2047 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2151 | MAJOR(__entry->dev), MINOR(__entry->dev), |
2048 | (unsigned long) __entry->ino, | 2152 | (unsigned long) __entry->ino, |
2049 | (unsigned) __entry->start, | 2153 | (unsigned) __entry->start, |
2154 | (unsigned) __entry->end, | ||
2050 | __entry->depth) | 2155 | __entry->depth) |
2051 | ); | 2156 | ); |
2052 | 2157 | ||
2053 | TRACE_EVENT(ext4_ext_remove_space_done, | 2158 | TRACE_EVENT(ext4_ext_remove_space_done, |
2054 | TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, | 2159 | TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, |
2055 | ext4_lblk_t partial, __le16 eh_entries), | 2160 | int depth, long long partial, __le16 eh_entries), |
2056 | 2161 | ||
2057 | TP_ARGS(inode, start, depth, partial, eh_entries), | 2162 | TP_ARGS(inode, start, end, depth, partial, eh_entries), |
2058 | 2163 | ||
2059 | TP_STRUCT__entry( | 2164 | TP_STRUCT__entry( |
2060 | __field( dev_t, dev ) | 2165 | __field( dev_t, dev ) |
2061 | __field( ino_t, ino ) | 2166 | __field( ino_t, ino ) |
2062 | __field( ext4_lblk_t, start ) | 2167 | __field( ext4_lblk_t, start ) |
2168 | __field( ext4_lblk_t, end ) | ||
2063 | __field( int, depth ) | 2169 | __field( int, depth ) |
2064 | __field( ext4_lblk_t, partial ) | 2170 | __field( long long, partial ) |
2065 | __field( unsigned short, eh_entries ) | 2171 | __field( unsigned short, eh_entries ) |
2066 | ), | 2172 | ), |
2067 | 2173 | ||
@@ -2069,18 +2175,20 @@ TRACE_EVENT(ext4_ext_remove_space_done, | |||
2069 | __entry->dev = inode->i_sb->s_dev; | 2175 | __entry->dev = inode->i_sb->s_dev; |
2070 | __entry->ino = inode->i_ino; | 2176 | __entry->ino = inode->i_ino; |
2071 | __entry->start = start; | 2177 | __entry->start = start; |
2178 | __entry->end = end; | ||
2072 | __entry->depth = depth; | 2179 | __entry->depth = depth; |
2073 | __entry->partial = partial; | 2180 | __entry->partial = partial; |
2074 | __entry->eh_entries = le16_to_cpu(eh_entries); | 2181 | __entry->eh_entries = le16_to_cpu(eh_entries); |
2075 | ), | 2182 | ), |
2076 | 2183 | ||
2077 | TP_printk("dev %d,%d ino %lu since %u depth %d partial %u " | 2184 | TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld " |
2078 | "remaining_entries %u", | 2185 | "remaining_entries %u", |
2079 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2186 | MAJOR(__entry->dev), MINOR(__entry->dev), |
2080 | (unsigned long) __entry->ino, | 2187 | (unsigned long) __entry->ino, |
2081 | (unsigned) __entry->start, | 2188 | (unsigned) __entry->start, |
2189 | (unsigned) __entry->end, | ||
2082 | __entry->depth, | 2190 | __entry->depth, |
2083 | (unsigned) __entry->partial, | 2191 | (long long) __entry->partial, |
2084 | (unsigned short) __entry->eh_entries) | 2192 | (unsigned short) __entry->eh_entries) |
2085 | ); | 2193 | ); |
2086 | 2194 | ||
@@ -2095,7 +2203,7 @@ TRACE_EVENT(ext4_es_insert_extent, | |||
2095 | __field( ext4_lblk_t, lblk ) | 2203 | __field( ext4_lblk_t, lblk ) |
2096 | __field( ext4_lblk_t, len ) | 2204 | __field( ext4_lblk_t, len ) |
2097 | __field( ext4_fsblk_t, pblk ) | 2205 | __field( ext4_fsblk_t, pblk ) |
2098 | __field( unsigned long long, status ) | 2206 | __field( char, status ) |
2099 | ), | 2207 | ), |
2100 | 2208 | ||
2101 | TP_fast_assign( | 2209 | TP_fast_assign( |
@@ -2104,14 +2212,14 @@ TRACE_EVENT(ext4_es_insert_extent, | |||
2104 | __entry->lblk = es->es_lblk; | 2212 | __entry->lblk = es->es_lblk; |
2105 | __entry->len = es->es_len; | 2213 | __entry->len = es->es_len; |
2106 | __entry->pblk = ext4_es_pblock(es); | 2214 | __entry->pblk = ext4_es_pblock(es); |
2107 | __entry->status = ext4_es_status(es); | 2215 | __entry->status = ext4_es_status(es) >> 60; |
2108 | ), | 2216 | ), |
2109 | 2217 | ||
2110 | TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx", | 2218 | TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", |
2111 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2219 | MAJOR(__entry->dev), MINOR(__entry->dev), |
2112 | (unsigned long) __entry->ino, | 2220 | (unsigned long) __entry->ino, |
2113 | __entry->lblk, __entry->len, | 2221 | __entry->lblk, __entry->len, |
2114 | __entry->pblk, __entry->status) | 2222 | __entry->pblk, show_extent_status(__entry->status)) |
2115 | ); | 2223 | ); |
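The status flags evidently occupy bits 60-63 of the value ext4_es_status() returns, so the >> 60 shift leaves exactly the four bits that show_extent_status() decodes. An illustrative mapping:

	status = ext4_es_status(es) >> 60;	/* e.g. 0x8 -> "W" (written) */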
2116 | 2224 | ||
2117 | TRACE_EVENT(ext4_es_remove_extent, | 2225 | TRACE_EVENT(ext4_es_remove_extent, |
@@ -2172,7 +2280,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, | |||
2172 | __field( ext4_lblk_t, lblk ) | 2280 | __field( ext4_lblk_t, lblk ) |
2173 | __field( ext4_lblk_t, len ) | 2281 | __field( ext4_lblk_t, len ) |
2174 | __field( ext4_fsblk_t, pblk ) | 2282 | __field( ext4_fsblk_t, pblk ) |
2175 | __field( unsigned long long, status ) | 2283 | __field( char, status ) |
2176 | ), | 2284 | ), |
2177 | 2285 | ||
2178 | TP_fast_assign( | 2286 | TP_fast_assign( |
@@ -2181,14 +2289,14 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, | |||
2181 | __entry->lblk = es->es_lblk; | 2289 | __entry->lblk = es->es_lblk; |
2182 | __entry->len = es->es_len; | 2290 | __entry->len = es->es_len; |
2183 | __entry->pblk = ext4_es_pblock(es); | 2291 | __entry->pblk = ext4_es_pblock(es); |
2184 | __entry->status = ext4_es_status(es); | 2292 | __entry->status = ext4_es_status(es) >> 60; |
2185 | ), | 2293 | ), |
2186 | 2294 | ||
2187 | TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx", | 2295 | TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", |
2188 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2296 | MAJOR(__entry->dev), MINOR(__entry->dev), |
2189 | (unsigned long) __entry->ino, | 2297 | (unsigned long) __entry->ino, |
2190 | __entry->lblk, __entry->len, | 2298 | __entry->lblk, __entry->len, |
2191 | __entry->pblk, __entry->status) | 2299 | __entry->pblk, show_extent_status(__entry->status)) |
2192 | ); | 2300 | ); |
2193 | 2301 | ||
2194 | TRACE_EVENT(ext4_es_lookup_extent_enter, | 2302 | TRACE_EVENT(ext4_es_lookup_extent_enter, |
@@ -2225,7 +2333,7 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, | |||
2225 | __field( ext4_lblk_t, lblk ) | 2333 | __field( ext4_lblk_t, lblk ) |
2226 | __field( ext4_lblk_t, len ) | 2334 | __field( ext4_lblk_t, len ) |
2227 | __field( ext4_fsblk_t, pblk ) | 2335 | __field( ext4_fsblk_t, pblk ) |
2228 | __field( unsigned long long, status ) | 2336 | __field( char, status ) |
2229 | __field( int, found ) | 2337 | __field( int, found ) |
2230 | ), | 2338 | ), |
2231 | 2339 | ||
@@ -2235,16 +2343,16 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, | |||
2235 | __entry->lblk = es->es_lblk; | 2343 | __entry->lblk = es->es_lblk; |
2236 | __entry->len = es->es_len; | 2344 | __entry->len = es->es_len; |
2237 | __entry->pblk = ext4_es_pblock(es); | 2345 | __entry->pblk = ext4_es_pblock(es); |
2238 | __entry->status = ext4_es_status(es); | 2346 | __entry->status = ext4_es_status(es) >> 60; |
2239 | __entry->found = found; | 2347 | __entry->found = found; |
2240 | ), | 2348 | ), |
2241 | 2349 | ||
2242 | TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %llx", | 2350 | TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s", |
2243 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2351 | MAJOR(__entry->dev), MINOR(__entry->dev), |
2244 | (unsigned long) __entry->ino, __entry->found, | 2352 | (unsigned long) __entry->ino, __entry->found, |
2245 | __entry->lblk, __entry->len, | 2353 | __entry->lblk, __entry->len, |
2246 | __entry->found ? __entry->pblk : 0, | 2354 | __entry->found ? __entry->pblk : 0, |
2247 | __entry->found ? __entry->status : 0) | 2355 | show_extent_status(__entry->found ? __entry->status : 0)) |
2248 | ); | 2356 | ); |
2249 | 2357 | ||
2250 | TRACE_EVENT(ext4_es_shrink_enter, | 2358 | TRACE_EVENT(ext4_es_shrink_enter, |
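The new format lines rely on a show_extent_status() helper that is not visible in these hunks. Since TP_fast_assign() now stores `ext4_es_status(es) >> 60`, the four status bits land in the low nibble of a char. A minimal sketch of how such a helper could be built on the tracing core's __print_flags(); the flag names, bit positions, and letter choices are assumptions for illustration only:

	#define EXTENT_STATUS_WRITTEN	(1 << 3)	/* assumed bit layout */
	#define EXTENT_STATUS_UNWRITTEN	(1 << 2)
	#define EXTENT_STATUS_DELAYED	(1 << 1)
	#define EXTENT_STATUS_HOLE	(1 << 0)

	#define show_extent_status(status)			\
		__print_flags(status, "",			\
			{ EXTENT_STATUS_WRITTEN,	"W" },	\
			{ EXTENT_STATUS_UNWRITTEN,	"U" },	\
			{ EXTENT_STATUS_DELAYED,	"D" },	\
			{ EXTENT_STATUS_HOLE,		"H" })

With a helper of this shape, a status of 0x8 renders as "W" in the trace output instead of the raw hex the old %llx format produced.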
diff --git a/mm/readahead.c b/mm/readahead.c index daed28dd5830..829a77c62834 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -48,7 +48,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping, | |||
48 | if (!trylock_page(page)) | 48 | if (!trylock_page(page)) |
49 | BUG(); | 49 | BUG(); |
50 | page->mapping = mapping; | 50 | page->mapping = mapping; |
51 | do_invalidatepage(page, 0); | 51 | do_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
52 | page->mapping = NULL; | 52 | page->mapping = NULL; |
53 | unlock_page(page); | 53 | unlock_page(page); |
54 | } | 54 | } |
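read_cache_pages_invalidate_page() shows the pattern for all whole-page callers: the old single-offset call becomes the explicit range [0, PAGE_CACHE_SIZE). On the callee side, a filesystem's ->invalidatepage implementation changes accordingly. A minimal sketch with a hypothetical example_invalidatepage that simply forwards to block_invalidatepage(), which takes the same three arguments after this series:

	static void example_invalidatepage(struct page *page, unsigned int offset,
					   unsigned int length)
	{
		/*
		 * Only buffers inside [offset, offset + length) may be
		 * dropped; offset == 0 && length == PAGE_CACHE_SIZE is the
		 * old "whole page" case.
		 */
		if (page_has_buffers(page))
			block_invalidatepage(page, offset, length);
	}

	static const struct address_space_operations example_aops = {
		/* ... other methods ... */
		.invalidatepage	= example_invalidatepage,
	};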
diff --git a/mm/truncate.c b/mm/truncate.c index c75b736e54b7..e2e8a8a7eb9d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -26,7 +26,8 @@ | |||
26 | /** | 26 | /** |
27 | * do_invalidatepage - invalidate part or all of a page | 27 | * do_invalidatepage - invalidate part or all of a page |
28 | * @page: the page which is affected | 28 | * @page: the page which is affected |
29 | * @offset: the index of the truncation point | 29 | * @offset: start of the range to invalidate |
30 | * @length: length of the range to invalidate | ||
30 | * | 31 | * |
31 | * do_invalidatepage() is called when all or part of the page has become | 32 | * do_invalidatepage() is called when all or part of the page has become |
32 | * invalidated by a truncate operation. | 33 | * invalidated by a truncate operation. |
@@ -37,24 +38,18 @@ | |||
37 | * point. Because the caller is about to free (and possibly reuse) those | 38 | * point. Because the caller is about to free (and possibly reuse) those |
38 | * blocks on-disk. | 39 | * blocks on-disk. |
39 | */ | 40 | */ |
40 | void do_invalidatepage(struct page *page, unsigned long offset) | 41 | void do_invalidatepage(struct page *page, unsigned int offset, |
42 | unsigned int length) | ||
41 | { | 43 | { |
42 | void (*invalidatepage)(struct page *, unsigned long); | 44 | void (*invalidatepage)(struct page *, unsigned int, unsigned int); |
45 | |||
43 | invalidatepage = page->mapping->a_ops->invalidatepage; | 46 | invalidatepage = page->mapping->a_ops->invalidatepage; |
44 | #ifdef CONFIG_BLOCK | 47 | #ifdef CONFIG_BLOCK |
45 | if (!invalidatepage) | 48 | if (!invalidatepage) |
46 | invalidatepage = block_invalidatepage; | 49 | invalidatepage = block_invalidatepage; |
47 | #endif | 50 | #endif |
48 | if (invalidatepage) | 51 | if (invalidatepage) |
49 | (*invalidatepage)(page, offset); | 52 | (*invalidatepage)(page, offset, length); |
50 | } | ||
51 | |||
52 | static inline void truncate_partial_page(struct page *page, unsigned partial) | ||
53 | { | ||
54 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); | ||
55 | cleancache_invalidate_page(page->mapping, page); | ||
56 | if (page_has_private(page)) | ||
57 | do_invalidatepage(page, partial); | ||
58 | } | 53 | } |
59 | 54 | ||
60 | /* | 55 | /* |
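The prototype that callers compile against changes in the same way; a sketch, assuming the declaration lives in include/linux/mm.h as in mainline:

	extern void do_invalidatepage(struct page *page, unsigned int offset,
				      unsigned int length);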
@@ -103,7 +98,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
103 | return -EIO; | 98 | return -EIO; |
104 | 99 | ||
105 | if (page_has_private(page)) | 100 | if (page_has_private(page)) |
106 | do_invalidatepage(page, 0); | 101 | do_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
107 | 102 | ||
108 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 103 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
109 | 104 | ||
@@ -185,11 +180,11 @@ int invalidate_inode_page(struct page *page) | |||
185 | * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets | 180 | * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets |
186 | * @mapping: mapping to truncate | 181 | * @mapping: mapping to truncate |
187 | * @lstart: offset from which to truncate | 182 | * @lstart: offset from which to truncate |
188 | * @lend: offset to which to truncate | 183 | * @lend: offset to which to truncate (inclusive) |
189 | * | 184 | * |
190 | * Truncate the page cache, removing the pages that are between | 185 | * Truncate the page cache, removing the pages that are between |
191 | * specified offsets (and zeroing out partial page | 186 | * specified offsets (and zeroing out partial pages |
192 | * (if lstart is not page aligned)). | 187 | * if lstart or lend + 1 is not page aligned). |
193 | * | 188 | * |
194 | * Truncate takes two passes - the first pass is nonblocking. It will not | 189 | * Truncate takes two passes - the first pass is nonblocking. It will not |
195 | * block on page locks and it will not block on writeback. The second pass | 190 | * block on page locks and it will not block on writeback. The second pass |
@@ -200,35 +195,58 @@ int invalidate_inode_page(struct page *page) | |||
200 | * We pass down the cache-hot hint to the page freeing code. Even if the | 195 | * We pass down the cache-hot hint to the page freeing code. Even if the |
201 | * mapping is large, it is probably the case that the final pages are the most | 196 | * mapping is large, it is probably the case that the final pages are the most |
202 | * recently touched, and freeing happens in ascending file offset order. | 197 | * recently touched, and freeing happens in ascending file offset order. |
198 | * | ||
199 | * Note that since ->invalidatepage() accepts a range to invalidate, ||
200 | * truncate_inode_pages_range is able to handle cases where lend + 1 is not ||
201 | * properly page aligned. ||
203 | */ | 202 | */ |
204 | void truncate_inode_pages_range(struct address_space *mapping, | 203 | void truncate_inode_pages_range(struct address_space *mapping, |
205 | loff_t lstart, loff_t lend) | 204 | loff_t lstart, loff_t lend) |
206 | { | 205 | { |
207 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 206 | pgoff_t start; /* inclusive */ |
208 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 207 | pgoff_t end; /* exclusive */ |
209 | struct pagevec pvec; | 208 | unsigned int partial_start; /* inclusive */ |
210 | pgoff_t index; | 209 | unsigned int partial_end; /* exclusive */ |
211 | pgoff_t end; | 210 | struct pagevec pvec; |
212 | int i; | 211 | pgoff_t index; |
212 | int i; | ||
213 | 213 | ||
214 | cleancache_invalidate_inode(mapping); | 214 | cleancache_invalidate_inode(mapping); |
215 | if (mapping->nrpages == 0) | 215 | if (mapping->nrpages == 0) |
216 | return; | 216 | return; |
217 | 217 | ||
218 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); | 218 | /* Offsets within partial pages */ |
219 | end = (lend >> PAGE_CACHE_SHIFT); | 219 | partial_start = lstart & (PAGE_CACHE_SIZE - 1); |
220 | partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); | ||
221 | |||
222 | /* | ||
223 | * 'start' and 'end' always cover the range of pages to be fully ||
224 | * truncated. Partial pages are covered with 'partial_start' at the | ||
225 | * start of the range and 'partial_end' at the end of the range. | ||
226 | * Note that 'end' is exclusive while 'lend' is inclusive. | ||
227 | */ | ||
228 | start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
229 | if (lend == -1) | ||
230 | /* | ||
231 | * lend == -1 indicates end-of-file, so we have to set 'end' ||
232 | * to the highest possible pgoff_t; since the type is ||
233 | * unsigned, -1 gives us that value. ||
234 | */ | ||
235 | end = -1; | ||
236 | else | ||
237 | end = (lend + 1) >> PAGE_CACHE_SHIFT; | ||
220 | 238 | ||
221 | pagevec_init(&pvec, 0); | 239 | pagevec_init(&pvec, 0); |
222 | index = start; | 240 | index = start; |
223 | while (index <= end && pagevec_lookup(&pvec, mapping, index, | 241 | while (index < end && pagevec_lookup(&pvec, mapping, index, |
224 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 242 | min(end - index, (pgoff_t)PAGEVEC_SIZE))) { |
225 | mem_cgroup_uncharge_start(); | 243 | mem_cgroup_uncharge_start(); |
226 | for (i = 0; i < pagevec_count(&pvec); i++) { | 244 | for (i = 0; i < pagevec_count(&pvec); i++) { |
227 | struct page *page = pvec.pages[i]; | 245 | struct page *page = pvec.pages[i]; |
228 | 246 | ||
229 | /* We rely upon deletion not changing page->index */ | 247 | /* We rely upon deletion not changing page->index */ |
230 | index = page->index; | 248 | index = page->index; |
231 | if (index > end) | 249 | if (index >= end) |
232 | break; | 250 | break; |
233 | 251 | ||
234 | if (!trylock_page(page)) | 252 | if (!trylock_page(page)) |
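The index arithmetic above is easy to sanity-check outside the kernel. A small standalone program, assuming 4096-byte pages (PAGE_SIZE_ stands in for PAGE_CACHE_SIZE):

	#include <stdio.h>

	#define PAGE_SHIFT_	12
	#define PAGE_SIZE_	(1UL << PAGE_SHIFT_)

	int main(void)
	{
		long long lstart = 1000, lend = 9999;	/* inclusive byte range */

		unsigned int partial_start = lstart & (PAGE_SIZE_ - 1);
		unsigned int partial_end = (lend + 1) & (PAGE_SIZE_ - 1);
		unsigned long start = (lstart + PAGE_SIZE_ - 1) >> PAGE_SHIFT_;
		unsigned long end = (lend == -1) ? (unsigned long)-1
						 : (lend + 1) >> PAGE_SHIFT_;

		/* Prints: partial_start=1000 partial_end=1808 start=1 end=2 */
		printf("partial_start=%u partial_end=%u start=%lu end=%lu\n",
		       partial_start, partial_end, start, end);
		return 0;
	}

Page 1 (the only page in [start, end)) is truncated whole; page 0 is zeroed from byte 1000 and page 2 up to byte 1808, each then handed to do_invalidatepage() with the matching sub-page range.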
@@ -247,27 +265,56 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
247 | index++; | 265 | index++; |
248 | } | 266 | } |
249 | 267 | ||
250 | if (partial) { | 268 | if (partial_start) { |
251 | struct page *page = find_lock_page(mapping, start - 1); | 269 | struct page *page = find_lock_page(mapping, start - 1); |
252 | if (page) { | 270 | if (page) { |
271 | unsigned int top = PAGE_CACHE_SIZE; | ||
272 | if (start > end) { | ||
273 | /* Truncation within a single page */ | ||
274 | top = partial_end; | ||
275 | partial_end = 0; | ||
276 | } | ||
253 | wait_on_page_writeback(page); | 277 | wait_on_page_writeback(page); |
254 | truncate_partial_page(page, partial); | 278 | zero_user_segment(page, partial_start, top); |
279 | cleancache_invalidate_page(mapping, page); | ||
280 | if (page_has_private(page)) | ||
281 | do_invalidatepage(page, partial_start, | ||
282 | top - partial_start); | ||
255 | unlock_page(page); | 283 | unlock_page(page); |
256 | page_cache_release(page); | 284 | page_cache_release(page); |
257 | } | 285 | } |
258 | } | 286 | } |
287 | if (partial_end) { | ||
288 | struct page *page = find_lock_page(mapping, end); | ||
289 | if (page) { | ||
290 | wait_on_page_writeback(page); | ||
291 | zero_user_segment(page, 0, partial_end); | ||
292 | cleancache_invalidate_page(mapping, page); | ||
293 | if (page_has_private(page)) | ||
294 | do_invalidatepage(page, 0, | ||
295 | partial_end); | ||
296 | unlock_page(page); | ||
297 | page_cache_release(page); | ||
298 | } | ||
299 | } | ||
300 | /* | ||
301 | * If the truncation happened within a single page, no pages ||
302 | * will be released, only zeroed, so we can bail out now. ||
303 | */ | ||
304 | if (start >= end) | ||
305 | return; | ||
259 | 306 | ||
260 | index = start; | 307 | index = start; |
261 | for ( ; ; ) { | 308 | for ( ; ; ) { |
262 | cond_resched(); | 309 | cond_resched(); |
263 | if (!pagevec_lookup(&pvec, mapping, index, | 310 | if (!pagevec_lookup(&pvec, mapping, index, |
264 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 311 | min(end - index, (pgoff_t)PAGEVEC_SIZE))) { |
265 | if (index == start) | 312 | if (index == start) |
266 | break; | 313 | break; |
267 | index = start; | 314 | index = start; |
268 | continue; | 315 | continue; |
269 | } | 316 | } |
270 | if (index == start && pvec.pages[0]->index > end) { | 317 | if (index == start && pvec.pages[0]->index >= end) { |
271 | pagevec_release(&pvec); | 318 | pagevec_release(&pvec); |
272 | break; | 319 | break; |
273 | } | 320 | } |
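The start > end branch deserves a concrete example (again with 4096-byte pages). For lstart = 1000 and lend = 1999, both offsets fall in page 0: partial_start = 1000, partial_end = 2000, start = 1, end = 0. Since start > end, top is clamped to 2000 and partial_end is cleared, so the partial_start block zeroes bytes [1000, 2000) of page 0 and calls do_invalidatepage(page, 1000, 1000). No page is truncated whole, and the start >= end check returns before the second pass begins.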
@@ -277,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
277 | 324 | ||
278 | /* We rely upon deletion not changing page->index */ | 325 | /* We rely upon deletion not changing page->index */ |
279 | index = page->index; | 326 | index = page->index; |
280 | if (index > end) | 327 | if (index >= end) |
281 | break; | 328 | break; |
282 | 329 | ||
283 | lock_page(page); | 330 | lock_page(page); |
@@ -598,10 +645,8 @@ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
598 | * This rounding is currently just for example: unmap_mapping_range | 645 | * This rounding is currently just for example: unmap_mapping_range |
599 | * expands its hole outwards, whereas we want it to contract the hole | 646 | * expands its hole outwards, whereas we want it to contract the hole |
600 | * inwards. However, existing callers of truncate_pagecache_range are | 647 | * inwards. However, existing callers of truncate_pagecache_range are |
601 | * doing their own page rounding first; and truncate_inode_pages_range | 648 | * doing their own page rounding first. Note that unmap_mapping_range |
602 | * currently BUGs if lend is not pagealigned-1 (it handles partial | 649 | * allows holelen 0 for all, and we allow lend -1 for end of file. |
603 | * page at start of hole, but not partial page at end of hole). Note | ||
604 | * unmap_mapping_range allows holelen 0 for all, and we allow lend -1. | ||
605 | */ | 650 | */ |
606 | 651 | ||
607 | /* | 652 | /* |